commit 991c615670f6a56fc4c3b06b4d1dfd8117cdf39c Author: CI Date: Tue Sep 2 14:31:15 2025 +0000 Build branch htrnaseq/update_craftbox with version updatecraftbox to htrnaseq on branch updatecraftbox (e6da525) Build pipeline: viash-hub.htrnaseq.updatecraftbox-zzrhd Source commit: https://github.com/viash-hub/htrnaseq/commit/e6da525fc57aaec74f348eb974b68faa647bf800 Source message: Merge branch 'main' into update_craftbox diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..53fcf2b6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +target +testData +resources_test + +# Nextflow related files +.nextflow +.nextflow.log* +work + +# Python related files +*__pycache__* +.venv + +# R related files +.Rproj.user +htrnaseq.Rproj + +# vscode +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..303f3f96 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,310 @@ +# htrnaseq v0.11.0 + +## Breaking changes + +* `runner`: feature annotation data (fData) is now output to a subfolder `fData` (PR #68). + +## New features + +* `runner`: add output results to state in order for the workflow to be used as subworkflow (PR #68). + +## Bug fixes + +* `runner`: disable `publishFilesProc` because this workflow handles publishing itself (PR #68). + +## Minor changes + +* Bump craftbox to v0.3.0 (PR #69). + +# htrnaseq v0.10.0 + +## Breaking changes + +* `runner`: Replaced `ignore` parameter with `pools` paremeter. When set, only the selected pools are included for analysis. + By default all pools are selected (PR #66) + +## Bug fixes + +* Fix an error where processing FASTQ files from multiple lanes would cause an assertion error requesting the well demultiplexing + output to reside in one directory (PR #67). + +# Minor changes + +* `generate_well_statistics`: update base image to `python:3.13-trixie` (PR #67). + +# htrnaseq v0.9.1 + +## Bug fixes + +* Reverted functionality to set `fastq_publish_dir` and `results_publish_dir` using fromState (PR #64). + +* `runner`: fix detection of FASTQ files with non-numerical characters in the sample name (PR #65). + +# htrnaseq v0.9.0 + +## Breaking changes + +* `runner`: removed `plain_output` argument (PR #63). + +## Minor changes + +* `runner`: the `fastq_publish_dir` and `results_publish_dir` can now be set using `fromState` when using the workflow as subworkflow (PR #63). + +# htrnaseq v0.8.3 + +## Minor changes + +* Bump craftbox to v0.2.0 (PR #62). + +# htrnaseq v0.8.2 + +## Under the hood + +* Add the package config (`_viash.yaml`) to every component's target dir. This makes introspection from, e.g. a `runner` workflow much more robust (PR #61) + +# htrnaseq v0.8.1 + +## Bug fixes + +* Fix an issue where the FASTQ files from different samples on the same sequencing run would overwrite each other (PR #56). + +## Under the hood + +* Moved the test resources to their new location (PR #47). + +## Minor changes + +* Bump `biobox` and `craftbox` dependencies to versions `0.3.1` and `0.2.0`, respectively (PR #60). + +# htrnaseq v0.8.0 + +## New functionality + +* `save_params`: added a component to save workflow input parameters as yaml (PR #48). + +* Added `run_params` parameter to `htrnaseq` and `runner` workflows in order to save the input parameters + used for the workflow run (PR #48). + +# htrnaseq v0.7.2 + +## Documentation + +* Update README (PR #54) + +# htrnaseq v0.7.1 + +## Bug fixes + +* Bump viash version to `0.9.4`. This adds support for nextflow versions starting major version 25.01 and +fixes an issue where an integer being passed to a argument with `type: double` resulted in an error (PR #51). + +* `reporting`: updated default colour mapping (PR #50). + +## Minor changes + +* `create_report`: bump bioconductor version to 3.21 in order to accommodate R version 4.5 (PR #52). + +# htrnaseq v0.7.0 + +## Breaking changes + +The `runner` and `htrnaseq` workflow now output FASTQ files corresponding to the barcodes per input ID (per sequencing run). +Previously, when multiple input folders or multiple input FASTQ files were provided +(for the `runner` and `htrnaseq` workflows respectively), the demultiplexed FASTQ files for these inputs were concatenated +and provided as output. For the `htrnaseq` workflow, reads can still be combined by using a newly added `sampleID` argument. +This means that two lists of FASTQ files can be provided for a single sample, and by assigning the same `sampleID`, +these reads will be joined. For example, with other arguments are left out for brevity: + +```yaml +- id: sample1_run1 + input_r1: [sample_1_L001_1_R1.fastq, sample_1_L002_1_R1.fastq] + input_r2: [sample_1_L001_1_R2.fastq, sample_1_L002_1_R2.fastq] + sampleID: "sample_1" +- id: sample1_run2 + input_r1: [sample_1_L001_1_R1.fastq, sample_1_L002_1_R1.fastq] + input_r2: [sample_1_L001_1_R2.fastq, sample_1_L002_1_R2.fastq] + sampleID: "sample_1" +- id: sample_2 + input_r1: [sample_2_L001_1_R1.fastq, sample_2_L002_1_R1.fastq] + input_r2: [sample_2_L001_1_R2.fastq, sample_2_L002_1_R2.fastq] +``` + +For the runner, concatenation of data across samples is automatically inferred. Previously, multiple IDs (events) could be +provided which were processed in parallel. This is no longer possible, as providing multiple will cause the matching +samples for these runs to be concatenated. + + +For example, the following old parameter yaml +```yaml +- id: run1 + input: ["run_folder_1/", run_folder_2/] +``` +should now be provided as: +```yaml +- id: run1 + input: "run_folder_1/" +- id: run2 + input: run_folder_2/ +``` + +## Minor changes + +* Updated viash to `0.9.2` (PR #49) + +# htrnaseq v0.6.0 + +## Breaking changes + +* `runner`: a subdirectory `data_processed` is now added to the output structure, in between + the experiment ID and the directory with the workflow date and version (PR #45). + +# htrnaseq v0.5.5 + +## New functionality + +* Add `umi_length` parameter to the `runner` workflow (PR #46) + +# htrnaseq v0.5.4 + +* Fix missing barcodes in the output from `generate_pool_statistics`, which caused an assertion error in `create_pdata`. + In order to resolve the issue `generate_well_statistics` now outputs results for all chromosomes/scaffolds presented by + the genome annotation, even when no reads were mapped to the regions in question. `generate_pool_statistics` will now + remove regions from the output that have not at least one counts across all barcodes (PR #44). + +# htrnaseq v0.5.3 + +## Bug fixes + +* Fix `create_eset` component failing to create when one of the input samples has no counts (PR #43). + +# htrnaseq v0.5.2 + +## Bug fixes + +* `create_fdata`: remove duplicate entries from feature data (PR #41). + +# htrnaseq v0.5.1 + +## Bug fixes + +* `generate_well_statistics`: fix `ValueError` when an empty .bam file is provided as input (PR #40). +* `create_pdata`: avoid false positive `ValueError` for non-overlapping barcodes when input + data contains empty (`NA`) values (PR #40). + + +# htrnaseq v0.5.0 + +## New functionality + +* Added `ignore` parameter was added to the runner workflow in order to pass over certain input files + from the input directory (PR #39). + +# htrnaseq v0.4.0 + +## Breaking changes + +An effort has been made to align the inputs for the `htrnaseq` and the mapping and demultiplexing of the wells, in order +simplify running these steps as seperate steps (PR #37). + * Changes to the `parallel_map` component: + - The `barcode` argument has been renamed to `barcodesFasta` and the provided + value for this argument must now be single FASTA file instead of a list of barcodes. + - The filenames for the provided FASTQ files must now conform to the format `{name}_R(1|2).fasta`, + where `{name}` is the well identifiers. The well identifiers correspond to the headers + of the FASTA file containing the barcodes (up untill the first whitespace). + Forward and reverse FASTQ files must still be provided in pairs, meaning that the order of + files provided to `input_r1` and `input_r2` remains important. + - The requirement for equal number of barcodes and FASTQ pairs to be provided has been dropped. + Instead, the barcodes provided with `barcodesFasta` are matched to the input FASTQ files by comparing + the header of the FASTA records to the file names of the provided FASTQ input files. Each barcode must + match exactly one FASTQ input pair (forward and reverse reads), but FASTQ files that were not matched to any + barcode are not processed. Basically, the barcodes fasta can now act as a filter for the FASTQ files to be mapped. + * The `utils/groupWells` workflow has been removed. + * `parallel_map_wf` has been removed as its functionality is now incomporated into the `parallel_map` component. + * The `pool`, `well_id`, `barcode`, `lane`, `pair_end` and `n_wells` output arguments have been dropped from the + `well_demultiplexing` workflow. This workflow now only outputs a list of demultiplexed FASTQ files. + * A `well_metadata` workflow has been implemented that extracts the metadata that is no longer output by the `well_demultiplexing` + workflow from the demultiplexed files and the barcodes FASTA. + +## New functionality + +* Multiple input directories can not be provided. The input reads from these from these directories + will be joined per barcode before mapping. This is useful when data has been generated using + multiple sequencing runs in order to increase sequencing depth (PR #38). + +# htrnaseq v0.3.0 + +## New functionality + +* Added `umi_length` argument (PR #27). +* Added `runner` workflow (PR #26, see below) + +## `runner` workflow + +* Removed `wellBarcodesLength` from `parallel_map` workflow (PR #27). + +## Major changes + +A runner workflows has been added, providing two additional features: + +1. Start from an input directory containing fastq files rather than a list of input fastq pairs. +2. Improve the output of the workflow + +### Input directory + +It is now possible to specify a single `--input ` directory as input and the runner will extract the fastq file pairs. An error will be raised if the filename processing leads to errors. + +### Output + +The runner provides a complete different approach to output. A couple of things are important here: + +- Output is split up in 2 parts: + + 1. The well-demultiplexed fastq files (`--fastq_publish_dir`) + 2. All the other results of the workflow (`--results_publish_dir`) + +- The well-demultiplexed fastq file are stored under `--fastq_publish_dir` according to the following format: + + ``` + $fastq_publish_dir/$id/_htrnaseq_/$plate_$lane/_R1/2_001.fastq + ``` + +- The other results are stored under `--results_publish_dir` according to the following format: + + ``` + $results_publish_dir/$project_id/$experiment_id/_htrnaseq_/ + ``` + + This is an example listing of this directory: + + ``` + esets + fData + nrReadsNrGenesPerChrom + pData + report.html + star_output + starLogs + ``` + +This output structure can be circumvented by using the `--output_dir` option, which will store all output in a single directory. + +1. Using the `htrnaseq` workflow directory rather than the `runner` interface +2. Using the argument `--plain_output` with the `runner`. fastq files and other results will still be published in their respective directories, but not in a directory hierarchy as described above. + +## Minor changes + +* Use `v0.2.0` version of cutadapt instead of `main` (PR #23). +* Use `v0.3.0` version of cutadapt +* Bump viash to 0.9.1 (PR #31). +* `create_eset`: Update base container image, `R` version and all dependencies + to newer versions (PR #28). + +# htrnaseq v0.2.0 + +# New functionality + +* Make sure that the Well ID matches the required format (PR #22 and PR #21). + +# htrnaseq v0.1.0 + +Initial release diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f5b784fa --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 OpenPipelines + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 00000000..aed16c90 --- /dev/null +++ b/README.md @@ -0,0 +1,207 @@ + + +# HT-RNAseq + +[![ViashHub](https://img.shields.io/badge/ViashHub-htrnaseq-7a4baa.svg)](https://www.viash-hub.com/packages/htrnaseq) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2Fhtrnaseq-blue.svg)](https://github.com/viash-hub/htrnaseq) +[![GitHub +License](https://img.shields.io/github/license/viash-hub/htrnaseq.svg)](https://github.com/viash-hub/htrnaseq/blob/main/LICENSE) +[![GitHub +Issues](https://img.shields.io/github/issues/viash-hub/htrnaseq.svg)](https://github.com/viash-hub/htrnaseq/issues) +[![Viash +version](https://img.shields.io/badge/Viash-v0.9.4-blue.svg)](https://viash.io) + +## Introduction + +This workflow is designed to process high-throughput RNA-seq data, where +every well of a microarray plate is a sample. A fasta file provided as +input defines the mapping between sample barcodes and wells. + +The workflow is built in a modular fashion, where most of the base +functionality is provided by components from +[`biobox`](https://www.viash-hub.com/packages/biobox/latest) +supplemented by custom base components and workflow components in this +package. + +The full workflow is split in two major subworkflows that can be run +independently: + +- **Well-demultiplexing:** Split the input (plate/pool level) fastq + files per well. +- **Mapping, counting and QC:** Run per-well mapping, counting and + generate QC reports. + +Each of those can be started individually, or the full workflow can be +run in two ways: + +1. Run the [main + workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) + containing the main functionality. +2. Run the [(opinionated) + `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) + where a number of choices (input/output structure and location) have + been made. + +Input for the workflow has to be `fastq` files (zipped or not). For bcl +or other formats, please consider running +[demultiplex](https://www.viash-hub.com/packages/demultiplex) first. + +``` mermaid lang="mermaid" +flowchart TB + subgraph runner [runner] + direction TB + subgraph htrnaseq [HT-RNAseq] + direction LR + demultiplex[Well demultiplexing] + map + report + eset + end + end + + demultiplex --> map --> report --> eset + + class runner container + class htrnaseq container + class demultiplex container-inner + class map container-inner + class report container-inner + class eset container-inner + + class demultiplex node + class map node + class report node + class eset node +``` + +## Example usage + +### Test and example data + +If you want to explore this workflow, it’s possible to the use data we +use as test data: [a DRUGseq +dataset](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176150) +from the [NCBI Sequence Read Archive](https://www.ncbi.nlm.nih.gov/sra). +For the unit and integration tests, this data has been (partly) +subsampled to reduce the test runtime. We used +[seqtk](https://github.com/lh3/seqtk) for this with a seed of 1, e.g.: + +``` bash +seqtk sample -s1 orig/SRR14730302/VH02001614_S8_R1_001.fastq.gz 10000 > 10k/SRR14730302/VH02001614_S8_R1_001.fastq.gz +``` + +This data is available at: `gs://viash-hub-test-data/htrnaseq/v1/`. + +### Run from Viash Hub + +Open [Viash Hub](https://www.viash-hub.com) and browse to the [htrnaseq +component](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq). +Press the ‘Launch’ button and follow the instructions. + +![](assets/htrnaseq-launch-small.png) + +We will start an example run loading just one input and using a barcodes +fasta file containing only 2 wells. + +In the first step, we add the `local` profile to the list of profiles in +order to limit the cpu and memory requirements of the workflow steps: + +![](assets/launch-parameters-1-small.png) + +In the next step, we provide the paramters as follows: + +- `input_r1`: + `gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R1_001.fastq` +- `input_r2`: + `gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R2_001.fastq` +- `genomeDir`: + `gs://viash-hub-test-data/htrnaseq/v1/genomeDir/subset/Homo_sapiens/v0.0.3/` +- `barcodesFasta`: + `gs://viash-hub-test-data/htrnaseq/v1/2-wells-with-ids.fasta` +- `annotation`: + `gs://viash-hub-test-data/htrnaseq/v1/genomeDir/gencode.v41.annotation.gtf.gz` + +Please note that both `input_r1` and `input_r2` can take multiple +values. This means that one has to press ENTER after pasting the input +path. + +![](assets/launch-parameters-2-small.png) + +Press the ‘Launch’ button at the end to get the instructions on how to +run the workflow from the CLI. + +### Run using NF-Tower / Seqera Cloud + +It’s possible to run the workflow directly from [Seqera +Cloud](https://cloud.seqera.io). The necessary [Nextflow schema +file](https://nextflow-io.github.io/nf-schema/latest/nextflow_schema/nextflow_schema_specification/) +has been built and provided with the workflows in order to use the +form-based input. However, Seqera Cloud can not deal with multiple-value +parameters when using the form-based input. Therefore, it’s better to +use Viash Hub also here: + +First, select the option to run the workflow using Seqera Cloud. You +will need to create an API token for your account. Once this token is +filled in in the corresponding field, you will get the option to select +a ‘Workspace’ and a ‘Compute environment’. + +![](assets/launch-parameters-3-small.png) + +Next, we need to fill in the parameters for the run. This is similar to +before: + +![](assets/launch-parameters-4-small.png) + +In the next screen, pressing the ‘Launch’ button will actually start the +workflow on Seqera Cloud. A message is shown when the submit was +successful. + +![](assets/launch-parameters-5-small.png) + +### Run from the CLI + +Running from the CLI directly without using Viash hub is possible. The +easiest is to just use the integrated help functionality, for instance +using the following: + +``` bash + nextflow run https://packages.viash-hub.com/vsh/htrnaseq.git \ + -revision v0.8.1 \ + -main-script target/nextflow/workflows/runner/main.nf \ + --help +``` + +### (Optional) Resource usage tuning + +Nextflow’s labels can be used to specify the amount of resources a +process can use. This workflow uses the following labels for CPU and +memory: + +- `verylowmem`, `lowmem`, `midmem`, `highmem` +- `verylowcpu`, `lowcpu`, `midcpu`, `highcpu` + +The defaults for these labels can be found at +`src/config/labels.config`. Nextflow checks that the specified resources +for a process do not exceed what is available on the machine and will +not start if it does. Create your own config file to tune the labels to +your needs, for example: + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +When starting nextflow using the CLI, you can use `-c` to provide the +file to nextflow and overwrite the defaults. + +## Contributions + +Developed in collaboration with Data Intuitive and Open Analytics. + +Other contributions are welcome. diff --git a/README.qmd b/README.qmd new file mode 100644 index 00000000..0f42f31a --- /dev/null +++ b/README.qmd @@ -0,0 +1,149 @@ +--- +format: gfm +--- + +```{r setup, include=FALSE} +project <- yaml::read_yaml("_viash.yaml") +license <- paste0(project$links$repository, "/blob/main/LICENSE") +contributing <- paste0(project$links$repository, "/blob/main/CONTRIBUTING.md") +``` + +# HT-RNAseq + +[![ViashHub](https://img.shields.io/badge/ViashHub-`r project$name`-7a4baa.svg)](https://www.viash-hub.com/packages/`r project$name`) +[![GitHub](https://img.shields.io/badge/GitHub-viash--hub%2F`r project$name`-blue.svg)](`r project$links$repository`) +[![GitHub License](https://img.shields.io/github/license/viash-hub/`r project$name`.svg)](`r license`) +[![GitHub Issues](https://img.shields.io/github/issues/viash-hub/`r project$name`.svg)](`r project$links$issue_tracker`) +[![Viash version](https://img.shields.io/badge/Viash-v`r gsub("-", "--", project$viash_version)`-blue.svg)](https://viash.io) + +## Introduction + +`r project$description` + + +```{mermaid lang='mermaid'} +flowchart TB + subgraph runner [runner] + direction TB + subgraph htrnaseq [HT-RNAseq] + direction LR + demultiplex[Well demultiplexing] + map + report + eset + end + end + + demultiplex --> map --> report --> eset + + class runner container + class htrnaseq container + class demultiplex container-inner + class map container-inner + class report container-inner + class eset container-inner + + class demultiplex node + class map node + class report node + class eset node +``` + + +## Example usage + +### Test and example data + +If you want to explore this workflow, it's possible to the use data we use as test data: [a DRUGseq dataset](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE176150) from the [NCBI Sequence Read Archive](https://www.ncbi.nlm.nih.gov/sra). For the unit and integration tests, this data has been (partly) subsampled to reduce the test runtime. We used [seqtk](https://github.com/lh3/seqtk) for this with a seed of 1, e.g.: + +```bash +seqtk sample -s1 orig/SRR14730302/VH02001614_S8_R1_001.fastq.gz 10000 > 10k/SRR14730302/VH02001614_S8_R1_001.fastq.gz +``` + +This data is available at: `gs://viash-hub-test-data/htrnaseq/v1/`. + +### Run from Viash Hub + +Open [Viash Hub](https://www.viash-hub.com) and browse to the [htrnaseq component](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq). Press the 'Launch' button and follow the instructions. + + +![](assets/htrnaseq-launch-small.png) + +We will start an example run loading just one input and using a barcodes fasta file containing only 2 wells. + +In the first step, we add the `local` profile to the list of profiles in order to limit the cpu and memory requirements of the workflow steps: + + +![](assets/launch-parameters-1-small.png) + +In the next step, we provide the paramters as follows: + +- `input_r1`: `gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R1_001.fastq` +- `input_r2`: `gs://viash-hub-test-data/htrnaseq/v1/100k/SRR14730301/VH02001612_S9_R2_001.fastq` +- `genomeDir`: `gs://viash-hub-test-data/htrnaseq/v1/genomeDir/subset/Homo_sapiens/v0.0.3/` +- `barcodesFasta`: `gs://viash-hub-test-data/htrnaseq/v1/2-wells-with-ids.fasta` +- `annotation`: `gs://viash-hub-test-data/htrnaseq/v1/genomeDir/gencode.v41.annotation.gtf.gz` + +Please note that both `input_r1` and `input_r2` can take multiple values. This means that one has to press ENTER after pasting the input path. + +![](assets/launch-parameters-2-small.png) + +Press the 'Launch' button at the end to get the instructions on how to run the workflow from the CLI. + + +### Run using NF-Tower / Seqera Cloud + +It's possible to run the workflow directly from [Seqera Cloud](https://cloud.seqera.io). The necessary [Nextflow schema file](https://nextflow-io.github.io/nf-schema/latest/nextflow_schema/nextflow_schema_specification/) has been built and provided with the workflows in order to use the form-based input. However, Seqera Cloud can not deal with multiple-value parameters when using the form-based input. Therefore, it's better to use Viash Hub also here: + +First, select the option to run the workflow using Seqera Cloud. You will need to create an API token for your account. Once this token is filled in in the corresponding field, you will get the option to select a 'Workspace' and a 'Compute environment'. + +![](assets/launch-parameters-3-small.png) + +Next, we need to fill in the parameters for the run. This is similar to before: + +![](assets/launch-parameters-4-small.png) + +In the next screen, pressing the 'Launch' button will actually start the workflow on Seqera Cloud. A message is shown when the submit was successful. + +![](assets/launch-parameters-5-small.png) + +### Run from the CLI + +Running from the CLI directly without using Viash hub is possible. The easiest is to just use the integrated help functionality, for instance using the following: + +```bash + nextflow run https://packages.viash-hub.com/vsh/htrnaseq.git \ + -revision v0.8.1 \ + -main-script target/nextflow/workflows/runner/main.nf \ + --help +``` + +### (Optional) Resource usage tuning + +Nextflow's labels can be used to specify the amount of resources a process can use. This workflow uses the following labels for CPU and memory: + +* `verylowmem`, `lowmem`, `midmem`, `highmem` +* `verylowcpu`, `lowcpu`, `midcpu`, `highcpu` + +The defaults for these labels can be found at `src/config/labels.config`. Nextflow checks that the specified resources for a process do not exceed what is available on the machine and will not start if it does. Create your own config file to tune the labels to your needs, for example: + +``` +// Resource labels +withLabel: verylowcpu { cpus = 2 } +withLabel: lowcpu { cpus = 8 } +withLabel: midcpu { cpus = 16 } +withLabel: highcpu { cpus = 32 } + +withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } +withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } +withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } +withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } +``` + +When starting nextflow using the CLI, you can use `-c` to provide the file to nextflow and overwrite the defaults. + +## Contributions + +Developed in collaboration with Data Intuitive and Open Analytics. + +Other contributions are welcome. diff --git a/_viash.yaml b/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/assets/htrnaseq-launch-small.png b/assets/htrnaseq-launch-small.png new file mode 100644 index 00000000..c0872d6f Binary files /dev/null and b/assets/htrnaseq-launch-small.png differ diff --git a/assets/htrnaseq-launch.png b/assets/htrnaseq-launch.png new file mode 100644 index 00000000..bcbd2f25 Binary files /dev/null and b/assets/htrnaseq-launch.png differ diff --git a/assets/launch-parameters-1-small.png b/assets/launch-parameters-1-small.png new file mode 100644 index 00000000..6fa7ef94 Binary files /dev/null and b/assets/launch-parameters-1-small.png differ diff --git a/assets/launch-parameters-1.png b/assets/launch-parameters-1.png new file mode 100644 index 00000000..5a76b1fd Binary files /dev/null and b/assets/launch-parameters-1.png differ diff --git a/assets/launch-parameters-2-small.png b/assets/launch-parameters-2-small.png new file mode 100644 index 00000000..7c4a9eff Binary files /dev/null and b/assets/launch-parameters-2-small.png differ diff --git a/assets/launch-parameters-2.png b/assets/launch-parameters-2.png new file mode 100644 index 00000000..3d8dc5b5 Binary files /dev/null and b/assets/launch-parameters-2.png differ diff --git a/assets/launch-parameters-3-small.png b/assets/launch-parameters-3-small.png new file mode 100644 index 00000000..7e23730b Binary files /dev/null and b/assets/launch-parameters-3-small.png differ diff --git a/assets/launch-parameters-3.png b/assets/launch-parameters-3.png new file mode 100644 index 00000000..cdb842c5 Binary files /dev/null and b/assets/launch-parameters-3.png differ diff --git a/assets/launch-parameters-4-small.png b/assets/launch-parameters-4-small.png new file mode 100644 index 00000000..4f9b7703 Binary files /dev/null and b/assets/launch-parameters-4-small.png differ diff --git a/assets/launch-parameters-4.png b/assets/launch-parameters-4.png new file mode 100644 index 00000000..2289f7c6 Binary files /dev/null and b/assets/launch-parameters-4.png differ diff --git a/assets/launch-parameters-5-small.png b/assets/launch-parameters-5-small.png new file mode 100644 index 00000000..a6e2700c Binary files /dev/null and b/assets/launch-parameters-5-small.png differ diff --git a/assets/launch-parameters-5.png b/assets/launch-parameters-5.png new file mode 100644 index 00000000..abbc1f75 Binary files /dev/null and b/assets/launch-parameters-5.png differ diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..5b3d280c --- /dev/null +++ b/main.nf @@ -0,0 +1,3 @@ +workflow { +print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.") +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..fea717b0 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,12 @@ +manifest { + homePage = 'https://github.com/viash-hub/htrnaseq' + description = 'HT-RNAseq pipeline' + mainScript = 'target/nextflow/workflows/htrnaseq/main.nf' +} + +process { + withName: publishStatesProc { + publishDir = [ enabled: false ] + } +} + diff --git a/src/base/authors/dries_schaumont.yaml b/src/base/authors/dries_schaumont.yaml new file mode 100644 index 00000000..c8eaa47d --- /dev/null +++ b/src/base/authors/dries_schaumont.yaml @@ -0,0 +1,11 @@ +name: Dries Schaumont +info: + links: + email: dries@data-intuitive.com + github: DriesSchaumont + orcid: "0000-0002-4389-0440" + linkedin: dries-schaumont + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist \ No newline at end of file diff --git a/src/base/authors/marijke_van_moerbeke.yaml b/src/base/authors/marijke_van_moerbeke.yaml new file mode 100644 index 00000000..429bd926 --- /dev/null +++ b/src/base/authors/marijke_van_moerbeke.yaml @@ -0,0 +1,10 @@ +name: Marijke Van Moerbeke +info: + links: + github: mvanmoerbeke + orcid: 0000-0002-3097-5621 + linkedin: marijke-van-moerbeke-84303a34 + organizations: + - name: OpenAnalytics + href: https://www.openanalytics.eu + role: Statistical Consultant \ No newline at end of file diff --git a/src/base/authors/toni_verbeiren.yaml b/src/base/authors/toni_verbeiren.yaml new file mode 100644 index 00000000..aa78550d --- /dev/null +++ b/src/base/authors/toni_verbeiren.yaml @@ -0,0 +1,10 @@ +name: Toni Verbeiren +info: + role: Core Team Member + links: + github: tverbeiren + linkedin: verbeiren + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Data Scientist and CEO \ No newline at end of file diff --git a/src/config/labels.config b/src/config/labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/src/config/labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/src/eset/create_eset/config.vsh.yaml b/src/eset/create_eset/config.vsh.yaml new file mode 100644 index 00000000..f7837482 --- /dev/null +++ b/src/eset/create_eset/config.vsh.yaml @@ -0,0 +1,56 @@ +name: create_eset +namespace: "eset" +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ author ] +argument_groups: + - name: "Arguments" + arguments: + - type: file + name: "--pDataFile" + required: true + - type: file + name: "--fDataFile" + required: true + - type: file + name: "--mappingDir" + multiple: true + required: true + - type: string + name: --poolName + required: true + - name: "--output" + type: file + required: true + direction: output + default: eset.$id.rds +resources: + - type: r_script + path: script.R +test_resources: + - type: r_script + path: test.R + - path: test_data/pData.tsv + - path: test_data/fData.tsv + - path: test_data/mapping_dir +engines: + - type: docker + image: rocker/r2u:24.04 + setup: + - type: r + cran: + - data.table + - nlcv + bioc: + - Seurat + test_setup: + - type: r + cran: + - testthat +runners: + - type: executable + - type: nextflow + + \ No newline at end of file diff --git a/src/eset/create_eset/script.R b/src/eset/create_eset/script.R new file mode 100644 index 00000000..bab5d6d4 --- /dev/null +++ b/src/eset/create_eset/script.R @@ -0,0 +1,431 @@ +library(Biobase) +library(data.table) +library(nlcv) +library(Matrix) +library(Seurat) + +### VIASH START +par <- list( + pDataFile = "src/eset/create_eset/test_data/pData.tsv", + fDataFile = "src/eset/create_eset/test_data/fData.tsv", + studyType = "Standard", + mappingDir = c("src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC", + "src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT"), + output = "eset.rds", + poolName = "Foo" +) +### VIASH END + + +Read10X <- function(data_dir = NULL, gene_column = 2, unique_features = TRUE) { + full.data <- list() + for (i in seq_along(along.with = data_dir)) { + run <- data_dir[i] + if (!dir.exists(paths = run)) { + stop("Directory provided does not exist") + } + barcode.loc <- file.path(run, "barcodes.tsv") + gene.loc <- file.path(run, "features.tsv") + features.loc <- file.path(run, "features.tsv.gz") + matrix.loc <- file.path(run, "matrix.mtx") + pre_ver_3 <- file.exists(gene.loc) + if (!pre_ver_3) { + addgz <- function(s) { + return(paste0(s, ".gz")) + } + barcode.loc <- addgz(s = barcode.loc) + matrix.loc <- addgz(s = matrix.loc) + } + if (!file.exists(barcode.loc)) { + stop("Barcode file missing") + } + if (!pre_ver_3 && !file.exists(features.loc)) { + stop("Gene name or features file missing") + } + if (!file.exists(matrix.loc)) { + stop("Expression matrix file missing") + } + data <- readMM(file = matrix.loc) + cell.names <- readLines(barcode.loc) + if (all(grepl(pattern = "\\-1$", x = cell.names))) { + cell.names <- as.vector(x = as.character(x = sapply(X = cell.names, + FUN = ExtractField, field = 1, delim = "-"))) + } + if (is.null(x = names(x = data_dir))) { + if (i < 2) { + colnames(x = data) <- cell.names + } + else { + colnames(x = data) <- paste0(i, "_", cell.names) + } + } + else { + colnames(x = data) <- paste0(names(x = data_dir)[i], + "_", cell.names) + } + feature.names <- read.delim(file = ifelse(test = pre_ver_3, + yes = gene.loc, no = features.loc), header = FALSE, + stringsAsFactors = FALSE) + if (any(is.na(x = feature.names[, gene_column]))) { + warning("Some features names are NA. Replacing NA names with ID from the opposite column requested", + call. = FALSE, immediate. = TRUE) + na.features <- which(x = is.na(x = feature.names[, + gene_column])) + replacement.column <- ifelse(test = gene_column == + 2, yes = 1, no = 2) + feature.names[na.features, gene_column] <- feature.names[na.features, + replacement.column] + } + if (unique_features) { + fcols = ncol(x = feature.names) + if (fcols < gene_column) { + stop(paste0("gene_column was set to ", gene_column, + " but feature.tsv.gz (or genes.tsv) only has ", + fcols, " columns.", " Try setting the gene_column ", + "argument to a value <= to ", + fcols, ".")) + } + rownames(x = data) <- make.unique(names = feature.names[, + gene_column]) + } + if (ncol(x = feature.names) > 2) { + data_types <- factor(x = feature.names$V3) + lvls <- levels(x = data_types) + if (length(x = lvls) > 1 && length(x = full.data) == 0) { + message(paste0("10X data contains more than one type and is ", + "being returned as a list containing matrices ", + "of each type.")) + } + expr_name <- "Gene Expression" + if (expr_name %in% lvls) { + lvls <- c(expr_name, lvls[-which(x = lvls == + expr_name)]) + } + data <- lapply(X = lvls, FUN = function(l) { + return(data[data_types == l, , drop = FALSE]) + }) + names(x = data) <- lvls + } else { + data <- list(data) + } + full.data[[length(x = full.data) + 1]] <- data + } + list_of_data <- list() + for (j in 1:length(x = full.data[[1]])) { + list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, + FUN = `[[`, j)) + list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix") + } + names(x = list_of_data) <- names(x = full.data[[1]]) + if (length(x = list_of_data) == 1) { + return(list_of_data[[1]]) + } else { + return(list_of_data) + } +} + +match_features <- function(exprs_matrix, fdata) { + + identical_features <- all(rownames(exprs_matrix) == rownames(fdata)) + + if (nrow(exprs_matrix) != nrow(fdata) || !identical_features) { + message(paste0("Features in 'fData' and expression matrix differ. ", + "Only matching features are returned.")) + } + + features <- intersect(rownames(exprs_matrix), rownames(fdata)) + exprs_matrix <- exprs_matrix[which(rownames(exprs_matrix) %in% features), ] + fdata <- fdata[which(rownames(fdata) %in% features), ] + + fdata[, seq_len(ncol(fdata))] <- lapply(fdata[, seq_len(ncol(fdata)), drop = FALSE], as.character) + # order features in exprs mat according to fdata + exprs_matrix <- exprs_matrix[match(rownames(fdata), rownames(exprs_matrix)), ] + + list(exprs_matrix = exprs_matrix, fdata = fdata) + +} + + +create_pdata <- function(sample_file, pool_name, barcodes) { + cols_to_remove <- c("SampleFileName", "Output", "Measure", "Strandedness") + pData <- sample_file[, !colnames(sample_file) %in% cols_to_remove, + drop = FALSE] + rownames(pData) <- lapply(sample_file$WellBC, + \(x) paste(pool_name, x, sep = "_")) + # pData[, ] <- lapply(pData, as.factor) + pData$PoolName <- pool_name + pData <- pData[match(barcodes, pData$WellBC), ] + return(pData) +} + +check_sample_file <- function(mapping_dir, sample_file){ + + message("Checking sample annotation:") + + requireNamespace("tools") + mapping_dir <- unlist(lapply(mapping_dir, function(x) { + if (!dir.exists(x)) { + stop(sprintf(paste0("Could not find directory ", + "provided in 'mappingDir' argument (%s)."), x)) + } + tools::file_path_as_absolute(x) + })) + + + # additional check for STARsolo + check_STARsolo_output <- function(x) { + files <- c("barcodes.tsv", "features.tsv", "matrix.mtx") + test <- list.files(x) %in% c(files, paste0(files, ".gz")) + length(test) != 0 && all(test) + } + + + if (!"WellBC" %in% colnames(sample_file)) { + stop(paste0("STARsolo output is used. The sample annotation must ", + "contain 'WellBC' column providing cell barcodes.")) + } + + mapping_dir <- unique(mapping_dir) + all_STARsolo_files_present <- all( + unlist( + lapply(mapping_dir, function(x) { + check_STARsolo_output(x) + }) + ) + ) + if (!all_STARsolo_files_present) { + stop(paste0("Could not find files: 'barcodes', 'features' and 'matrix'", + " for STARsolo output. Please check 'mappingDir' argument.")) + } + + message("- 'SampleFileName' column - OK") + + + + list(sample_expression_files = mapping_dir) +} + +create_exprs_matrix <- function(exprs_matrix_path, exprs_file_paths, + output, measure, col_names, cell_barcodes) { + + read_matrix <- Read10X(data_dir = exprs_file_paths, gene_column = 1) + # keep index of feature names containing "_" because Seurat + #changes them to "-" and they no longer match with fdata[, "gene_id"] + idx <- grep("_", rownames(read_matrix)) + + requireNamespace("Seurat") + seurat_object <- Seurat::CreateSeuratObject(counts = read_matrix) + + exprs_matrix <- as.matrix(seurat_object[['RNA']]$counts) + # replace "-" with "_" for features with "_" + # before converting to Seurat object + rownames(exprs_matrix)[idx] <- gsub("-", "_", rownames(exprs_matrix)[idx]) + requireNamespace("stringr") + exprs_matrix <- exprs_matrix[, stringr::str_detect(colnames(exprs_matrix), + paste(cell_barcodes, collapse = "|"))] + + + # check if rownames are ENSEMBL and remove version suffix + isENSEMBL <- all(grepl("ENS", rownames(exprs_matrix))) + if (isENSEMBL) { + # do not use gsub("(.+)[.]\\d+", "\\1", rownames(exprs_matrix)), + # so that ENS000000.1_PAR_Y can be kept + rownames(exprs_matrix) <- gsub("\\.\\d+$", "", rownames(exprs_matrix)) + } + + + colnames(exprs_matrix) <- col_names + + exprs_matrix +} + +create_eset <- function(feature_annotation_path, + sample_annotation_path, + mapping_dir, + barcodes, + output_path, + pool_name, + exprs_matrix_path = NULL, + path = NULL, + add_eset_annotation = NULL) { + if (!file.exists(feature_annotation_path)) { + stop("Could not find feature annotation at '", feature_annotation_path, "'") + } + + if (!file.exists(sample_annotation_path)) { + stop("Could not find sample annotation at '", sample_annotation_path, "'") + } + + if(!is.null(exprs_matrix_path)) { + if(!file.exists(exprs_matrix_path)) { + stop("Could not find expression matrix at '", exprs_matrix_path, "'") + } + } + + if(!is.null(path)) { + if(!dir.exists(path)) { + stop("Provided 'path': '", path, "' does not exist.") + } + } + + ##### Import annotation files ##### + message("Importing feature annotation") + fdata_file <- read.table(feature_annotation_path, header = TRUE, + sep = "\t", quote = "\"", + comment.char = "", stringsAsFactors = FALSE) + + # for backwards compatibility + if("ENSEMBL" %in% colnames(fdata_file) && !all(grepl("ENS", fdata_file[, "ENSEMBL"])) & !"gene_id" %in% colnames(fdata_file)) { + colnames(fdata_file)[which(colnames(fdata_file) == "ENSEMBL")] <- "gene_id" + } + + # Check gene annotation + if(!"gene_id" %in% colnames(fdata_file)) + stop("'gene_id' column with unique feature identifiers must be present in 'feature_annotation_path'.") + + # check if duplicated ids are present + if(any(duplicated(fdata_file$gene_id))) + stop("Duplicated features ids are not allowed. Please check the 'gene_id' column in 'feature_annotation_path'.") + + message("Importing sample annotation") + sample_file <- read.table(sample_annotation_path, header = TRUE, + sep = "\t", quote = "\"", + comment.char = "", stringsAsFactors = FALSE) + # Check sample annotation + check_sample_file_list <- check_sample_file(mapping_dir = mapping_dir, + sample_file = sample_file) + output <- "STARsolo" + measure <- "counts" + sample_expression_files <- check_sample_file_list$sample_expression_files + + ##### Create phenodata ##### + pdata_eset <- create_pdata(sample_file = sample_file, pool_name = pool_name, + barcodes = barcodes) + + ##### Create expression matrix ##### + message("Creating expression matrix") + + exprs_matrix_eset <- create_exprs_matrix( + exprs_matrix_path = exprs_matrix_path, + exprs_file_paths = sample_expression_files, + output = output, + measure = measure, + col_names = rownames(pdata_eset), + cell_barcodes = barcodes + ) + + + ##### Create featuredata ##### + message("Creating feature data") + + fdata_eset <- fdata_file + rownames(fdata_eset) <- fdata_eset[, "gene_id"] + + # intersect features between exprs matrix and fdata + feature_files <- match_features(exprs_matrix = exprs_matrix_eset, + fdata = fdata_eset) + + fdata_eset <- feature_files$fdata + exprs_matrix_eset <- feature_files$exprs_matrix + + ##### Create eSet ##### + message("Creating eset") + + if (nrow(pdata_eset) != ncol(exprs_matrix_eset)) { + stop("nrow(pData) and ncol(exprsMatrix) differ") + } + + if (nrow(fdata_eset) != nrow(exprs_matrix_eset)) { + stop("nrow(fData) and nrow(exprsMatrix) differ") + } + + if (!all(rownames(pdata_eset) == colnames(exprs_matrix_eset))) { + stop("rownames(pData) and colnames(exprsMatrix) differ") + } + + if (!all(rownames(fdata_eset) == rownames(exprs_matrix_eset))) { + stop("rownames(fData) and rownames(exprsMatrix) differ") + } + + if (!inherits(exprs_matrix_eset, "matrix")) { + stop("exprsMatrix must be of class 'matrix'") + } + + + + additional_info <- paste0("Additional information about eSet \n", + " Expression matrix created from ", + output, " output. \n", + " Expression matrix contains non-transformed ", + ifelse(output %in% c("STAR", "STARsolo"), + "counts", + ifelse(measure == "expected_count", + "counts", measure)), ".") + + + if (isTRUE(!is.null(add_eset_annotation) & + is.character(add_eset_annotation))) { + additional_info <- paste0(additional_info, "\n", " ", add_eset_annotation) + } + + fdata_eset <- new("AnnotatedDataFrame", data = fdata_eset) + pdata_eset <- new("AnnotatedDataFrame", data = pdata_eset) + + requireNamespace("Biobase") + eset <- Biobase::ExpressionSet(assayData = exprs_matrix_eset, + phenoData = pdata_eset, + featureData = fdata_eset, + annotation = additional_info) + + eset <- eset[, colSums(exprs(eset)) != 0] + saveRDS(eset, file = output_path) + + message(paste0("eset created succesfully for ", ncol(eset), + " samples and ", nrow(eset), + " genes and saved at ", output_path, ".")) + + eset +} + + +p_data_file <- par$pDataFile +f_data_file <- par$fDataFile +pool_name <- par$poolName +mapping_dir <- lapply(par$mappingDir, + \(x) file.path(x, "Solo.out", "Gene", "raw")) + +get_barcode_from_mapping_dir <- function(raw_dir) { + barcodes_file <- file.path(raw_dir, "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + raw_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", raw_dir)) + } + return(barcodes) +} + +barcodes <- lapply(mapping_dir, get_barcode_from_mapping_dir) + +print(paste0("mappingDir: ", mapping_dir)) +print(paste0("pDataFile: ", p_data_file)) +print(paste0("fDataFile: ", f_data_file)) +print(paste0("poolName: ", pool_name)) +print(paste0("barcodes: ", barcodes)) + + + +# CREATE ESET WITH RAW UMI COUNTS + +eset <- create_eset(feature_annotation_path = f_data_file, + sample_annotation_path = p_data_file, + mapping_dir = mapping_dir, + barcodes = barcodes, + output_path = par$output, + pool_name = pool_name, + path = NULL, + exprs_matrix_path = NULL) \ No newline at end of file diff --git a/src/eset/create_eset/test.R b/src/eset/create_eset/test.R new file mode 100644 index 00000000..bc0b5ce5 --- /dev/null +++ b/src/eset/create_eset/test.R @@ -0,0 +1,132 @@ +library(testthat) +library(Biobase) + +### VIASH START +meta <- list( + resources_dir = "src/eset/create_eset/test_data", + executable = "target/executable/eset/create_eset/create_eset" +) + +### VIASH END + +output <- tempfile() + +out <- processx::run(meta$executable, c( + "--pDataFile", file.path(meta$resources_dir, "pData.tsv"), + "--fDataFile", file.path(meta$resources_dir, "fData.tsv"), + "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "AACAAGGTAC"), + "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "ACGCCTTCGT"), + "--poolName", "foo", + "--output", output +)) +expect_equal(out$status, 0) +expect_true(file.exists(output)) +result <- readRDS(output) +stopifnot(length(sampleNames(result)) == 2) +stopifnot(all(sampleNames(result) == c("foo_AACAAGGTAC", "foo_ACGCCTTCGT"))) +expected_feature_names <- c( + "ENS0001058", "ENS0000221", "ENS0001387", "ENS0000508", "ENS0001199", + "ENS0000477", "ENS0001457", "ENS0001040", "ENS0000114", "ENS0000821", + "ENS0001429", "ENS0001396", "ENS0000355", "ENS0000122", "ENS0000441", + "ENS0001223", "ENS0001431", "ENS0000042", "ENS0000443", "ENS0000389", + "ENS0001208", "ENS0001140", "ENS0000071", "ENS0001369" +) + +stopifnot(length(featureNames(result)) == 24) +stopifnot(all(featureNames(result) == expected_feature_names)) +expected_expressions <- matrix( + c(0, 0, + 0, 40, + 0, 0, + 0, 0, + 1, 2, + 0, 0, + 0, 0, + 0, 0, + 2, 2, + 0, 0, + 0, 0, + 8, 2, + 0, 0, + 1, 0, + 2, 3, + 0, 0, + 0, 0, + 0, 0, + 1, 0, + 0, 0, + 16, 13, + 0, 0, + 12, 13, + 5, 2), + ncol = 2, + nrow = 24, + byrow = TRUE, +) +rownames(expected_expressions) <- expected_feature_names +colnames(expected_expressions) <- c("foo_AACAAGGTAC", "foo_ACGCCTTCGT") +stopifnot(identical(exprs(result), expected_expressions)) + +input_f_data <- read.table(file.path(meta$resources_dir, "fData.tsv"), + sep = "\t", quote = "\"", comment.char = "", + header = TRUE) +input_f_data <- input_f_data[input_f_data$gene_id %in% expected_feature_names, ] +row.names(input_f_data) <- input_f_data$gene_id +input_f_data[] <- lapply(input_f_data, as.character) +stopifnot(identical(input_f_data, fData(result))) + +# Check results filtering of barcodes with no reads +out <- processx::run(meta$executable, c( + "--pDataFile", file.path(meta$resources_dir, "pData.tsv"), + "--fDataFile", file.path(meta$resources_dir, "fData.tsv"), + "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "AACAAGGTAC"), + "--mappingDir", file.path(meta$resources_dir, "mapping_dir", "EMPTY"), + "--poolName", "bar", + "--output", output +)) +expect_equal(out$status, 0) +expect_true(file.exists(output)) +result <- readRDS(output) +stopifnot(length(sampleNames(result)) == 1) +stopifnot(all(sampleNames(result) == c("bar_AACAAGGTAC"))) +expected_feature_names <- c( + "ENS0001058", "ENS0000221", "ENS0001387", "ENS0000508", "ENS0001199", + "ENS0000477", "ENS0001457", "ENS0001040", "ENS0000114", "ENS0000821", + "ENS0001429", "ENS0001396", "ENS0000355", "ENS0000122", "ENS0000441", + "ENS0001223", "ENS0001431", "ENS0000042", "ENS0000443", "ENS0000389", + "ENS0001208", "ENS0001140", "ENS0000071", "ENS0001369" +) +stopifnot(length(featureNames(result)) == 24) +stopifnot(all(featureNames(result) == expected_feature_names)) +expected_expressions <- matrix( + c(0, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 2, + 0, + 0, + 8, + 0, + 1, + 2, + 0, + 0, + 0, + 1, + 0, + 16, + 0, + 12, + 5), + ncol = 1, + nrow = 24, + byrow = TRUE, +) +rownames(expected_expressions) <- expected_feature_names +colnames(expected_expressions) <- c("bar_AACAAGGTAC") +stopifnot(identical(exprs(result), expected_expressions)) \ No newline at end of file diff --git a/src/eset/create_eset/test_data/fData.tsv b/src/eset/create_eset/test_data/fData.tsv new file mode 100644 index 00000000..cac8bdd1 --- /dev/null +++ b/src/eset/create_eset/test_data/fData.tsv @@ -0,0 +1,1460 @@ +seqname start end strand gene_biotype gene_id gene_name gene_source gene_version transcript_id ENSEMBL_with_version ENSEMBL SYMBOL +20 87250 97094 + protein_coding ENS0000878 209E3 ensembl_havana 7.0 ENS0000878 ENS0000878 209E3 +20 142590 145751 + protein_coding ENS0000554 A2B9A ensembl_havana 6.0 ENS0000554 ENS0000554 A2B9A +20 157454 159163 + protein_coding ENS0001430 CF168 ensembl_havana 5.0 ENS0001430 ENS0001430 CF168 +20 187853 189711 - protein_coding ENS0000109 3BA5A ensembl_havana 7.0 ENS0000109 ENS0000109 3BA5A +20 227258 229886 + protein_coding ENS0000017 1C968 ensembl_havana 4.0 ENS0000017 ENS0000017 1C968 +20 257724 261096 + protein_coding ENS0000803 E5192 ensembl_havana 5.0 ENS0000803 ENS0000803 E5192 +20 267186 268857 + lncRNA ENS0000998 1821B havana 1.0 ENS0000998 ENS0000998 1821B +20 270863 290778 - protein_coding ENS0001362 5AD11 ensembl_havana 12.0 ENS0001362 ENS0001362 5AD11 +20 297570 300321 + protein_coding ENS0001168 3F0FF havana 4.0 ENS0001168 ENS0001168 3F0FF +20 311731 313237 + lncRNA ENS0000013 265F2 havana 1.0 ENS0000013 ENS0000013 265F2 +20 316860 348490 - lncRNA ENS0000057 28A43 havana_tagene 6.0 ENS0000057 ENS0000057 28A43 +20 325595 330224 + protein_coding ENS0000753 58E28 havana 8.0 ENS0000753 ENS0000753 58E28 +20 346782 359660 + protein_coding ENS0000400 6E614 ensembl_havana 13.0 ENS0000400 ENS0000400 6E614 +20 362835 397559 + protein_coding ENS0001378 EA941 ensembl_havana 11.0 ENS0001378 ENS0001378 EA941 +20 407498 432139 + protein_coding ENS0000572 99DDC ensembl_havana 21.0 ENS0000572 ENS0000572 99DDC +20 435480 462543 - protein_coding ENS0000394 AFCC0 ensembl_havana 14.0 ENS0000394 ENS0000394 AFCC0 +20 450663 450762 + misc_RNA ENS0000532 B58E5 ensembl 1.0 ENS0000532 ENS0000532 B58E5 +20 472498 543835 - protein_coding ENS0000524 7A6C3 ensembl_havana 19.0 ENS0000524 ENS0000524 7A6C3 +20 603797 610398 - protein_coding ENS0001058 65424 ensembl_havana 6.0 ENS0001058 ENS0001058 65424 +20 646615 653200 - protein_coding ENS0001164 077A2 ensembl_havana 2.0 ENS0001164 ENS0001164 077A2 +20 646626 675800 - protein_coding ENS0001177 22A4F havana 1.0 ENS0001177 ENS0001177 22A4F +20 661596 675802 - protein_coding ENS0001548 981E6 ensembl_havana 4.0 ENS0001548 ENS0001548 981E6 +20 760080 776015 - protein_coding ENS0000708 E2D99 ensembl_havana 18.0 ENS0000708 ENS0000708 E2D99 +20 833715 857463 + protein_coding ENS0001350 D90E9 ensembl_havana 13.0 ENS0001350 ENS0001350 D90E9 +20 839447 839977 - processed_pseudogene ENS0000607 97B0F havana 6.0 ENS0000607 ENS0000607 97B0F +20 869900 916334 - protein_coding ENS0000967 F2DD3 ensembl_havana 8.0 ENS0000967 ENS0000967 F2DD3 +20 958452 1002311 - protein_coding ENS0001131 61380 ensembl_havana 9.0 ENS0001131 ENS0001131 61380 +20 1023874 1118467 - lncRNA ENS0000183 DC6B0 havana 1.0 ENS0000183 ENS0000183 DC6B0 +20 1113240 1189415 + protein_coding ENS0000903 7EE8B ensembl_havana 18.0 ENS0000903 ENS0000903 7EE8B +20 1160584 1161697 + processed_pseudogene ENS0000705 51CA8 havana 3.0 ENS0000705 ENS0000705 51CA8 +20 1180561 1185415 - protein_coding ENS0000065 05677 ensembl_havana 5.0 ENS0000065 ENS0000065 05677 +20 1186092 1207036 + lncRNA ENS0001189 69BC1 havana 1.0 ENS0001189 ENS0001189 69BC1 +20 1203454 1208279 + protein_coding ENS0000325 C25F4 ensembl_havana 2.0 ENS0000325 ENS0000325 C25F4 +20 1226056 1296421 + protein_coding ENS0001288 4178C ensembl_havana 5.0 ENS0001288 ENS0001288 4178C +20 1266280 1309328 + protein_coding ENS0000560 0BBDD ensembl_havana 15.0 ENS0000560 ENS0000560 0BBDD +20 1309909 1329139 - protein_coding ENS0000964 CB4D6 ensembl_havana 15.0 ENS0000964 ENS0000964 CB4D6 +20 1317571 1393096 - protein_coding ENS0000262 57498 havana 1.0 ENS0000262 ENS0000262 57498 +20 1320710 1324367 + TEC ENS0000514 6C6F2 havana 1.0 ENS0000514 ENS0000514 6C6F2 +20 1325405 1378735 + lncRNA ENS0000561 54645 havana 7.0 ENS0000561 ENS0000561 54645 +20 1361622 1362585 + lncRNA ENS0000081 49F73 havana 1.0 ENS0000081 ENS0000081 49F73 +20 1368977 1393164 - protein_coding ENS0000125 06230 ensembl_havana 18.0 ENS0000125 ENS0000125 06230 +20 1392900 1392961 - miRNA ENS0001448 35A3E mirbase 1.0 ENS0001448 ENS0001448 35A3E +20 1442162 1473842 - protein_coding ENS0000053 E5E0A ensembl_havana 17.0 ENS0000053 ENS0000053 E5E0A +20 1470741 1491587 - protein_coding ENS0000762 10CCD ensembl_havana 13.0 ENS0000762 ENS0000762 10CCD +20 1516759 1557653 - unprocessed_pseudogene ENS0001478 E567E havana 1.0 ENS0001478 ENS0001478 E567E +20 1529056 1529159 - snRNA ENS0000078 BACB5 ensembl 1.0 ENS0000078 ENS0000078 BACB5 +20 1534251 1557705 - protein_coding ENS0000835 4F4C7 ensembl_havana 13.0 ENS0000835 ENS0000835 4F4C7 +20 1540144 1620009 - protein_coding ENS0000021 FBE60 havana 6.0 ENS0000021 ENS0000021 FBE60 +20 1561385 1620061 - protein_coding ENS0001305 73049 ensembl_havana 16.0 ENS0001305 ENS0001305 73049 +20 1591392 1591848 + processed_pseudogene ENS0000014 9CE7A havana 1.0 ENS0000014 ENS0000014 9CE7A +20 1629152 1657779 - protein_coding ENS0001403 39174 ensembl_havana 14.0 ENS0001403 ENS0001403 39174 +20 1633508 1648472 + lncRNA ENS0000075 3B97C havana 5.0 ENS0000075 ENS0000075 3B97C +20 1665283 1708200 - unprocessed_pseudogene ENS0000562 F2F34 havana 1.0 ENS0000562 ENS0000562 F2F34 +20 1666360 1666640 + misc_RNA ENS0000556 0D5C2 ensembl 3.0 ENS0000556 ENS0000556 0D5C2 +20 1676662 1676907 - unprocessed_pseudogene ENS0000479 87B56 havana 1.0 ENS0000479 ENS0000479 87B56 +20 1722045 1722537 - unprocessed_pseudogene ENS0001504 95110 havana 1.0 ENS0001504 ENS0001504 95110 +20 1729038 1817765 - lncRNA ENS0001474 1EF70 havana 1.0 ENS0001474 ENS0001474 1EF70 +20 1756756 1758940 - processed_pseudogene ENS0001266 D9191 havana 1.0 ENS0001266 ENS0001266 D9191 +20 1767957 1779995 - transcribed_unitary_pseudogene ENS0000717 51BE3 havana 3.0 ENS0000717 ENS0000717 51BE3 +20 1888128 1894374 - lncRNA ENS0000471 F1AE2 havana 1.0 ENS0000471 ENS0000471 F1AE2 +20 1894167 1940592 + protein_coding ENS0001186 048F1 ensembl_havana 12.0 ENS0001186 ENS0001186 048F1 +20 1947246 2030028 + lncRNA ENS0001479 04CDB havana 2.0 ENS0001479 ENS0001479 04CDB +20 1978757 1994285 - protein_coding ENS0000221 025DD ensembl_havana 9.0 ENS0000221 ENS0000221 025DD +20 2075555 2075822 + processed_pseudogene ENS0000243 96A40 havana 1.0 ENS0000243 ENS0000243 96A40 +20 2101827 2177038 + protein_coding ENS0000019 BA9A2 ensembl_havana 13.0 ENS0000019 ENS0000019 BA9A2 +20 2107900 2109991 - lncRNA ENS0000051 6B634 havana 1.0 ENS0000051 ENS0000051 6B634 +20 2194074 2201329 + lncRNA ENS0000619 55A29 havana_tagene 1.0 ENS0000619 ENS0000619 55A29 +20 2206683 2207397 - lncRNA ENS0000582 21741 havana_tagene 1.0 ENS0000582 ENS0000582 21741 +20 2207217 2213151 + lncRNA ENS0001402 24E15 ensembl_havana 5.0 ENS0001402 ENS0001402 24E15 +20 2207328 2341059 + protein_coding ENS0001269 5A7A4 havana 1.0 ENS0001269 ENS0001269 5A7A4 +20 2296001 2341079 + protein_coding ENS0000490 34B67 ensembl_havana 12.0 ENS0000490 ENS0000490 34B67 +20 2380901 2432753 + protein_coding ENS0000324 05296 ensembl_havana 10.0 ENS0000324 ENS0000324 05296 +20 2453010 2453117 - misc_RNA ENS0000815 A1929 ensembl 1.0 ENS0000815 ENS0000815 A1929 +20 2461634 2470853 - protein_coding ENS0001416 6BC70 ensembl_havana 19.0 ENS0001416 ENS0001416 6BC70 +20 2467212 2508907 - protein_coding ENS0001291 3384F havana 1.0 ENS0001291 ENS0001291 3384F +20 2481817 2524702 - protein_coding ENS0000670 D008E ensembl_havana 13.0 ENS0000670 ENS0000670 D008E +20 2536607 2641784 + protein_coding ENS0000795 7113C ensembl_havana 13.0 ENS0000795 ENS0000795 7113C +20 2652593 2658393 + protein_coding ENS0001264 2A7AB ensembl_havana 17.0 ENS0001264 ENS0001264 2A7AB +20 2652777 2652842 + miRNA ENS0000898 EFDCC mirbase 1.0 ENS0000898 ENS0000898 EFDCC +20 2654212 2654286 + snoRNA ENS0000931 B946F ensembl 1.0 ENS0000931 ENS0000931 B946F +20 2655067 2655198 + snoRNA ENS0000984 8713D ensembl 1.0 ENS0000984 ENS0000984 8713D +20 2656097 2656182 + snoRNA ENS0000786 BE49E ensembl 1.0 ENS0000786 ENS0000786 BE49E +20 2656624 2656694 + snoRNA ENS0000133 55B2F ensembl 1.0 ENS0000133 ENS0000133 55B2F +20 2656939 2657010 + snoRNA ENS0001255 2B998 ensembl 1.0 ENS0001255 ENS0001255 2B998 +20 2658395 2664219 - protein_coding ENS0000438 1AF6D ensembl_havana 21.0 ENS0000438 ENS0000438 1AF6D +20 2664352 2665874 + lncRNA ENS0000296 7A4DC havana 1.0 ENS0000296 ENS0000296 7A4DC +20 2692874 2760108 + protein_coding ENS0001026 E9DA7 ensembl_havana 21.0 ENS0001026 ENS0001026 E9DA7 +20 2780614 2781041 + processed_pseudogene ENS0001283 5C3B0 havana 1.0 ENS0001283 ENS0001283 5C3B0 +20 2794074 2800627 - protein_coding ENS0000040 63A70 ensembl_havana 8.0 ENS0000040 ENS0000040 63A70 +20 2814968 2818066 + protein_coding ENS0000853 D98EB havana 3.0 ENS0000853 ENS0000853 D98EB +20 2814987 2815833 + protein_coding ENS0001257 6EAE4 ensembl_havana 3.0 ENS0001257 ENS0001257 6EAE4 +20 2816302 2820284 + protein_coding ENS0000113 04C78 ensembl_havana 10.0 ENS0000113 ENS0000113 04C78 +20 2835314 2841190 - protein_coding ENS0000625 1D345 ensembl_havana 17.0 ENS0000625 ENS0000625 1D345 +20 2840703 2866732 + protein_coding ENS0000678 0B59C ensembl_havana 10.0 ENS0000678 ENS0000678 0B59C +20 2864184 3039076 + protein_coding ENS0000739 1D971 ensembl_havana 20.0 ENS0000739 ENS0000739 1D971 +20 3043622 3045747 + protein_coding ENS0001303 F60D3 ensembl_havana 11.0 ENS0001303 ENS0001303 F60D3 +20 3046052 3048250 + protein_coding ENS0000422 AF91E ensembl_havana 6.0 ENS0000422 ENS0000422 AF91E +20 3071620 3072517 + protein_coding ENS0000703 027B5 ensembl_havana 3.0 ENS0000703 ENS0000703 027B5 +20 3082556 3084724 - protein_coding ENS0001332 F2874 ensembl_havana 5.0 ENS0001332 ENS0001332 F2874 +20 3094171 3094509 + misc_RNA ENS0001101 73393 ensembl 2.0 ENS0001101 ENS0001101 73393 +20 3106913 3150867 + lncRNA ENS0001335 74EF6 havana 5.0 ENS0001335 ENS0001335 74EF6 +20 3107573 3160196 - protein_coding ENS0001508 0EBFE ensembl_havana 17.0 ENS0001508 ENS0001508 0EBFE +20 3146519 3159865 - protein_coding ENS0001341 003B6 ensembl_havana 4.0 ENS0001341 ENS0001341 003B6 +20 3162617 3173592 - protein_coding ENS0000523 82E9F ensembl_havana 15.0 ENS0000523 ENS0000523 82E9F +20 3190350 3204685 - protein_coding ENS0000494 B9A4E ensembl_havana 13.0 ENS0000494 ENS0000494 B9A4E +20 3208868 3223870 + protein_coding ENS0001159 3BA65 ensembl_havana 13.0 ENS0001159 ENS0001159 3BA65 +20 3227417 3239559 - protein_coding ENS0000233 C81E4 ensembl_havana 14.0 ENS0000233 ENS0000233 C81E4 +20 3239705 3245382 + lncRNA ENS0000681 58C5F havana 1.0 ENS0000681 ENS0000681 58C5F +20 3249305 3407625 - protein_coding ENS0000322 34108 ensembl_havana 12.0 ENS0000322 ENS0000322 34108 +20 3360036 3360142 + snRNA ENS0001387 5A72B ensembl 1.0 ENS0001387 ENS0001387 5A72B +20 3361880 3362324 - processed_pseudogene ENS0000954 CCB48 havana 1.0 ENS0000954 ENS0000954 CCB48 +20 3384229 3384440 + snoRNA ENS0000522 E8A38 ensembl 2.0 ENS0000522 ENS0000522 E8A38 +20 3406380 3410036 + lncRNA ENS0001509 76212 havana 1.0 ENS0001509 ENS0001509 76212 +20 3465177 3465444 - processed_pseudogene ENS0001518 6A1B7 havana 1.0 ENS0001518 ENS0001518 6A1B7 +20 3471018 3651118 + protein_coding ENS0000707 04AFD ensembl_havana 18.0 ENS0000707 ENS0000707 04AFD +20 3489529 3491005 - processed_pseudogene ENS0001105 9F959 havana 1.0 ENS0001105 ENS0001105 9F959 +20 3659248 3663399 - protein_coding ENS0000199 FB6B7 ensembl_havana 16.0 ENS0000199 ENS0000199 FB6B7 +20 3667965 3682246 - protein_coding ENS0001290 8AB97 ensembl_havana 18.0 ENS0001290 ENS0001290 8AB97 +20 3686970 3707128 - protein_coding ENS0000437 52026 ensembl_havana 12.0 ENS0000437 ENS0000437 52026 +20 3732685 3753111 + protein_coding ENS0001391 9C98A ensembl_havana 11.0 ENS0001391 ENS0001391 9C98A +20 3753508 3767781 - protein_coding ENS0000631 833DD ensembl_havana 18.0 ENS0000631 ENS0000631 833DD +20 3777504 3781448 - protein_coding ENS0000267 F16DD ensembl_havana 12.0 ENS0000267 ENS0000267 F16DD +20 3783851 3786740 - protein_coding ENS0000328 E77F6 ensembl_havana 8.0 ENS0000328 ENS0000328 E77F6 +20 3786772 3806121 + protein_coding ENS0000720 447B1 ensembl_havana 17.0 ENS0000720 ENS0000720 447B1 +20 3808357 3812434 + lncRNA ENS0000161 DA9E5 havana 1.0 ENS0000161 ENS0000161 DA9E5 +20 3811384 3823882 + lncRNA ENS0000117 F58BB havana 1.0 ENS0000117 ENS0000117 F58BB +20 3820524 3828838 + protein_coding ENS0000106 7561E ensembl_havana 11.0 ENS0000106 ENS0000106 7561E +20 3846799 3876123 + protein_coding ENS0000508 6D581 ensembl_havana 18.0 ENS0000508 ENS0000508 6D581 +20 3888239 3888868 - lncRNA ENS0001199 0A66D havana 1.0 ENS0001199 ENS0001199 0A66D +20 3888839 3929882 + protein_coding ENS0001316 F8A12 ensembl_havana 22.0 ENS0001316 ENS0001316 F8A12 +20 3917494 3917571 + miRNA ENS0001521 372E3 mirbase 1.0 ENS0001521 ENS0001521 372E3 +20 3917502 3917563 - miRNA ENS0000534 FE749 mirbase 1.0 ENS0000534 ENS0000534 FE749 +20 3921279 3923400 - lncRNA ENS0000469 1B421 havana 1.0 ENS0000469 ENS0000469 1B421 +20 3927309 4015558 - protein_coding ENS0000976 D6C6A ensembl_havana 17.0 ENS0000976 ENS0000976 D6C6A +20 4023917 4024444 + processed_pseudogene ENS0001117 45370 havana 3.0 ENS0001117 ENS0001117 45370 +20 4059633 4060113 - processed_pseudogene ENS0001032 7E9F3 havana 3.0 ENS0001032 ENS0001032 7E9F3 +20 4070152 4075165 - lncRNA ENS0000176 F5E40 havana 3.0 ENS0000176 ENS0000176 F5E40 +20 4120980 4187747 + protein_coding ENS0000872 2198E ensembl_havana 18.0 ENS0000872 ENS0000872 2198E +20 4192932 4195953 + lncRNA ENS0001207 FB2C5 havana_tagene 3.0 ENS0001207 ENS0001207 FB2C5 +20 4220630 4249287 - protein_coding ENS0000251 0C663 ensembl_havana 8.0 ENS0000251 ENS0000251 0C663 +20 4422186 4431732 - lncRNA ENS0001346 E43CC havana 2.0 ENS0001346 ENS0001346 E43CC +20 4475638 4525200 + lncRNA ENS0001319 DDB7B havana 1.0 ENS0001319 ENS0001319 DDB7B +20 4590993 4591300 - processed_pseudogene ENS0000841 6D7E7 havana 1.0 ENS0000841 ENS0000841 6D7E7 +20 4627758 4685148 - transcribed_processed_pseudogene ENS0000845 45596 havana 2.0 ENS0000845 ENS0000845 45596 +20 4686350 4701590 + protein_coding ENS0000104 9EE30 ensembl_havana 17.0 ENS0000104 ENS0000104 9EE30 +20 4721909 4728460 + protein_coding ENS0000278 F5295 ensembl_havana 5.0 ENS0000278 ENS0000278 F5295 +20 4731279 4740668 - lncRNA ENS0001519 338AF havana 10.0 ENS0001519 ENS0001519 338AF +20 4735669 4736673 - processed_pseudogene ENS0001055 C3DD5 havana 1.0 ENS0001055 ENS0001055 C3DD5 +20 4761300 4761696 + lncRNA ENS0000957 54B2B havana 1.0 ENS0000957 ENS0000957 54B2B +20 4780023 4823608 - protein_coding ENS0000982 9F33F ensembl_havana 16.0 ENS0000982 ENS0000982 9F33F +20 4852356 5010293 - protein_coding ENS0000088 1E9BF ensembl_havana 15.0 ENS0000088 ENS0000088 1E9BF +20 4868984 4869118 + snoRNA ENS0000512 B9EA8 ensembl 1.0 ENS0000512 ENS0000512 B9EA8 +20 5013520 5013724 - processed_pseudogene ENS0000372 1895C havana 1.0 ENS0000372 ENS0000372 1895C +20 5045761 5046276 - lncRNA ENS0001366 6C5E5 havana 1.0 ENS0001366 ENS0001366 6C5E5 +20 5049857 5050321 - lncRNA ENS0001363 7FF47 havana 1.0 ENS0001363 ENS0001363 7FF47 +20 5061037 5061340 - lncRNA ENS0001380 F6A02 havana 1.0 ENS0001380 ENS0001380 F6A02 +20 5066082 5068154 - lncRNA ENS0001300 1994E havana 1.0 ENS0001300 ENS0001300 1994E +20 5068232 5113103 - protein_coding ENS0001146 A6D55 ensembl_havana 15.0 ENS0001146 ENS0001146 A6D55 +20 5098387 5098476 - rRNA_pseudogene ENS0000094 31B02 ensembl 1.0 ENS0000094 ENS0000094 31B02 +20 5113993 5114083 - misc_RNA ENS0000888 5380E ensembl 1.0 ENS0000888 ENS0000888 5380E +20 5114953 5126626 - protein_coding ENS0000378 C9122 ensembl_havana 11.0 ENS0000378 ENS0000378 C9122 +20 5121417 5121539 - snoRNA ENS0001067 4477B ensembl 1.0 ENS0001067 ENS0001067 4477B +20 5126879 5197887 + protein_coding ENS0000799 29F12 ensembl_havana 14.0 ENS0000799 ENS0000799 29F12 +20 5292388 5292831 - processed_pseudogene ENS0001134 CE0C4 havana 1.0 ENS0001134 ENS0001134 CE0C4 +20 5299218 5316954 - protein_coding ENS0000247 4B61B ensembl_havana 8.0 ENS0000247 ENS0000247 4B61B +20 5318268 5503189 + lncRNA ENS0000164 DA51D havana 2.0 ENS0000164 ENS0000164 DA51D +20 5346006 5346160 - rRNA_pseudogene ENS0000936 276AD ensembl 1.0 ENS0000936 ENS0000936 276AD +20 5407564 5407876 + lncRNA ENS0000227 4ABB7 havana 1.0 ENS0000227 ENS0000227 4ABB7 +20 5426892 5471094 - lncRNA ENS0000477 2DF30 havana 9.0 ENS0000477 ENS0000477 2DF30 +20 5445838 5475483 + lncRNA ENS0000580 CE2E9 havana 4.0 ENS0000580 ENS0000580 CE2E9 +20 5482736 5526709 - lncRNA ENS0000869 D3181 havana 7.0 ENS0000869 ENS0000869 D3181 +20 5507875 5510002 - lncRNA ENS0000294 1ED84 havana 1.0 ENS0000294 ENS0000294 1ED84 +20 5532878 5533335 + processed_pseudogene ENS0001323 02C68 havana 1.0 ENS0001323 ENS0001323 02C68 +20 5533628 5535889 + lncRNA ENS0000159 28CF3 havana 1.0 ENS0000159 ENS0000159 28CF3 +20 5544439 5611006 - protein_coding ENS0001021 7ABCD ensembl_havana 14.0 ENS0001021 ENS0001021 7ABCD +20 5548840 5549481 + processed_pseudogene ENS0000073 D0D20 havana 1.0 ENS0000073 ENS0000073 D0D20 +20 5750393 5864395 + protein_coding ENS0000713 00307 ensembl_havana 15.0 ENS0000713 ENS0000713 00307 +20 5890055 5890212 + snRNA ENS0000259 FAC07 ensembl 1.0 ENS0000259 ENS0000259 FAC07 +20 5911510 5925353 + protein_coding ENS0000151 8F33D ensembl_havana 10.0 ENS0000151 ENS0000151 8F33D +20 5928085 5929318 - processed_pseudogene ENS0000016 2C6D4 havana 3.0 ENS0000016 ENS0000016 2C6D4 +20 5937228 5950558 - protein_coding ENS0001525 3FC84 ensembl_havana 15.0 ENS0001525 ENS0001525 3FC84 +20 5950652 5998977 + protein_coding ENS0000645 4FB60 ensembl_havana 13.0 ENS0000645 ENS0000645 4FB60 +20 5950949 6039856 + protein_coding ENS0000412 45F41 havana 1.0 ENS0000412 ENS0000412 45F41 +20 5964538 5964650 + misc_RNA ENS0000932 F974F ensembl 1.0 ENS0000932 ENS0000932 F974F +20 5965180 5965511 + processed_pseudogene ENS0001228 D4B7F havana 1.0 ENS0001228 ENS0001228 D4B7F +20 5990943 6005821 - lncRNA ENS0000755 17F1D havana 1.0 ENS0000755 ENS0000755 17F1D +20 6000418 6000941 + lncRNA ENS0000209 F5ADD havana 1.0 ENS0000209 ENS0000209 F5ADD +20 6001733 6002030 + misc_RNA ENS0001400 9F3C3 ensembl 2.0 ENS0001400 ENS0001400 9F3C3 +20 6006093 6040053 + protein_coding ENS0000454 578B5 ensembl_havana 12.0 ENS0000454 ENS0000454 578B5 +20 6040546 6054060 - protein_coding ENS0000830 661CD ensembl_havana 8.0 ENS0000830 ENS0000830 661CD +20 6065966 6067897 - lncRNA ENS0000184 06DDE havana 1.0 ENS0000184 ENS0000184 06DDE +20 6074845 6123030 - protein_coding ENS0001270 CDFE4 ensembl_havana 16.0 ENS0001270 ENS0001270 CDFE4 +20 6200989 6202191 - processed_pseudogene ENS0000387 CCDE9 havana 1.0 ENS0000387 ENS0000387 CCDE9 +20 6213699 6215381 + processed_pseudogene ENS0000632 F231B havana 1.0 ENS0000632 ENS0000632 F231B +20 6219199 6219443 + processed_pseudogene ENS0001259 EA9F0 havana 1.0 ENS0001259 ENS0001259 EA9F0 +20 6220330 6220759 + processed_pseudogene ENS0000041 3353C havana 1.0 ENS0000041 ENS0000041 3353C +20 6446723 6528459 + lncRNA ENS0001057 B4CEF havana 1.0 ENS0001057 ENS0001057 B4CEF +20 6731080 6736326 - lncRNA ENS0000685 F090F havana 2.0 ENS0000685 ENS0000685 F090F +20 6767686 6780246 + protein_coding ENS0000659 A1516 ensembl_havana 7.0 ENS0000659 ENS0000659 A1516 +20 7069614 7146656 + lncRNA ENS0001457 D927D havana 1.0 ENS0001457 ENS0001457 D927D +20 7146467 7254202 - lncRNA ENS0001515 A665E havana 1.0 ENS0001515 ENS0001515 A665E +20 7255053 7258323 - lncRNA ENS0000819 6F682 havana 2.0 ENS0000819 ENS0000819 6F682 +20 7302056 7307432 - lncRNA ENS0001072 B23F8 havana 1.0 ENS0001072 ENS0001072 B23F8 +20 7347451 7367933 + lncRNA ENS0000166 E1178 havana 2.0 ENS0000166 ENS0000166 E1178 +20 7371608 7371692 + miRNA ENS0001180 84510 mirbase 1.0 ENS0001180 ENS0001180 84510 +20 7633562 7633851 - misc_RNA ENS0000069 45C6D ensembl 3.0 ENS0000069 ENS0000069 45C6D +20 7831798 7832322 - processed_pseudogene ENS0000537 ABF74 havana 2.0 ENS0000537 ENS0000537 ABF74 +20 7867327 7867614 + processed_pseudogene ENS0001276 0AA43 havana 1.0 ENS0001276 ENS0001276 0AA43 +20 7882985 7940458 - protein_coding ENS0001336 3BA51 ensembl_havana 5.0 ENS0001336 ENS0001336 3BA51 +20 7977346 8019805 - protein_coding ENS0000445 D035C ensembl_havana 9.0 ENS0000445 ENS0000445 D035C +20 8019180 8043512 + lncRNA ENS0001534 0A117 havana 7.0 ENS0001534 ENS0001534 0A117 +20 8077251 8968360 + protein_coding ENS0001185 73A5C ensembl_havana 18.0 ENS0001185 ENS0001185 73A5C +20 8097824 8099774 - processed_pseudogene ENS0001340 9D2C5 havana 1.0 ENS0001340 ENS0001340 9D2C5 +20 8248704 8256918 + lncRNA ENS0000552 00D64 havana 1.0 ENS0000552 ENS0000552 00D64 +20 8831186 8831393 + snoRNA ENS0000677 65ED7 ensembl 1.0 ENS0000677 ENS0000677 65ED7 +20 8929895 8930038 - processed_pseudogene ENS0000146 13631 havana 1.0 ENS0000146 ENS0000146 13631 +20 8934984 8935481 - processed_pseudogene ENS0001390 B387A havana 1.0 ENS0001390 ENS0001390 B387A +20 8998849 8999100 + misc_RNA ENS0000975 BCE38 ensembl 1.0 ENS0000975 ENS0000975 BCE38 +20 9068763 9481242 + protein_coding ENS0001049 52452 ensembl_havana 17.0 ENS0001049 ENS0001049 52452 +20 9505180 9514998 - lncRNA ENS0000417 3E72E havana 1.0 ENS0000417 ENS0000417 3E72E +20 9514358 9530524 + protein_coding ENS0001524 05207 ensembl_havana 10.0 ENS0001524 ENS0001524 05207 +20 9537370 9839076 - protein_coding ENS0001289 87711 ensembl_havana 17.0 ENS0001289 ENS0001289 87711 +20 9562941 9571257 + lncRNA ENS0001028 D0C17 havana_tagene 1.0 ENS0001028 ENS0001028 D0C17 +20 9575608 9577689 + lncRNA ENS0000150 AE309 havana 1.0 ENS0000150 ENS0000150 AE309 +20 9835556 9873900 + lncRNA ENS0001173 19D34 havana_tagene 1.0 ENS0001173 ENS0001173 19D34 +20 9986088 10007116 + lncRNA ENS0000352 86923 havana 2.0 ENS0000352 ENS0000352 86923 +20 9986126 10058303 + protein_coding ENS0000203 2035A ensembl_havana 16.0 ENS0000203 ENS0000203 2035A +20 10006381 10368776 - lncRNA ENS0000430 B58DD havana 8.0 ENS0000430 ENS0000430 B58DD +20 10025917 10026168 + misc_RNA ENS0001079 2D121 ensembl 1.0 ENS0001079 ENS0001079 2D121 +20 10172522 10200824 + lncRNA ENS0000334 5DBF8 havana 2.0 ENS0000334 ENS0000334 5DBF8 +20 10172701 10186740 + lncRNA ENS0001514 111CF havana_tagene 2.0 ENS0001514 ENS0001514 111CF +20 10218830 10307418 + protein_coding ENS0001102 135E9 ensembl_havana 13.0 ENS0001102 ENS0001102 135E9 +20 10317889 10318753 - lncRNA ENS0000239 66BDB havana_tagene 1.0 ENS0000239 ENS0000239 66BDB +20 10334419 10334698 + processed_pseudogene ENS0001360 92C02 havana 1.0 ENS0001360 ENS0001360 92C02 +20 10349157 10349628 - processed_pseudogene ENS0000585 19211 havana 3.0 ENS0000585 ENS0000585 19211 +20 10385779 10387806 + processed_pseudogene ENS0000212 522EC havana 1.0 ENS0000212 ENS0000212 522EC +20 10401009 10434222 - protein_coding ENS0001251 8012A ensembl_havana 20.0 ENS0001251 ENS0001251 8012A +20 10413520 10431922 - protein_coding ENS0000598 59FE3 havana 1.0 ENS0000598 ENS0000598 59FE3 +20 10420546 10420737 - protein_coding ENS0001104 CE242 havana 1.0 ENS0001104 ENS0001104 CE242 +20 10435305 10636829 + protein_coding ENS0001317 7B255 ensembl_havana 15.0 ENS0001317 ENS0001317 7B255 +20 10612861 10614229 - processed_pseudogene ENS0000541 96B11 havana 4.0 ENS0000541 ENS0000541 96B11 +20 10637684 10673999 - protein_coding ENS0000446 3E841 ensembl_havana 12.0 ENS0000446 ENS0000446 3E841 +20 10649636 10649695 - miRNA ENS0000602 B37D2 mirbase 1.0 ENS0000602 ENS0000602 B37D2 +20 10672695 10994924 + lncRNA ENS0001285 D360D havana 6.0 ENS0001285 ENS0001285 D360D +20 10696949 10754033 + lncRNA ENS0001007 A7675 havana 3.0 ENS0001007 ENS0001007 A7675 +20 10753278 10765286 - lncRNA ENS0001453 DA21F havana 1.0 ENS0001453 ENS0001453 DA21F +20 10875333 10909279 - lncRNA ENS0001206 D4AC8 havana_tagene 2.0 ENS0001206 ENS0001206 D4AC8 +20 10888582 10889088 - processed_pseudogene ENS0001009 01B8F havana 1.0 ENS0001009 ENS0001009 01B8F +20 10996293 11029455 + lncRNA ENS0001532 6BBDE havana_tagene 8.0 ENS0001532 ENS0001532 6BBDE +20 11001304 11001763 + lncRNA ENS0001209 876A2 havana 1.0 ENS0001209 ENS0001209 876A2 +20 11234170 11301525 - lncRNA ENS0000276 EE833 havana 2.0 ENS0000276 ENS0000276 EE833 +20 11345383 11345855 - processed_pseudogene ENS0000794 55220 havana 1.0 ENS0000794 ENS0000794 55220 +20 11536406 11541647 - lncRNA ENS0001512 01796 havana 1.0 ENS0001512 ENS0001512 01796 +20 11566484 11581335 - lncRNA ENS0000754 C0E83 havana_tagene 1.0 ENS0000754 ENS0000754 C0E83 +20 11608436 11609048 - processed_pseudogene ENS0000134 1E213 havana 1.0 ENS0000134 ENS0000134 1E213 +20 11621051 11621477 + processed_pseudogene ENS0000913 7B42C havana 1.0 ENS0000913 ENS0000913 7B42C +20 11685144 11687323 - lncRNA ENS0001197 BC73B havana 1.0 ENS0001197 ENS0001197 BC73B +20 11800463 11878429 - lncRNA ENS0000920 F8546 havana 5.0 ENS0000920 ENS0000920 F8546 +20 11818821 11819091 - misc_RNA ENS0000196 15E3B ensembl 1.0 ENS0000196 ENS0000196 15E3B +20 11838355 11838574 + processed_pseudogene ENS0001200 88041 havana 1.0 ENS0001200 ENS0001200 88041 +20 11890723 11926609 + protein_coding ENS0000843 B82E6 ensembl_havana 15.0 ENS0000843 ENS0000843 B82E6 +20 11909404 11918677 - lncRNA ENS0000456 AEABA havana 1.0 ENS0000456 ENS0000456 AEABA +20 12243308 12318221 - lncRNA ENS0001035 A1E56 havana 2.0 ENS0001035 ENS0001035 A1E56 +20 12305613 12316485 - lncRNA ENS0000064 9E166 havana 1.0 ENS0000064 ENS0000064 9E166 +20 12380056 12381255 + processed_pseudogene ENS0000540 638E1 havana 1.0 ENS0000540 ENS0000540 638E1 +20 12865202 12952519 - lncRNA ENS0001475 AC56B havana 1.0 ENS0001475 ENS0001475 AC56B +20 12934877 12937097 - lncRNA ENS0001343 B2F03 havana 2.0 ENS0001343 ENS0001343 B2F03 +20 12936262 12940902 + lncRNA ENS0000686 FC02B havana 1.0 ENS0000686 ENS0000686 FC02B +20 12950288 13008417 + lncRNA ENS0001094 EADBC havana 2.0 ENS0001094 ENS0001094 EADBC +20 13008972 13169103 + protein_coding ENS0000398 6EA76 ensembl_havana 13.0 ENS0000398 ENS0000398 6EA76 +20 13073667 13074312 - processed_pseudogene ENS0000432 75E3D havana 1.0 ENS0000432 ENS0000432 75E3D +20 13221274 13300651 + protein_coding ENS0000121 D0821 ensembl_havana 6.0 ENS0000121 ENS0000121 D0821 +20 13237801 13239674 - lncRNA ENS0001053 A3417 havana 1.0 ENS0001053 ENS0001053 A3417 +20 13244064 13245369 - lncRNA ENS0001162 35C78 havana 1.0 ENS0001162 ENS0001162 35C78 +20 13368291 13368703 - lncRNA ENS0000860 753CA havana 1.0 ENS0000860 ENS0000860 753CA +20 13389392 13638932 - protein_coding ENS0001510 8A157 ensembl_havana 16.0 ENS0001510 ENS0001510 8A157 +20 13392677 13393674 + processed_pseudogene ENS0000098 BF5AE havana 1.0 ENS0000098 ENS0000098 BF5AE +20 13714322 13784886 - protein_coding ENS0001377 3861B ensembl_havana 14.0 ENS0001377 ENS0001377 3861B +20 13785007 13821580 + protein_coding ENS0000165 1F4AE ensembl_havana 18.0 ENS0000165 ENS0000165 1F4AE +20 13849247 13996443 - protein_coding ENS0000658 AE130 ensembl_havana 13.0 ENS0000658 ENS0000658 AE130 +20 13973316 13973424 - snRNA ENS0000157 A844A ensembl 1.0 ENS0000157 ENS0000157 A844A +20 13995369 16053197 + protein_coding ENS0000210 FA91D ensembl_havana 17.0 ENS0000210 ENS0000210 FA91D +20 14003508 14003699 + processed_pseudogene ENS0001425 65E85 havana 1.0 ENS0001425 ENS0001425 65E85 +20 14127400 14128336 + processed_pseudogene ENS0000085 2234D havana 1.0 ENS0000085 ENS0000085 2234D +20 14223041 14223325 + misc_RNA ENS0001254 96536 ensembl 3.0 ENS0001254 ENS0001254 96536 +20 14322985 14337614 - protein_coding ENS0000330 CD310 ensembl_havana 10.0 ENS0000330 ENS0000330 CD310 +20 14352320 14352425 + snRNA ENS0000785 878BF ensembl 1.0 ENS0000785 ENS0000785 878BF +20 14522081 14523314 - lncRNA ENS0000885 FBF0E havana_tagene 1.0 ENS0000885 ENS0000885 FBF0E +20 14547184 14547623 - processed_pseudogene ENS0000660 5099F havana 1.0 ENS0000660 ENS0000660 5099F +20 14554384 14636524 + lncRNA ENS0000070 5788D havana 2.0 ENS0000070 ENS0000070 5788D +20 14757563 14758056 + processed_pseudogene ENS0000415 DC9E7 havana 2.0 ENS0000415 ENS0000415 DC9E7 +20 14884250 14929528 - lncRNA ENS0000938 55B2F havana 2.0 ENS0000938 ENS0000938 55B2F +20 14933843 14935363 - lncRNA ENS0000965 4E6AE havana 1.0 ENS0000965 ENS0000965 4E6AE +20 15021553 15021659 - snRNA ENS0000917 138C8 ensembl 1.0 ENS0000917 ENS0000917 138C8 +20 15022852 15022958 + snRNA ENS0000787 BE47B ensembl 1.0 ENS0000787 ENS0000787 BE47B +20 15280764 15280873 - rRNA_pseudogene ENS0000083 B5006 ensembl 1.0 ENS0000083 ENS0000083 B5006 +20 15552157 15552885 - lncRNA ENS0001220 9350A havana 1.0 ENS0001220 ENS0001220 9350A +20 15619494 15619819 - processed_pseudogene ENS0001357 BEA95 havana 1.0 ENS0001357 ENS0001357 BEA95 +20 15892355 15985876 - lncRNA ENS0001352 8FB64 havana_tagene 1.0 ENS0001352 ENS0001352 8FB64 +20 16177933 16259603 - lncRNA ENS0000950 AA3AB havana_tagene 1.0 ENS0000950 ENS0000950 AA3AB +20 16223750 16224535 - processed_pseudogene ENS0001349 AE12B havana 1.0 ENS0001349 ENS0001349 AE12B +20 16272104 16573448 - protein_coding ENS0000667 A6E19 ensembl_havana 19.0 ENS0000667 ENS0000667 A6E19 +20 16576068 16579615 + lncRNA ENS0000824 826F4 havana 1.0 ENS0000824 ENS0000824 826F4 +20 16580258 16580622 + lncRNA ENS0000934 5A1E4 havana 1.0 ENS0000934 ENS0000934 5A1E4 +20 16586333 16586693 - processed_pseudogene ENS0000466 986E0 havana 2.0 ENS0000466 ENS0000466 986E0 +20 16670641 16670742 - misc_RNA ENS0001379 A8FB7 ensembl 1.0 ENS0001379 ENS0001379 A8FB7 +20 16684108 16684404 - processed_pseudogene ENS0000725 E6B06 havana 2.0 ENS0000725 ENS0000725 E6B06 +20 16714844 16730948 - lncRNA ENS0001011 E5EC3 havana 2.0 ENS0001011 ENS0001011 E5EC3 +20 16729961 16742564 + protein_coding ENS0001275 8121B ensembl_havana 11.0 ENS0001275 ENS0001275 8121B +20 16748358 16770062 + protein_coding ENS0000059 CD284 ensembl_havana 5.0 ENS0000059 ENS0000059 CD284 +20 16857962 16864148 - lncRNA ENS0000163 A6AAA havana 1.0 ENS0000163 ENS0000163 A6AAA +20 16982831 16983033 - snoRNA ENS0000724 3A211 ensembl 2.0 ENS0000724 ENS0000724 3A211 +20 17187510 17187616 + snRNA ENS0000063 73BC6 ensembl 1.0 ENS0000063 ENS0000063 73BC6 +20 17188134 17188344 + processed_pseudogene ENS0001018 B0F14 havana 1.0 ENS0001018 ENS0001018 B0F14 +20 17209060 17209220 + snRNA ENS0000384 59AB1 ensembl 1.0 ENS0000384 ENS0000384 59AB1 +20 17216779 17221913 - lncRNA ENS0000406 7B4DA havana_tagene 1.0 ENS0000406 ENS0000406 7B4DA +20 17222851 17226146 - lncRNA ENS0000308 0F598 havana_tagene 1.0 ENS0000308 ENS0000308 0F598 +20 17226107 17484578 + protein_coding ENS0000124 D32EC ensembl_havana 10.0 ENS0000124 ENS0000124 D32EC +20 17479085 17479431 - processed_pseudogene ENS0001313 8154C havana 1.0 ENS0001313 ENS0001313 8154C +20 17493905 17569220 - protein_coding ENS0001187 EA474 ensembl_havana 14.0 ENS0001187 ENS0001187 EA474 +20 17516843 17517307 + processed_pseudogene ENS0001314 F0DE9 havana 1.0 ENS0001314 ENS0001314 F0DE9 +20 17565501 17565851 - lncRNA ENS0001040 AB5CA havana 1.0 ENS0001040 ENS0001040 AB5CA +20 17570075 17609919 + protein_coding ENS0001462 11D5E ensembl_havana 16.0 ENS0001462 ENS0001462 11D5E +20 17576206 17576510 - misc_RNA ENS0000612 D2371 ensembl 1.0 ENS0000612 ENS0000612 D2371 +20 17599255 17599807 - processed_pseudogene ENS0000553 05479 havana 1.0 ENS0000553 ENS0000553 05479 +20 17613678 17682295 - protein_coding ENS0000045 61FAC ensembl_havana 16.0 ENS0000045 ENS0000045 61FAC +20 17693672 17735871 + protein_coding ENS0000609 E3559 ensembl_havana 15.0 ENS0000609 ENS0000609 E3559 +20 17887363 17888160 - lncRNA ENS0001399 C8EE4 havana 1.0 ENS0001399 ENS0001399 C8EE4 +20 17903762 17903861 + snRNA ENS0001157 8A755 ensembl 1.0 ENS0001157 ENS0001157 8A755 +20 17941597 17968980 - protein_coding ENS0000004 DB0B0 ensembl_havana 17.0 ENS0000004 ENS0000004 DB0B0 +20 17956979 18059188 - protein_coding ENS0000258 9305F ensembl_havana 11.0 ENS0000258 ENS0000258 9305F +20 17962710 17962946 - snoRNA ENS0001409 D0C0D ensembl 1.0 ENS0001409 ENS0001409 D0C0D +20 17969018 17991122 + protein_coding ENS0000392 D37F0 ensembl_havana 14.0 ENS0000392 ENS0000392 D37F0 +20 18011955 18012284 + processed_pseudogene ENS0001129 3B024 havana 3.0 ENS0001129 ENS0001129 3B024 +20 18059493 18071008 + lncRNA ENS0000368 31342 havana 1.0 ENS0000368 ENS0000368 31342 +20 18090116 18090715 - processed_pseudogene ENS0000068 87C38 havana 1.0 ENS0000068 ENS0000068 87C38 +20 18095571 18095632 - snRNA ENS0001090 69076 ensembl 1.0 ENS0001090 ENS0001090 69076 +20 18113467 18113579 - misc_RNA ENS0001158 E0F0B ensembl 1.0 ENS0001158 ENS0001158 E0F0B +20 18137228 18188387 + protein_coding ENS0001273 0557D ensembl_havana 14.0 ENS0001273 ENS0001273 0557D +20 18137863 18143169 + protein_coding ENS0001421 FB081 ensembl_havana 4.0 ENS0001421 ENS0001421 FB081 +20 18220113 18220512 + processed_pseudogene ENS0000594 B8067 havana 1.0 ENS0000594 ENS0000594 B8067 +20 18258554 18258851 - misc_RNA ENS0001091 3837C ensembl 2.0 ENS0001091 ENS0001091 3837C +20 18265690 18265879 - snRNA ENS0000688 47916 ensembl 1.0 ENS0000688 ENS0000688 47916 +20 18270630 18271069 - lncRNA ENS0000361 CD28F havana_tagene 1.0 ENS0000361 ENS0000361 CD28F +20 18288283 18316996 + protein_coding ENS0000831 9503D ensembl_havana 15.0 ENS0000831 ENS0000831 9503D +20 18289343 18359300 - lncRNA ENS0001272 3587C havana 2.0 ENS0001272 ENS0001272 3587C +20 18295143 18295451 - misc_RNA ENS0000179 1B650 ensembl 1.0 ENS0000179 ENS0000179 1B650 +20 18313765 18315111 - lncRNA ENS0000345 FC63C havana 1.0 ENS0000345 ENS0000345 FC63C +20 18379049 18381484 + lncRNA ENS0000320 4E0F9 ensembl_havana 3.0 ENS0000320 ENS0000320 4E0F9 +20 18383367 18467281 - protein_coding ENS0001024 0B0C1 ensembl_havana 16.0 ENS0001024 ENS0001024 0B0C1 +20 18420160 18421454 - processed_pseudogene ENS0001045 EF43F havana 1.0 ENS0001045 ENS0001045 EF43F +20 18433842 18433944 + rRNA_pseudogene ENS0000671 02E34 ensembl 1.0 ENS0000671 ENS0000671 02E34 +20 18467389 18484646 + protein_coding ENS0000634 760F1 ensembl_havana 12.0 ENS0000634 ENS0000634 760F1 +20 18470615 18470691 + miRNA ENS0000818 D03D7 mirbase 3.0 ENS0000818 ENS0000818 D03D7 +20 18485265 18486052 + processed_pseudogene ENS0000323 A1446 havana 1.0 ENS0000323 ENS0000323 A1446 +20 18486540 18497225 - protein_coding ENS0001092 8FE0E ensembl_havana 16.0 ENS0001092 ENS0001092 8FE0E +20 18504633 18505066 + processed_pseudogene ENS0001424 C68A1 havana 3.0 ENS0001424 ENS0001424 C68A1 +20 18507520 18561415 + protein_coding ENS0001143 EE78E ensembl_havana 17.0 ENS0001143 ENS0001143 EE78E +20 18567347 18569563 + protein_coding ENS0000458 B4FB1 havana 5.0 ENS0000458 ENS0000458 B4FB1 +20 18567453 18744216 + protein_coding ENS0001093 4571E havana 1.0 ENS0001093 ENS0001093 4571E +20 18587942 18766644 + protein_coding ENS0000789 95A9F ensembl_havana 13.0 ENS0000789 ENS0000789 95A9F +20 18592330 18592619 + misc_RNA ENS0000682 1119C ensembl 3.0 ENS0000682 ENS0000682 1119C +20 18611047 18611167 - snRNA ENS0001204 D2C94 ensembl 1.0 ENS0001204 ENS0001204 D2C94 +20 18642782 18643634 - processed_pseudogene ENS0001174 78793 havana 2.0 ENS0001174 ENS0001174 78793 +20 18674395 18698709 - lncRNA ENS0001196 05DCB havana 1.0 ENS0001196 ENS0001196 05DCB +20 18782424 18783690 - processed_pseudogene ENS0000806 80B3A havana 1.0 ENS0000806 ENS0000806 80B3A +20 18786065 18794579 - lncRNA ENS0000416 A17BB havana 10.0 ENS0000416 ENS0000416 A17BB +20 18794529 18796067 + lncRNA ENS0001492 7995A havana 1.0 ENS0001492 ENS0001492 7995A +20 18809728 18830153 - lncRNA ENS0000822 7F6ED ensembl_havana 7.0 ENS0000822 ENS0000822 7F6ED +20 18813783 18814378 + protein_coding ENS0000527 E267C ensembl_havana 6.0 ENS0000527 ENS0000527 E267C +20 19000709 19056796 - lncRNA ENS0000529 184A9 havana 1.0 ENS0000529 ENS0000529 184A9 +20 19208652 19212164 - lncRNA ENS0000290 EA1B5 havana_tagene 1.0 ENS0000290 ENS0000290 EA1B5 +20 19212642 19722926 + protein_coding ENS0000377 A31AD ensembl_havana 12.0 ENS0000377 ENS0000377 A31AD +20 19212852 19213477 - lncRNA ENS0001081 47C18 havana 1.0 ENS0001081 ENS0001081 47C18 +20 19242302 19284596 - lncRNA ENS0000116 98230 havana 2.0 ENS0000116 ENS0000116 98230 +20 19693209 19697576 - lncRNA ENS0000947 602E4 havana 7.0 ENS0000947 ENS0000947 602E4 +20 19756390 19758037 - lncRNA ENS0000736 23DE5 havana 2.0 ENS0000736 ENS0000736 23DE5 +20 19757606 20002459 + protein_coding ENS0000145 D4155 ensembl_havana 13.0 ENS0000145 ENS0000145 D4155 +20 19823443 19823943 - processed_pseudogene ENS0001244 976A6 havana 2.0 ENS0001244 ENS0001244 976A6 +20 20017310 20033655 + protein_coding ENS0000994 088FA ensembl_havana 12.0 ENS0000994 ENS0000994 088FA +20 20034368 20056046 - protein_coding ENS0001163 EA1C0 ensembl_havana 15.0 ENS0001163 ENS0001163 EA1C0 +20 20052514 20360703 + protein_coding ENS0000158 95219 ensembl_havana 19.0 ENS0000158 ENS0000158 95219 +20 20094401 20095684 - lncRNA ENS0000246 45B21 havana 1.0 ENS0000246 ENS0000246 45B21 +20 20214299 20215262 - lncRNA ENS0001059 F7725 havana 1.0 ENS0001059 ENS0001059 F7725 +20 20253303 20253838 - processed_pseudogene ENS0001116 1651B havana 1.0 ENS0001116 ENS0001116 1651B +20 20258407 20267809 - lncRNA ENS0001294 B6F94 havana 1.0 ENS0001294 ENS0001294 B6F94 +20 20316755 20316994 + misc_RNA ENS0001292 4AC1F ensembl 2.0 ENS0001292 ENS0001292 4AC1F +20 20363691 20365657 + TEC ENS0001348 ACB93 havana 1.0 ENS0001348 ENS0001348 ACB93 +20 20368104 20370949 + protein_coding ENS0000641 28601 ensembl_havana 5.0 ENS0000641 ENS0000641 28601 +20 20389530 20712644 - protein_coding ENS0000530 75FCE ensembl_havana 15.0 ENS0000530 ENS0000530 75FCE +20 20659710 20659964 + processed_pseudogene ENS0000268 0EE8E havana 2.0 ENS0000268 ENS0000268 0EE8E +20 20721187 20721879 + processed_pseudogene ENS0000930 CE8A7 havana 1.0 ENS0000930 ENS0000930 CE8A7 +20 20738433 20738731 - misc_RNA ENS0000095 159B5 ensembl 1.0 ENS0000095 ENS0000095 159B5 +20 20854121 20854642 - processed_pseudogene ENS0001087 1B7D8 havana 1.0 ENS0001087 ENS0001087 1B7D8 +20 20970448 20972562 + lncRNA ENS0001184 A70A5 havana 1.0 ENS0001184 ENS0001184 A70A5 +20 21085576 21106514 - lncRNA ENS0000793 08DE1 havana 3.0 ENS0000793 ENS0000793 08DE1 +20 21089696 21092539 + lncRNA ENS0000235 7BD93 havana 2.0 ENS0000235 ENS0000235 7BD93 +20 21114723 21115197 + processed_pseudogene ENS0001145 74937 havana 1.0 ENS0001145 ENS0001145 74937 +20 21125983 21246622 + protein_coding ENS0000779 31E13 ensembl_havana 16.0 ENS0000779 ENS0000779 31E13 +20 21138956 21139074 - rRNA_pseudogene ENS0000859 809D2 ensembl 1.0 ENS0000859 ENS0000859 809D2 +20 21148741 21162890 - lncRNA ENS0000112 1EF6C havana 3.0 ENS0000112 ENS0000112 1EF6C +20 21154023 21218289 - lncRNA ENS0000002 4C98F havana 6.0 ENS0000002 ENS0000002 4C98F +20 21166206 21166596 - processed_pseudogene ENS0000333 A9F75 havana 3.0 ENS0000333 ENS0000333 A9F75 +20 21265854 21266465 + unprocessed_pseudogene ENS0000767 44A30 havana 1.0 ENS0000767 ENS0000767 44A30 +20 21302731 21303704 - lncRNA ENS0000223 780D8 havana 1.0 ENS0000223 ENS0000223 780D8 +20 21303331 21389825 + protein_coding ENS0000646 C6962 ensembl_havana 8.0 ENS0000646 ENS0000646 C6962 +20 21395365 21397526 - protein_coding ENS0000402 EB31E ensembl_havana 5.0 ENS0000402 ENS0000402 EB31E +20 21397740 21400391 + lncRNA ENS0000743 6A387 havana 7.0 ENS0000743 ENS0000743 6A387 +20 21424758 21425041 - misc_RNA ENS0001414 95D80 ensembl 1.0 ENS0001414 ENS0001414 95D80 +20 21488244 21488625 - processed_pseudogene ENS0000613 C7FCE havana 1.0 ENS0000613 ENS0000613 C7FCE +20 21499261 21502934 - lncRNA ENS0000238 6B0F4 havana 1.0 ENS0000238 ENS0000238 6B0F4 +20 21501395 21501968 + processed_pseudogene ENS0000780 1DA3A havana 1.0 ENS0000780 ENS0000780 1DA3A +20 21511017 21514064 - protein_coding ENS0000181 E6F89 ensembl_havana 6.0 ENS0000181 ENS0000181 E6F89 +20 21511447 21512309 + lncRNA ENS0000072 D2B5C havana 1.0 ENS0000072 ENS0000072 D2B5C +20 21530122 21535665 + lncRNA ENS0000679 8041F havana 2.0 ENS0000679 ENS0000679 8041F +20 21569976 21666621 + lncRNA ENS0000484 06CCD havana 4.0 ENS0000484 ENS0000484 06CCD +20 21610787 21703585 - lncRNA ENS0000838 06FE6 havana 1.0 ENS0000838 ENS0000838 06FE6 +20 21614978 21615435 + misc_RNA ENS0000114 3D9FE ensembl 1.0 ENS0000114 ENS0000114 3D9FE +20 21705659 21718486 + protein_coding ENS0001408 08F2A ensembl_havana 13.0 ENS0001408 ENS0001408 08F2A +20 21742303 21743077 - processed_pseudogene ENS0000171 A4E04 havana 1.0 ENS0000171 ENS0000171 A4E04 +20 21755270 21755350 + processed_pseudogene ENS0000581 9BE0B havana 5.0 ENS0000581 ENS0000581 9BE0B +20 21947647 21970783 + lncRNA ENS0000623 ADDDB havana 1.0 ENS0000623 ENS0000623 ADDDB +20 22054074 22074654 + lncRNA ENS0000535 349FB havana_tagene 3.0 ENS0000535 ENS0000535 349FB +20 22220554 22223283 + lncRNA ENS0001497 F74FC havana 1.0 ENS0001497 ENS0001497 F74FC +20 22263065 22284029 - lncRNA ENS0000336 1257F havana 2.0 ENS0000336 ENS0000336 1257F +20 22272052 22273454 - processed_pseudogene ENS0001141 952F7 havana 2.0 ENS0001141 ENS0001141 952F7 +20 22343693 22343930 - processed_pseudogene ENS0000135 0924E havana 1.0 ENS0000135 ENS0000135 0924E +20 22371216 22471557 - lncRNA ENS0000821 F9852 havana 4.0 ENS0000821 ENS0000821 F9852 +20 22547671 22578642 - lncRNA ENS0000747 5CA50 havana 3.0 ENS0000747 ENS0000747 5CA50 +20 22580998 22585455 - protein_coding ENS0000374 44057 ensembl_havana 15.0 ENS0000374 ENS0000374 44057 +20 22587522 22607517 - lncRNA ENS0000156 1F226 havana 1.0 ENS0000156 ENS0000156 1F226 +20 22668480 22685344 - lncRNA ENS0000997 F1BBC havana 3.0 ENS0000997 ENS0000997 F1BBC +20 22684958 22746991 + lncRNA ENS0000375 CB82C havana 3.0 ENS0000375 ENS0000375 CB82C +20 22732802 22734089 - processed_pseudogene ENS0000280 07A2A havana 3.0 ENS0000280 ENS0000280 07A2A +20 22885670 22886068 - processed_pseudogene ENS0001240 6E447 havana 2.0 ENS0001240 ENS0001240 6E447 +20 22960534 22966063 - processed_pseudogene ENS0001085 023E2 havana 1.0 ENS0001085 ENS0001085 023E2 +20 23010209 23030785 - lncRNA ENS0001489 C68FC havana 1.0 ENS0001489 ENS0001489 C68FC +20 23030863 23039241 + lncRNA ENS0000009 DF5D9 havana 7.0 ENS0000009 ENS0000009 DF5D9 +20 23035312 23039237 + protein_coding ENS0000143 03A07 ensembl_havana 6.0 ENS0000143 ENS0000143 03A07 +20 23045633 23049672 - protein_coding ENS0000801 F932A ensembl_havana 7.0 ENS0000801 ENS0000801 F932A +20 23071547 23072722 - TEC ENS0001437 68BD2 havana 1.0 ENS0001437 ENS0001437 68BD2 +20 23079360 23086324 - protein_coding ENS0000531 A3C62 ensembl_havana 10.0 ENS0000531 ENS0000531 A3C62 +20 23125068 23132635 - lncRNA ENS0000499 F6121 havana_tagene 2.0 ENS0000499 ENS0000499 F6121 +20 23148086 23149939 + TEC ENS0000191 1D4D9 havana 1.0 ENS0000191 ENS0000191 1D4D9 +20 23160857 23160971 - rRNA_pseudogene ENS0001384 929CC ensembl 1.0 ENS0001384 ENS0001384 929CC +20 23179272 23190419 + lncRNA ENS0000731 E5957 havana 4.0 ENS0000731 ENS0000731 E5957 +20 23320958 23325352 - lncRNA ENS0000775 15D88 havana 1.0 ENS0000775 ENS0000775 15D88 +20 23346941 23351486 - lncRNA ENS0000788 46277 havana 1.0 ENS0000788 ENS0000788 46277 +20 23350791 23354771 + protein_coding ENS0001195 84849 ensembl_havana 4.0 ENS0001195 ENS0001195 84849 +20 23356594 23358116 - lncRNA ENS0001001 2A541 havana 5.0 ENS0001001 ENS0001001 2A541 +20 23362182 23373062 + protein_coding ENS0000162 A7729 ensembl_havana 16.0 ENS0000162 ENS0000162 A7729 +20 23374519 23421519 - protein_coding ENS0001410 A5AC5 ensembl_havana 17.0 ENS0001410 ENS0001410 A5AC5 +20 23380896 23381012 - rRNA_pseudogene ENS0000464 A9D60 ensembl 1.0 ENS0000464 ENS0000464 A9D60 +20 23439685 23444930 + protein_coding ENS0000573 632D0 ensembl_havana 12.0 ENS0000573 ENS0000573 632D0 +20 23450403 23452876 - protein_coding ENS0000751 5E2A1 ensembl_havana 10.0 ENS0000751 ENS0000751 5E2A1 +20 23452623 23461207 + lncRNA ENS0001126 06247 havana 2.0 ENS0001126 ENS0001126 06247 +20 23457584 23457685 - misc_RNA ENS0001142 71BA0 ensembl 1.0 ENS0001142 ENS0001142 71BA0 +20 23475700 23479798 + unprocessed_pseudogene ENS0000335 28406 havana 1.0 ENS0000335 ENS0000335 28406 +20 23481645 23519054 - lncRNA ENS0000829 14752 havana 2.0 ENS0000829 ENS0000829 14752 +20 23489416 23489973 - lncRNA ENS0000526 1FFC7 havana 1.0 ENS0000526 ENS0000526 1FFC7 +20 23491101 23496010 + protein_coding ENS0001086 FA95F ensembl_havana 9.0 ENS0001086 ENS0001086 FA95F +20 23519146 23542018 + transcribed_unprocessed_pseudogene ENS0001122 5F9D8 havana 9.0 ENS0001122 ENS0001122 5F9D8 +20 23547764 23550652 - unprocessed_pseudogene ENS0000423 27886 havana 2.0 ENS0000423 ENS0000423 27886 +20 23564732 23568484 - protein_coding ENS0000687 7B485 ensembl_havana 5.0 ENS0000687 ENS0000687 7B485 +20 23583645 23586510 + unprocessed_pseudogene ENS0000551 2F9E1 havana 5.0 ENS0000551 ENS0000551 2F9E1 +20 23602410 23605917 - protein_coding ENS0000366 1493E ensembl_havana 5.0 ENS0000366 ENS0000366 1493E +20 23626706 23638473 - protein_coding ENS0001486 F700E ensembl_havana 9.0 ENS0001486 ENS0001486 F700E +20 23631826 23632316 - lncRNA ENS0000503 BB81F havana 1.0 ENS0000503 ENS0000503 BB81F +20 23655225 23656390 - lncRNA ENS0000370 EB307 havana 1.0 ENS0000370 ENS0000370 EB307 +20 23656151 23674888 + lncRNA ENS0001147 AB94B havana 2.0 ENS0001147 ENS0001147 AB94B +20 23685640 23689040 - protein_coding ENS0000854 964AE ensembl_havana 4.0 ENS0000854 ENS0000854 964AE +20 23711320 23711406 - unprocessed_pseudogene ENS0000274 20524 havana 1.0 ENS0000274 ENS0000274 20524 +20 23722589 23745524 + lncRNA ENS0000515 A223E havana 2.0 ENS0000515 ENS0000515 A223E +20 23747553 23751268 - protein_coding ENS0000963 409A1 ensembl_havana 8.0 ENS0000963 ENS0000963 409A1 +20 23772785 23776169 - unprocessed_pseudogene ENS0000452 FE8F7 havana 1.0 ENS0000452 ENS0000452 FE8F7 +20 23798092 23805983 + lncRNA ENS0000966 85C12 havana_tagene 2.0 ENS0000966 ENS0000966 85C12 +20 23823769 23826729 - protein_coding ENS0000548 690BC ensembl_havana 4.0 ENS0000548 ENS0000548 690BC +20 23875934 23879748 - protein_coding ENS0000283 C0035 ensembl_havana 5.0 ENS0000283 ENS0000283 C0035 +20 23918538 23928954 - transcribed_unprocessed_pseudogene ENS0001033 DD169 havana 2.0 ENS0001033 ENS0001033 DD169 +20 23980585 23980734 - unprocessed_pseudogene ENS0000420 1001D havana 1.0 ENS0000420 ENS0000420 1001D +20 23985050 23988779 - protein_coding ENS0000664 66C1A ensembl_havana 12.0 ENS0000664 ENS0000664 66C1A +20 23989292 23990454 + unprocessed_pseudogene ENS0000413 B3C04 havana 8.0 ENS0000413 ENS0000413 B3C04 +20 24063255 24226013 + lncRNA ENS0000772 B1335 havana 2.0 ENS0000772 ENS0000772 B1335 +20 24142590 24144386 - lncRNA ENS0000332 6DB1B havana 1.0 ENS0000332 ENS0000332 6DB1B +20 24237255 24237398 + snRNA ENS0000193 3164B ensembl 1.0 ENS0000193 ENS0000193 3164B +20 24297585 24318086 - lncRNA ENS0001245 3282C havana 2.0 ENS0001245 ENS0001245 3282C +20 24445391 24446314 + processed_pseudogene ENS0000281 AB004 havana 1.0 ENS0000281 ENS0000281 AB004 +20 24469629 24666616 + protein_coding ENS0000948 BD7F0 ensembl_havana 6.0 ENS0000948 ENS0000948 BD7F0 +20 24491472 24502345 - lncRNA ENS0000750 B8BE5 havana 1.0 ENS0000750 ENS0000750 B8BE5 +20 24679547 24680991 + lncRNA ENS0000486 049CA havana 1.0 ENS0000486 ENS0000486 049CA +20 24931840 24932983 + lncRNA ENS0000893 10ECF havana 1.0 ENS0000893 ENS0000893 10ECF +20 24949269 24959928 + protein_coding ENS0001214 95E3E ensembl_havana 6.0 ENS0001214 ENS0001214 95E3E +20 24962925 24992751 - protein_coding ENS0000773 A2BF7 ensembl_havana 12.0 ENS0000773 ENS0000773 A2BF7 +20 24974171 24974280 + snRNA ENS0000543 3CFD9 ensembl 1.0 ENS0000543 ENS0000543 3CFD9 +20 24992312 24993322 + lncRNA ENS0000571 C86E6 havana_tagene 1.0 ENS0000571 ENS0000571 C86E6 +20 25006230 25058980 - protein_coding ENS0001063 A6CAB ensembl_havana 15.0 ENS0001063 ENS0001063 A6CAB +20 25062962 25063591 - lncRNA ENS0001263 83E85 havana 1.0 ENS0001263 ENS0001263 83E85 +20 25070885 25082141 - protein_coding ENS0000470 ED758 ensembl_havana 15.0 ENS0000470 ENS0000470 ED758 +20 25099105 25099928 + processed_pseudogene ENS0000186 B08F7 havana 3.0 ENS0000186 ENS0000186 B08F7 +20 25139652 25149372 - lncRNA ENS0001148 03BE6 havana 6.0 ENS0001148 ENS0001148 03BE6 +20 25161167 25162061 - processed_pseudogene ENS0000103 8285B havana 1.0 ENS0000103 ENS0000103 8285B +20 25184658 25187849 - lncRNA ENS0000241 44077 havana 1.0 ENS0000241 ENS0000241 44077 +20 25195693 25228075 + protein_coding ENS0000319 C60E4 ensembl_havana 13.0 ENS0000319 ENS0000319 C60E4 +20 25214698 25214798 + misc_RNA ENS0001005 44FAD ensembl 1.0 ENS0001005 ENS0001005 44FAD +20 25226448 25226735 - TEC ENS0001219 F9CA4 havana 1.0 ENS0001219 ENS0001219 F9CA4 +20 25229150 25231933 + lncRNA ENS0001074 B0D54 havana 1.0 ENS0001074 ENS0001074 B0D54 +20 25239007 25245229 - lncRNA ENS0000305 A6566 havana 1.0 ENS0000305 ENS0000305 A6566 +20 25248085 25298012 + protein_coding ENS0000521 FC15F ensembl_havana 12.0 ENS0000521 ENS0000521 FC15F +20 25251008 25251304 - lncRNA ENS0001429 C01FA havana 1.0 ENS0001429 ENS0001429 C01FA +20 25284915 25285588 - lncRNA ENS0000549 BB958 havana 1.0 ENS0000549 ENS0000549 BB958 +20 25294742 25390835 - protein_coding ENS0000605 87F76 ensembl_havana 20.0 ENS0000605 ENS0000605 87F76 +20 25368275 25368766 - processed_pseudogene ENS0000949 DA6AC havana 2.0 ENS0000949 ENS0000949 DA6AC +20 25407673 25452628 + protein_coding ENS0000401 B81C7 ensembl_havana 10.0 ENS0000401 ENS0000401 B81C7 +20 25452697 25585531 - protein_coding ENS0000462 5BB8F ensembl_havana 15.0 ENS0000462 ENS0000462 5BB8F +20 25611153 25611223 + snRNA ENS0001411 5E748 ensembl 1.0 ENS0001411 ENS0001411 5E748 +20 25612935 25624014 - protein_coding ENS0000764 6F1B9 ensembl_havana 5.0 ENS0000764 ENS0000764 6F1B9 +20 25624045 25689032 + lncRNA ENS0001060 E864F havana 7.0 ENS0001060 ENS0001060 E864F +20 25654595 25654881 - misc_RNA ENS0000331 B2009 ensembl 2.0 ENS0000331 ENS0000331 B2009 +20 25669857 25670386 - processed_pseudogene ENS0001261 3D760 havana 1.0 ENS0001261 ENS0001261 3D760 +20 25673195 25696853 - protein_coding ENS0000855 4BAFD ensembl_havana 14.0 ENS0000855 ENS0000855 4BAFD +20 25680781 25681246 + lncRNA ENS0000304 D932E havana 1.0 ENS0000304 ENS0000304 D932E +20 25697003 25752776 + lncRNA ENS0001483 E215E havana 3.0 ENS0001483 ENS0001483 E215E +20 25734264 25735093 + unprocessed_pseudogene ENS0000168 2F395 havana 1.0 ENS0000168 ENS0000168 2F395 +20 25762384 25868225 - lncRNA ENS0001003 084E8 ensembl_havana 16.0 ENS0001003 ENS0001003 084E8 +20 25783553 25783919 + processed_pseudogene ENS0000970 89EE1 havana 1.0 ENS0000970 ENS0000970 89EE1 +20 25835985 25836832 - unprocessed_pseudogene ENS0000979 A5BFD havana 1.0 ENS0000979 ENS0000979 A5BFD +20 25845497 25845862 + processed_pseudogene ENS0001488 74708 havana 1.0 ENS0001488 ENS0001488 74708 +20 25919499 25919681 - unprocessed_pseudogene ENS0000190 5DA60 havana 1.0 ENS0000190 ENS0000190 5DA60 +20 25955795 25969288 + lncRNA ENS0001277 21441 havana_tagene 2.0 ENS0001277 ENS0001277 21441 +20 25985210 26086917 + lncRNA ENS0001015 1F6C1 havana 14.0 ENS0001015 ENS0001015 1F6C1 +20 26008791 26010531 - lncRNA ENS0001463 FAF75 havana 1.0 ENS0001463 ENS0001463 FAF75 +20 26009792 26011243 + lncRNA ENS0000886 A5CB8 havana_tagene 1.0 ENS0000886 ENS0000886 A5CB8 +20 26018832 26020684 + unprocessed_pseudogene ENS0000300 EFD8A havana 4.0 ENS0000300 ENS0000300 EFD8A +20 26071988 26072351 - processed_pseudogene ENS0001368 B8CA3 havana 1.0 ENS0001368 ENS0001368 B8CA3 +20 26103416 26114028 - transcribed_unprocessed_pseudogene ENS0000450 664AE havana 1.0 ENS0000450 ENS0000450 664AE +20 26132887 26134198 - lncRNA ENS0000805 D0322 havana 1.0 ENS0000805 ENS0000805 D0322 +20 26135983 26188191 + lncRNA ENS0001320 83230 havana_tagene 1.0 ENS0001320 ENS0001320 83230 +20 26167817 26251546 - lncRNA ENS0000544 663A7 havana 11.0 ENS0000544 ENS0000544 663A7 +20 26188408 26196891 + lncRNA ENS0001050 61F22 havana 5.0 ENS0001050 ENS0001050 61F22 +20 26208186 26208278 - miRNA ENS0001445 EF6ED mirbase 1.0 ENS0001445 ENS0001445 EF6ED +20 28563850 28602843 - transcribed_unprocessed_pseudogene ENS0001099 AA73B havana 2.0 ENS0001099 ENS0001099 AA73B +20 29080390 29102761 + unprocessed_pseudogene ENS0000933 DFFAE havana 1.0 ENS0000933 ENS0000933 DFFAE +20 29297095 29297246 - rRNA ENS0000230 AC9E0 ensembl 1.0 ENS0000230 ENS0000230 AC9E0 +20 29318824 29319579 + unprocessed_pseudogene ENS0000115 994EC havana 1.0 ENS0000115 ENS0000115 994EC +20 29321310 29321666 - unprocessed_pseudogene ENS0001389 E985F havana 1.0 ENS0001389 ENS0001389 E985F +20 29324092 29325049 + unprocessed_pseudogene ENS0001036 A14C8 havana 1.0 ENS0001036 ENS0001036 A14C8 +20 29410348 29411600 - unprocessed_pseudogene ENS0000284 6D62F havana 1.0 ENS0000284 ENS0000284 6D62F +20 29415264 29416911 - unprocessed_pseudogene ENS0001281 96E1D havana 1.0 ENS0001281 ENS0001281 96E1D +20 29427400 29427883 - processed_pseudogene ENS0000425 A1C40 havana 1.0 ENS0000425 ENS0000425 A1C40 +20 29430723 29432913 + processed_pseudogene ENS0000439 1D4C2 havana 1.0 ENS0000439 ENS0000439 1D4C2 +20 29448517 29449306 - unprocessed_pseudogene ENS0000880 B6DA0 havana 1.0 ENS0000880 ENS0000880 B6DA0 +20 29480147 29497179 - unprocessed_pseudogene ENS0000826 35B44 havana 1.0 ENS0000826 ENS0000826 35B44 +20 29548661 29559705 + lncRNA ENS0001531 D04D1 havana_tagene 1.0 ENS0001531 ENS0001531 D04D1 +20 29575060 29575171 + misc_RNA ENS0001013 9951B ensembl 1.0 ENS0001013 ENS0001013 9951B +20 29641778 29641976 + processed_pseudogene ENS0001487 B9388 havana 1.0 ENS0001487 ENS0001487 B9388 +20 29741510 29741661 - rRNA ENS0000629 E0603 ensembl 1.0 ENS0000629 ENS0000629 E0603 +20 29746181 29748090 + processed_pseudogene ENS0000350 E0AB7 havana 1.0 ENS0000350 ENS0000350 E0AB7 +20 29874059 29874177 + rRNA ENS0000326 5B3C8 ensembl 1.0 ENS0000326 ENS0000326 5B3C8 +20 29878636 29879363 - unprocessed_pseudogene ENS0000891 B4710 havana 1.0 ENS0000891 ENS0000891 B4710 +20 30214765 30214976 - processed_pseudogene ENS0001232 0AA33 havana 3.0 ENS0001232 ENS0001232 0AA33 +20 30278906 30289956 - lncRNA ENS0000080 F68F4 havana_tagene 5.0 ENS0000080 ENS0000080 F68F4 +20 30281837 30282954 + lncRNA ENS0000058 5AEC0 havana 1.0 ENS0000058 ENS0000058 5AEC0 +20 30294550 30312480 + lncRNA ENS0000271 12822 havana_tagene 1.0 ENS0000271 ENS0000271 12822 +20 30309015 30309126 - misc_RNA ENS0000460 A7238 ensembl 1.0 ENS0000460 ENS0000460 A7238 +20 30323367 30362276 - lncRNA ENS0000683 E4BB7 havana 2.0 ENS0000683 ENS0000683 E4BB7 +20 30377372 30399257 + unprocessed_pseudogene ENS0001246 79955 havana 15.0 ENS0001246 ENS0001246 79955 +20 30403123 30403384 - processed_pseudogene ENS0000128 97C2A havana 1.0 ENS0000128 ENS0000128 97C2A +20 30484925 30485076 - rRNA ENS0000782 2CAF9 ensembl 1.0 ENS0000782 ENS0000782 2CAF9 +20 30545601 30551470 + unprocessed_pseudogene ENS0000141 0DE36 havana 1.0 ENS0000141 ENS0000141 0DE36 +20 30656033 30723932 - transcribed_unprocessed_pseudogene ENS0000718 F800D ensembl_havana 3.0 ENS0000718 ENS0000718 F800D +20 30713240 30713346 - snRNA ENS0000234 20750 ensembl 1.0 ENS0000234 ENS0000234 20750 +20 30816156 30816274 - rRNA ENS0001014 E9C5B ensembl 1.0 ENS0001014 ENS0001014 E9C5B +20 31257664 31259632 + protein_coding ENS0000569 70A25 ensembl_havana 1.0 ENS0000569 ENS0000569 70A25 +20 31274170 31274867 + processed_pseudogene ENS0001339 2298C havana 2.0 ENS0001339 ENS0001339 2298C +20 31285317 31286835 - processed_pseudogene ENS0001216 78426 havana 2.0 ENS0001216 ENS0001216 78426 +20 31303212 31308585 - protein_coding ENS0000638 9182A ensembl_havana 1.0 ENS0000638 ENS0000638 9182A +20 31311303 31311962 - processed_pseudogene ENS0000901 EA6FF havana 1.0 ENS0000901 ENS0000901 EA6FF +20 31341371 31344471 - processed_pseudogene ENS0001151 94BAD havana 1.0 ENS0001151 ENS0001151 94BAD +20 31356870 31356978 + rRNA_pseudogene ENS0000941 6A3B6 ensembl 1.0 ENS0000941 ENS0000941 6A3B6 +20 31360790 31360930 + processed_pseudogene ENS0000734 E14E2 havana 3.0 ENS0000734 ENS0000734 E14E2 +20 31368601 31373923 + protein_coding ENS0001139 2B9BC ensembl_havana 4.0 ENS0001139 ENS0001139 2B9BC +20 31376319 31376446 + snoRNA ENS0000719 7942E ensembl 1.0 ENS0000719 ENS0000719 7942E +20 31377164 31390590 - protein_coding ENS0001217 98A65 ensembl_havana 7.0 ENS0001217 ENS0001217 98A65 +20 31404845 31412838 - protein_coding ENS0000006 F36BF ensembl_havana 4.0 ENS0000006 ENS0000006 F36BF +20 31421436 31429180 - transcribed_unprocessed_pseudogene ENS0000604 07A9C havana 7.0 ENS0000604 ENS0000604 07A9C +20 31440632 31450257 + protein_coding ENS0001374 6BD68 ensembl_havana 7.0 ENS0001374 ENS0001374 6BD68 +20 31465506 31476757 - protein_coding ENS0001248 5C369 ensembl_havana 3.0 ENS0001248 ENS0001248 5C369 +20 31475288 31484895 + protein_coding ENS0001396 E86A0 ensembl_havana 4.0 ENS0001396 ENS0001396 E86A0 +20 31485778 31487574 + lncRNA ENS0001198 F7DFE havana 1.0 ENS0001198 ENS0001198 F7DFE +20 31514410 31577923 + protein_coding ENS0000759 4EAA0 ensembl_havana 18.0 ENS0000759 ENS0000759 4EAA0 +20 31535263 31535626 + lncRNA ENS0001176 62252 havana 1.0 ENS0001176 ENS0001176 62252 +20 31547412 31548081 + protein_coding ENS0001274 CBB6F havana 6.0 ENS0001274 ENS0001274 CBB6F +20 31563166 31564076 + lncRNA ENS0000355 254D9 havana 1.0 ENS0000355 ENS0000355 254D9 +20 31567707 31573263 - lncRNA ENS0000279 A06CF havana 1.0 ENS0000279 ENS0000279 A06CF +20 31580890 31581214 + lncRNA ENS0000407 CF41F havana 1.0 ENS0000407 ENS0000407 CF41F +20 31581089 31581192 + snRNA ENS0001393 9D9C5 ensembl 1.0 ENS0001393 ENS0001393 9D9C5 +20 31605283 31606515 + protein_coding ENS0000908 35146 ensembl_havana 9.0 ENS0000908 ENS0000908 35146 +20 31607186 31607240 + miRNA ENS0001466 7A4E0 mirbase 1.0 ENS0001466 ENS0001466 7A4E0 +20 31637912 31645006 + protein_coding ENS0000840 2FBA0 ensembl_havana 5.0 ENS0000840 ENS0000840 2FBA0 +20 31664452 31723989 - protein_coding ENS0000297 EFBFE ensembl_havana 14.0 ENS0000297 ENS0000297 EFBFE +20 31686216 31716825 + lncRNA ENS0000255 39BBC havana 1.0 ENS0000255 ENS0000255 39BBC +20 31721507 31723409 + lncRNA ENS0000844 DF310 havana 1.0 ENS0000844 ENS0000844 DF310 +20 31739271 31801805 + protein_coding ENS0001321 6860B ensembl_havana 16.0 ENS0001321 ENS0001321 6860B +20 31808503 31808749 - processed_pseudogene ENS0000959 05634 havana 1.0 ENS0000959 ENS0000959 05634 +20 31819308 31834689 + protein_coding ENS0000557 2604A ensembl_havana 11.0 ENS0000557 ENS0000557 2604A +20 31844303 31845604 - protein_coding ENS0001541 8E3D3 ensembl_havana 8.0 ENS0001541 ENS0001541 8E3D3 +20 31847637 31870747 - protein_coding ENS0000011 F9AFE ensembl_havana 15.0 ENS0000011 ENS0000011 F9AFE +20 31870702 31944963 + protein_coding ENS0000001 23305 ensembl_havana 17.0 ENS0000001 ENS0000001 23305 +20 31901254 31901426 + snRNA ENS0001417 9283F ensembl 1.0 ENS0001417 ENS0001417 9283F +20 31944337 31952046 - protein_coding ENS0001361 8BCCB ensembl_havana 6.0 ENS0001361 ENS0001361 8BCCB +20 31968151 32003387 + protein_coding ENS0001550 EB7C5 ensembl_havana 3.0 ENS0001550 ENS0001550 EB7C5 +20 31970181 31970831 + lncRNA ENS0000340 CA6E9 havana 1.0 ENS0000340 ENS0000340 CA6E9 +20 32005671 32031591 - lncRNA ENS0000978 CBE84 havana 2.0 ENS0000978 ENS0000978 CBE84 +20 32008351 32008472 + rRNA_pseudogene ENS0000449 F77B5 ensembl 1.0 ENS0000449 ENS0000449 F77B5 +20 32010438 32032180 + protein_coding ENS0001434 20DC2 ensembl_havana 17.0 ENS0001434 ENS0001434 20DC2 +20 32050588 32050705 + rRNA_pseudogene ENS0001044 2635E ensembl 1.0 ENS0001044 ENS0001044 2635E +20 32052197 32101856 + protein_coding ENS0001503 8D3CC ensembl_havana 17.0 ENS0001503 ENS0001503 8D3CC +20 32109714 32167258 + protein_coding ENS0000999 4ADC9 ensembl_havana 16.0 ENS0000999 ENS0000999 4ADC9 +20 32116171 32116629 + lncRNA ENS0000861 3E8D1 havana 1.0 ENS0000861 ENS0000861 3E8D1 +20 32170390 32170790 - processed_pseudogene ENS0001236 EB734 havana 1.0 ENS0001236 ENS0001236 EB734 +20 32186477 32190527 - transcribed_processed_pseudogene ENS0000219 2F164 havana 6.0 ENS0000219 ENS0000219 2F164 +20 32192504 32207743 - protein_coding ENS0000684 ADFA1 ensembl_havana 7.0 ENS0000684 ENS0000684 ADFA1 +20 32207880 32238658 + protein_coding ENS0000457 A0561 ensembl_havana 14.0 ENS0000457 ENS0000457 A0561 +20 32237795 32237847 + miRNA ENS0001150 156F6 mirbase 1.0 ENS0001150 ENS0001150 156F6 +20 32275117 32277510 - lncRNA ENS0000716 DA337 havana_tagene 1.0 ENS0000716 ENS0000716 DA337 +20 32277651 32335011 + protein_coding ENS0001490 5E0DE ensembl_havana 8.0 ENS0001490 ENS0001490 5E0DE +20 32355053 32355734 + lncRNA ENS0001401 99A42 havana 1.0 ENS0001401 ENS0001401 99A42 +20 32358330 32439319 + protein_coding ENS0000431 31211 ensembl_havana 20.0 ENS0000431 ENS0000431 31211 +20 32443059 32585074 - protein_coding ENS0000231 A2E42 ensembl_havana 14.0 ENS0000231 ENS0000231 A2E42 +20 32449755 32453607 + lncRNA ENS0000048 AF8E6 havana 1.0 ENS0000048 ENS0000048 AF8E6 +20 32485149 32487019 + lncRNA ENS0001239 9824A havana_tagene 1.0 ENS0001239 ENS0001239 9824A +20 32509959 32520285 + lncRNA ENS0000100 92C77 havana 1.0 ENS0000100 ENS0000100 92C77 +20 32561093 32573888 + lncRNA ENS0001241 D3B72 havana 1.0 ENS0001241 ENS0001241 D3B72 +20 32564992 32608893 + lncRNA ENS0000848 A2827 ensembl_havana 7.0 ENS0000848 ENS0000848 A2827 +20 32581963 32593900 + lncRNA ENS0000380 C59A9 havana 1.0 ENS0000380 ENS0000380 C59A9 +20 32631625 32673941 - protein_coding ENS0001155 B1DD9 ensembl_havana 10.0 ENS0001155 ENS0001155 B1DD9 +20 32632183 32743567 - protein_coding ENS0001527 8E693 havana 1.0 ENS0001527 ENS0001527 8E693 +20 32690180 32690815 - processed_pseudogene ENS0000694 9C28D havana 8.0 ENS0000694 ENS0000694 9C28D +20 32702699 32743467 - protein_coding ENS0000935 E0350 ensembl_havana 12.0 ENS0000935 ENS0000935 E0350 +20 32762385 32809356 + protein_coding ENS0000925 44584 ensembl_havana 18.0 ENS0000925 ENS0000925 44584 +20 32819954 32850405 + protein_coding ENS0001089 014B3 ensembl_havana 9.0 ENS0001089 ENS0001089 014B3 +20 32843128 32854257 - lncRNA ENS0001450 27ADE havana 1.0 ENS0001450 ENS0001450 27ADE +20 32856621 32858751 + lncRNA ENS0001287 B3C86 havana 2.0 ENS0001287 ENS0001287 B3C86 +20 32858923 32961609 + protein_coding ENS0001113 6587A ensembl_havana 12.0 ENS0001113 ENS0001113 6587A +20 32983773 33004433 - protein_coding ENS0001309 F1BAD ensembl_havana 12.0 ENS0001309 ENS0001309 F1BAD +20 33007704 33023703 + protein_coding ENS0001481 1AAD4 ensembl_havana 7.0 ENS0001481 ENS0001481 1AAD4 +20 33031648 33044047 + protein_coding ENS0001328 A4B5A ensembl_havana 11.0 ENS0001328 ENS0001328 A4B5A +20 33047517 33048152 - processed_pseudogene ENS0001213 1806C havana 3.0 ENS0001213 ENS0001213 1806C +20 33055424 33073628 + protein_coding ENS0001308 21215 ensembl_havana 7.0 ENS0001308 ENS0001308 21215 +20 33079644 33111751 + protein_coding ENS0000928 E60A1 ensembl_havana 8.0 ENS0000928 ENS0000928 E60A1 +20 33161768 33181412 + protein_coding ENS0000369 A216F ensembl_havana 11.0 ENS0000369 ENS0000369 A216F +20 33187391 33188292 + processed_pseudogene ENS0000588 273F3 havana 1.0 ENS0000588 ENS0000588 273F3 +20 33193585 33210462 + transcribed_unprocessed_pseudogene ENS0000546 280FC havana 10.0 ENS0000546 ENS0000546 280FC +20 33214920 33217067 - lncRNA ENS0000558 3EE9B havana 1.0 ENS0000558 ENS0000558 3EE9B +20 33217310 33227806 + protein_coding ENS0000037 BB42F ensembl_havana 12.0 ENS0000037 ENS0000037 BB42F +20 33235995 33243311 + protein_coding ENS0000697 0B363 ensembl_havana 12.0 ENS0000697 ENS0000697 0B363 +20 33272262 33272657 - processed_pseudogene ENS0000700 EC396 havana 1.0 ENS0000700 ENS0000700 EC396 +20 33273480 33309871 + protein_coding ENS0000626 F0227 ensembl_havana 11.0 ENS0000626 ENS0000626 F0227 +20 33316869 33329969 + unitary_pseudogene ENS0000915 DBE48 havana 1.0 ENS0000915 ENS0000915 DBE48 +20 33347720 33354444 + transcribed_unitary_pseudogene ENS0001039 F69F5 havana 5.0 ENS0001039 ENS0001039 F69F5 +20 33358839 33401561 - protein_coding ENS0000695 87AE0 ensembl_havana 21.0 ENS0000695 ENS0000695 87AE0 +20 33407957 33443763 - protein_coding ENS0001522 DB9B1 ensembl_havana 6.0 ENS0001522 ENS0001522 DB9B1 +20 33490075 33650036 + protein_coding ENS0000752 8B954 ensembl_havana 22.0 ENS0000752 ENS0000752 8B954 +20 33655701 33656423 - lncRNA ENS0001365 28F94 havana 1.0 ENS0001365 ENS0001365 28F94 +20 33657087 33674463 - protein_coding ENS0000434 08EAB ensembl_havana 17.0 ENS0000434 ENS0000434 08EAB +20 33662327 33665619 + protein_coding ENS0000343 57DE0 ensembl_havana 6.0 ENS0000343 ENS0000343 57DE0 +20 33666498 33668525 + protein_coding ENS0001546 3D35F ensembl 6.0 ENS0001546 ENS0001546 3D35F +20 33666943 33668525 + protein_coding ENS0000946 4BDC2 havana 1.0 ENS0000946 ENS0000946 4BDC2 +20 33674517 33675380 + lncRNA ENS0001233 F03C3 havana 1.0 ENS0001233 ENS0001233 F03C3 +20 33675477 33686385 - protein_coding ENS0000680 6519C ensembl_havana 13.0 ENS0000680 ENS0000680 6519C +20 33702758 33720319 - protein_coding ENS0001025 37B31 ensembl_havana 12.0 ENS0001025 ENS0001025 37B31 +20 33728931 33731828 - lncRNA ENS0000919 CF24F havana 2.0 ENS0000919 ENS0000919 CF24F +20 33731657 33792269 + protein_coding ENS0001297 D1DF7 ensembl_havana 14.0 ENS0001297 ENS0001297 D1DF7 +20 33752167 33752543 - processed_pseudogene ENS0000426 4E6FD havana 1.0 ENS0000426 ENS0000426 4E6FD +20 33787373 33811109 - lncRNA ENS0000538 DF6A0 havana 6.0 ENS0000538 ENS0000538 DF6A0 +20 33811348 33854366 + protein_coding ENS0001002 2B61D ensembl_havana 4.0 ENS0001002 ENS0001002 2B61D +20 33912323 33913336 - processed_pseudogene ENS0000578 35F45 havana 2.0 ENS0000578 ENS0000578 35F45 +20 33965162 33965576 + processed_pseudogene ENS0000252 A69C6 havana 1.0 ENS0000252 ENS0000252 A69C6 +20 33980679 33994357 - lncRNA ENS0000622 B2DAA havana 2.0 ENS0000622 ENS0000622 B2DAA +20 33993646 34108308 + protein_coding ENS0001538 00CAD ensembl_havana 12.0 ENS0001538 ENS0001538 00CAD +20 34014969 34017749 - lncRNA ENS0000122 845BA havana 2.0 ENS0000122 ENS0000122 845BA +20 34049119 34049190 + miRNA ENS0001495 41C71 mirbase 1.0 ENS0001495 ENS0001495 41C71 +20 34088309 34112243 - protein_coding ENS0000441 B5240 ensembl_havana 7.0 ENS0000441 ENS0000441 B5240 +20 34122470 34123290 + processed_pseudogene ENS0000152 325A8 havana 1.0 ENS0000152 ENS0000152 325A8 +20 34132270 34136485 + lncRNA ENS0001542 A86A1 havana_tagene 1.0 ENS0001542 ENS0001542 A86A1 +20 34194569 34269344 + protein_coding ENS0000996 57DDA ensembl_havana 9.0 ENS0000996 ENS0000996 57DDA +20 34213495 34215892 - processed_pseudogene ENS0000781 EE489 havana 3.0 ENS0000781 ENS0000781 EE489 +20 34234840 34281173 - lncRNA ENS0001227 40762 havana 1.0 ENS0001227 ENS0001227 40762 +20 34245953 34246537 - processed_pseudogene ENS0000170 1279B havana 1.0 ENS0000170 ENS0000170 1279B +20 34280268 34311802 - protein_coding ENS0001062 24542 ensembl_havana 13.0 ENS0001062 ENS0001062 24542 +20 34281632 34286466 - lncRNA ENS0000910 DE9B2 havana 1.0 ENS0000910 ENS0000910 DE9B2 +20 34347671 34347785 - processed_pseudogene ENS0000568 6F286 havana 1.0 ENS0000568 ENS0000568 6F286 +20 34363241 34540748 + protein_coding ENS0000039 402BB havana 16.0 ENS0000039 ENS0000039 402BB +20 34384165 34384265 - misc_RNA ENS0000111 1D832 ensembl 1.0 ENS0000111 ENS0000111 1D832 +20 34402149 34402696 - processed_pseudogene ENS0001496 B31AE havana 1.0 ENS0001496 ENS0001496 B31AE +20 34441592 34442025 - lncRNA ENS0001381 09358 havana 1.0 ENS0001381 ENS0001381 09358 +20 34450930 34454552 + lncRNA ENS0000864 69EC6 havana 1.0 ENS0000864 ENS0000864 69EC6 +20 34466325 34466418 + miRNA ENS0001212 93F44 mirbase 1.0 ENS0001212 ENS0001212 93F44 +20 34475924 34476474 - processed_pseudogene ENS0000657 20804 havana 7.0 ENS0000657 ENS0000657 20804 +20 34476205 34476787 + lncRNA ENS0000757 F96F0 havana 1.0 ENS0000757 ENS0000757 F96F0 +20 34516414 34540958 + protein_coding ENS0000047 9DA20 ensembl_havana 17.0 ENS0000047 ENS0000047 9DA20 +20 34526510 34526606 - misc_RNA ENS0001459 431BC ensembl 1.0 ENS0001459 ENS0001459 431BC +20 34546854 34560345 + protein_coding ENS0000814 E1221 ensembl_havana 13.0 ENS0000814 ENS0000814 E1221 +20 34560542 34698790 - protein_coding ENS0000723 4A381 ensembl_havana 11.0 ENS0000723 ENS0000723 4A381 +20 34688688 34688978 + processed_pseudogene ENS0000067 319E4 havana 1.0 ENS0000067 ENS0000067 319E4 +20 34689097 34825651 - protein_coding ENS0000288 9AF6B ensembl_havana 14.0 ENS0000288 ENS0000288 9AF6B +20 34704339 34713439 + protein_coding ENS0001110 86EE1 ensembl_havana 13.0 ENS0001110 ENS0001110 86EE1 +20 34741401 34807447 + lncRNA ENS0000849 0ED44 havana_tagene 1.0 ENS0000849 ENS0000849 0ED44 +20 34833575 34834462 - processed_pseudogene ENS0000811 7DAE0 havana 2.0 ENS0000811 ENS0000811 7DAE0 +20 34844720 34872856 - protein_coding ENS0000973 75E19 ensembl_havana 17.0 ENS0000973 ENS0000973 75E19 +20 34872146 34927962 + protein_coding ENS0000492 23E42 ensembl_havana 20.0 ENS0000492 ENS0000492 23E42 +20 34928432 34956027 - protein_coding ENS0001160 30F96 ensembl_havana 12.0 ENS0001160 ENS0001160 30F96 +20 34955810 35002437 + protein_coding ENS0001252 A68E0 ensembl_havana 17.0 ENS0001252 ENS0001252 A68E0 +20 34990376 34990497 + miRNA ENS0001528 B6737 mirbase 1.0 ENS0001528 ENS0001528 B6737 +20 34990400 34990472 - miRNA ENS0001043 F6029 mirbase 1.0 ENS0001043 ENS0001043 F6029 +20 35002404 35092807 - protein_coding ENS0001370 EF78D ensembl_havana 12.0 ENS0001370 ENS0001370 EF78D +20 35030317 35030420 - snRNA ENS0001329 B6344 ensembl 1.0 ENS0001329 ENS0001329 B6344 +20 35115364 35147336 - protein_coding ENS0000968 542A6 ensembl_havana 13.0 ENS0000968 ENS0000968 542A6 +20 35117136 35117207 - snoRNA ENS0000107 F48E9 ensembl 1.0 ENS0000107 ENS0000107 F48E9 +20 35172072 35216240 + protein_coding ENS0000807 51764 ensembl_havana 6.0 ENS0000807 ENS0000807 51764 +20 35174355 35174919 - lncRNA ENS0000636 8906D havana 1.0 ENS0000636 ENS0000636 8906D +20 35195306 35195423 + rRNA_pseudogene ENS0000599 A49D8 ensembl 1.0 ENS0000599 ENS0000599 A49D8 +20 35201745 35278131 - protein_coding ENS0000907 20AB2 havana 17.0 ENS0000907 ENS0000907 20AB2 +20 35218009 35218185 + processed_pseudogene ENS0000242 192EF havana 3.0 ENS0000242 ENS0000242 192EF +20 35226690 35276998 + protein_coding ENS0001513 A88FF ensembl_havana 10.0 ENS0001513 ENS0001513 A88FF +20 35262727 35264187 - TEC ENS0001442 11575 havana 1.0 ENS0001442 ENS0001442 11575 +20 35267885 35280043 - protein_coding ENS0001306 983F9 havana 1.0 ENS0001306 ENS0001306 983F9 +20 35278907 35284985 - protein_coding ENS0000344 80C31 ensembl_havana 9.0 ENS0000344 ENS0000344 80C31 +20 35285251 35285756 + lncRNA ENS0001191 94E8C havana 1.0 ENS0001191 ENS0001191 94E8C +20 35285731 35292425 - protein_coding ENS0000295 FC69A ensembl_havana 8.0 ENS0000295 ENS0000295 FC69A +20 35302566 35412031 - protein_coding ENS0001041 0B4AE ensembl_havana 22.0 ENS0001041 ENS0001041 0B4AE +20 35433029 35435450 + lncRNA ENS0000927 01BCD havana 3.0 ENS0000927 ENS0000927 01BCD +20 35433347 35454746 - protein_coding ENS0000329 EB770 ensembl_havana 9.0 ENS0000329 ENS0000329 EB770 +20 35453954 35454097 - miRNA ENS0001337 1D440 mirbase 1.0 ENS0001337 ENS0001337 1D440 +20 35455164 35519280 + protein_coding ENS0001078 1D7D6 ensembl_havana 16.0 ENS0001078 ENS0001078 1D7D6 +20 35476203 35490982 - lncRNA ENS0000972 26AB4 havana 6.0 ENS0000972 ENS0000972 26AB4 +20 35520852 35528255 - transcribed_unprocessed_pseudogene ENS0000902 60911 havana 6.0 ENS0000902 ENS0000902 60911 +20 35523186 35529652 - protein_coding ENS0001017 C82F5 ensembl_havana 14.0 ENS0001017 ENS0001017 C82F5 +20 35542038 35557634 + protein_coding ENS0000076 8C392 ensembl_havana 20.0 ENS0000076 ENS0000076 8C392 +20 35544430 35544747 - processed_pseudogene ENS0001426 7A2EE havana 1.0 ENS0001426 ENS0001426 7A2EE +20 35558737 35607562 - transcribed_unitary_pseudogene ENS0000674 93EAF ensembl_havana 17.0 ENS0000674 ENS0000674 93EAF +20 35588324 35588607 + processed_pseudogene ENS0001467 75526 havana 1.0 ENS0001467 ENS0001467 75526 +20 35615829 35621094 + protein_coding ENS0000890 EA036 ensembl_havana 10.0 ENS0000890 ENS0000890 EA036 +20 35626031 35664956 - protein_coding ENS0000403 32479 ensembl_havana 13.0 ENS0000403 ENS0000403 32479 +20 35632340 35674544 - protein_coding ENS0000385 C2013 havana 5.0 ENS0000385 ENS0000385 C2013 +20 35643143 35643462 + misc_RNA ENS0000952 4F575 ensembl 1.0 ENS0000952 ENS0000952 4F575 +20 35647328 35647430 - snRNA ENS0000390 671C3 ensembl 1.0 ENS0000390 ENS0000390 671C3 +20 35648925 35664956 - protein_coding ENS0000696 F599C ensembl_havana 8.0 ENS0000696 ENS0000696 F599C +20 35668052 35699355 - protein_coding ENS0000842 ACB9A ensembl_havana 13.0 ENS0000842 ENS0000842 ACB9A +20 35699272 35700984 + protein_coding ENS0001412 02AB4 ensembl_havana 16.0 ENS0001412 ENS0001412 02AB4 +20 35701347 35742312 - protein_coding ENS0000510 8CF90 ensembl_havana 24.0 ENS0000510 ENS0000510 8CF90 +20 35752814 35753719 + processed_pseudogene ENS0000722 EF812 havana 3.0 ENS0000722 ENS0000722 EF812 +20 35769785 35769867 - snRNA ENS0000338 59AEC ensembl 1.0 ENS0000338 ENS0000338 59AEC +20 35771974 35950370 + protein_coding ENS0000189 BA150 ensembl_havana 17.0 ENS0000189 ENS0000189 BA150 +20 35850674 35850915 - processed_pseudogene ENS0001029 93DDD havana 1.0 ENS0001029 ENS0001029 93DDD +20 35853252 35853364 - misc_RNA ENS0000244 65FAA ensembl 1.0 ENS0000244 ENS0000244 65FAA +20 35878476 35878733 + processed_pseudogene ENS0000675 2F483 havana 3.0 ENS0000675 ENS0000675 2F483 +20 35887659 35887801 + snRNA ENS0000576 75613 ensembl 1.0 ENS0000576 ENS0000576 75613 +20 35928570 35928675 - snRNA ENS0001318 8B2F2 ensembl 1.0 ENS0001318 ENS0001318 8B2F2 +20 35953617 35959472 - protein_coding ENS0000202 0D01E ensembl_havana 10.0 ENS0000202 ENS0000202 0D01E +20 35954564 36030700 + protein_coding ENS0000988 49F9F ensembl_havana 12.0 ENS0000988 ENS0000988 49F9F +20 36045618 36051018 - lncRNA ENS0000592 3C2A0 havana 2.0 ENS0000592 ENS0000592 3C2A0 +20 36051491 36085703 + lncRNA ENS0000883 3BC61 havana 4.0 ENS0000883 ENS0000883 3BC61 +20 36064243 36064563 - lncRNA ENS0000937 8BB1C havana 1.0 ENS0000937 ENS0000937 8BB1C +20 36076018 36076601 - processed_pseudogene ENS0000182 4F7ED havana 1.0 ENS0000182 ENS0000182 4F7ED +20 36086252 36088192 - lncRNA ENS0000995 67DC1 havana 1.0 ENS0000995 ENS0000995 67DC1 +20 36091504 36232799 + protein_coding ENS0000493 AFD0A ensembl_havana 23.0 ENS0000493 ENS0000493 AFD0A +20 36147334 36155760 - lncRNA ENS0000698 2FC68 havana 7.0 ENS0000698 ENS0000698 2FC68 +20 36233851 36234297 + lncRNA ENS0000550 CAFCF havana 1.0 ENS0000550 ENS0000550 CAFCF +20 36236459 36270918 + protein_coding ENS0000043 DE080 ensembl_havana 12.0 ENS0000043 ENS0000043 DE080 +20 36306336 36528637 + protein_coding ENS0000505 9F73E ensembl_havana 18.0 ENS0000505 ENS0000505 9F73E +20 36507702 36573391 - lncRNA ENS0001023 28FE0 havana 8.0 ENS0001023 ENS0001023 28FE0 +20 36541497 36551447 + protein_coding ENS0001121 90216 ensembl_havana 10.0 ENS0001121 ENS0001121 90216 +20 36561237 36561530 - misc_RNA ENS0000177 EC271 ensembl 1.0 ENS0000177 ENS0000177 EC271 +20 36573488 36593950 + protein_coding ENS0001284 403F9 ensembl_havana 11.0 ENS0001284 ENS0001284 403F9 +20 36574553 36612384 + protein_coding ENS0000353 E7848 ensembl_havana 1.0 ENS0000353 ENS0000353 E7848 +20 36605779 36612557 + protein_coding ENS0000912 9144F ensembl_havana 18.0 ENS0000912 ENS0000912 9144F +20 36612318 36646196 - protein_coding ENS0001356 FC071 ensembl_havana 14.0 ENS0001356 ENS0001356 FC071 +20 36620403 36621423 + unprocessed_pseudogene ENS0000488 44A3B havana 2.0 ENS0000488 ENS0000488 44A3B +20 36651766 36746090 - protein_coding ENS0001182 8AA98 ensembl_havana 21.0 ENS0001182 ENS0001182 8AA98 +20 36668559 36668670 - misc_RNA ENS0001144 11D46 ensembl 1.0 ENS0001144 ENS0001144 11D46 +20 36751791 36773818 - protein_coding ENS0000839 2F550 ensembl_havana 16.0 ENS0000839 ENS0000839 2F550 +20 36777447 36863538 - protein_coding ENS0001154 7748D ensembl_havana 15.0 ENS0001154 ENS0001154 7748D +20 36870099 36870402 + misc_RNA ENS0000712 61F34 ensembl 3.0 ENS0000712 ENS0000712 61F34 +20 36876121 36894235 + protein_coding ENS0000084 D54F6 ensembl_havana 10.0 ENS0000084 ENS0000084 D54F6 +20 36890229 36951843 - protein_coding ENS0000253 81D3B ensembl_havana 10.0 ENS0000253 ENS0000253 81D3B +20 36996349 37095997 - protein_coding ENS0000654 3FB96 ensembl_havana 12.0 ENS0000654 ENS0000654 3FB96 +20 37021743 37021984 - processed_pseudogene ENS0000169 01113 havana 1.0 ENS0000169 ENS0000169 01113 +20 37049254 37049707 + processed_pseudogene ENS0000178 51C51 havana 2.0 ENS0000178 ENS0000178 51C51 +20 37095785 37097178 + lncRNA ENS0001342 31E5A havana 1.0 ENS0001342 ENS0001342 31E5A +20 37101226 37179588 - protein_coding ENS0000028 4BAD5 havana 15.0 ENS0000028 ENS0000028 4BAD5 +20 37165774 37166312 - processed_pseudogene ENS0000528 D4413 havana 1.0 ENS0000528 ENS0000528 D4413 +20 37178410 37241619 + protein_coding ENS0000761 FF19A ensembl_havana 17.0 ENS0000761 ENS0000761 FF19A +20 37251082 37261835 - protein_coding ENS0001547 2B8DE ensembl_havana 9.0 ENS0001547 ENS0001547 2B8DE +20 37289638 37317260 + protein_coding ENS0000951 6D2F6 ensembl_havana 12.0 ENS0000951 ENS0000951 6D2F6 +20 37338886 37344109 - lncRNA ENS0001435 53F70 havana 1.0 ENS0001435 ENS0001435 53F70 +20 37344685 37406050 + protein_coding ENS0000706 0F91D ensembl_havana 12.0 ENS0000706 ENS0000706 0F91D +20 37444733 37445534 - processed_pseudogene ENS0000137 AAC2D havana 3.0 ENS0000137 ENS0000137 AAC2D +20 37492472 37527931 - protein_coding ENS0000497 C7CC8 ensembl_havana 15.0 ENS0000497 ENS0000497 C7CC8 +20 37521206 37523690 + protein_coding ENS0001330 DCCBB ensembl_havana 11.0 ENS0001330 ENS0001330 DCCBB +20 37526642 37527060 - lncRNA ENS0001006 C2F94 havana 1.0 ENS0001006 ENS0001006 C2F94 +20 37530039 37530543 + processed_pseudogene ENS0000502 6C473 havana 1.0 ENS0000502 ENS0000502 6C473 +20 37571103 37591493 + lncRNA ENS0000820 21830 havana 2.0 ENS0000820 ENS0000820 21830 +20 37602852 37603185 - processed_pseudogene ENS0001517 95823 havana 1.0 ENS0001517 ENS0001517 95823 +20 37619298 37623119 - lncRNA ENS0001507 FFAD4 havana 1.0 ENS0001507 ENS0001507 FFAD4 +20 37676889 37683234 + lncRNA ENS0000315 F4539 havana_tagene 2.0 ENS0000315 ENS0000315 F4539 +20 37693955 37872129 + protein_coding ENS0001398 173FC ensembl_havana 19.0 ENS0001398 ENS0001398 173FC +20 37903111 37945350 + protein_coding ENS0001406 78A85 ensembl_havana 12.0 ENS0001406 ENS0001406 78A85 +20 37975156 37975481 - misc_RNA ENS0000728 5FF45 ensembl 1.0 ENS0000728 ENS0000728 5FF45 +20 37983007 38033461 - protein_coding ENS0000661 27D9A ensembl_havana 13.0 ENS0000661 ENS0000661 27D9A +20 38033725 38127780 + protein_coding ENS0001020 0E6F6 ensembl_havana 12.0 ENS0001020 ENS0001020 0E6F6 +20 38098414 38098700 - misc_RNA ENS0000616 01697 ensembl 2.0 ENS0000616 ENS0000616 01697 +20 38103524 38104961 - lncRNA ENS0001203 49BA2 havana 1.0 ENS0001203 ENS0001203 49BA2 +20 38127385 38166578 - protein_coding ENS0000884 6B8C7 ensembl_havana 12.0 ENS0000884 ENS0000884 6B8C7 +20 38210503 38260772 - protein_coding ENS0000008 9EC40 ensembl_havana 12.0 ENS0000008 ENS0000008 9EC40 +20 38233251 38233799 - lncRNA ENS0000945 672EC havana 1.0 ENS0000945 ENS0000945 672EC +20 38260149 38304332 + lncRNA ENS0001223 1125C havana 1.0 ENS0001223 ENS0001223 1125C +20 38260175 38293892 + lncRNA ENS0001322 CFD1F havana 1.0 ENS0001322 ENS0001322 CFD1F +20 38270778 38274020 + lncRNA ENS0000507 6D3A8 havana 1.0 ENS0000507 ENS0000507 6D3A8 +20 38288331 38326536 - lncRNA ENS0000923 585A8 havana 3.0 ENS0000923 ENS0000923 585A8 +20 38304150 38337505 + protein_coding ENS0000153 EA30F ensembl_havana 14.0 ENS0000153 ENS0000153 EA30F +20 38346482 38377013 + protein_coding ENS0000127 EEC29 ensembl_havana 6.0 ENS0000127 ENS0000127 EEC29 +20 38378825 38383179 + lncRNA ENS0000475 86413 havana 1.0 ENS0000475 ENS0000475 86413 +20 38404893 38416797 - lncRNA ENS0001358 852D5 havana_tagene 2.0 ENS0001358 ENS0001358 852D5 +20 38418483 38419202 - lncRNA ENS0001383 3440D havana 1.0 ENS0001383 ENS0001383 3440D +20 38419638 38435409 - lncRNA ENS0000055 A7027 havana 13.0 ENS0000055 ENS0000055 A7027 +20 38421647 38421774 - snoRNA ENS0001202 4510A ensembl 1.0 ENS0001202 ENS0001202 4510A +20 38425083 38425354 - snoRNA ENS0000496 F0C99 havana 6.0 ENS0000496 ENS0000496 F0C99 +20 38427309 38427442 - snoRNA ENS0000817 7E0A9 ensembl 3.0 ENS0000817 ENS0000817 7E0A9 +20 38429670 38429803 - snoRNA ENS0000254 222A0 ensembl 1.0 ENS0000254 ENS0000254 222A0 +20 38433865 38433998 - snoRNA ENS0000615 A8E41 ensembl 1.0 ENS0000615 ENS0000615 A8E41 +20 38442036 38442169 + snoRNA ENS0000730 E8909 ensembl 1.0 ENS0000730 ENS0000730 E8909 +20 38446343 38450940 + lncRNA ENS0000690 76189 ensembl_havana 20.0 ENS0000690 ENS0000690 76189 +20 38448084 38448215 + snoRNA ENS0000483 17139 ensembl 1.0 ENS0000483 ENS0000483 17139 +20 38449370 38449503 + snoRNA ENS0001431 BDEC4 ensembl 1.0 ENS0001431 ENS0001431 BDEC4 +20 38472816 38578859 + protein_coding ENS0001535 30BAE ensembl_havana 15.0 ENS0001535 ENS0001535 30BAE +20 38516563 38516632 + miRNA ENS0000097 6715A mirbase 2.0 ENS0000097 ENS0000097 6715A +20 38537758 38538458 + processed_pseudogene ENS0001215 A1861 havana 1.0 ENS0001215 ENS0001215 A1861 +20 38581195 38588463 + protein_coding ENS0000310 AE7B2 ensembl_havana 12.0 ENS0000310 ENS0000310 AE7B2 +20 38601934 38651035 + protein_coding ENS0000292 EB445 ensembl_havana 10.0 ENS0000292 ENS0000292 EB445 +20 38614913 38615476 - processed_pseudogene ENS0000298 5A226 havana 2.0 ENS0000298 ENS0000298 5A226 +20 38711821 38712103 - misc_RNA ENS0000504 D2B81 ensembl 1.0 ENS0000504 ENS0000504 D2B81 +20 38724486 38729372 + protein_coding ENS0000816 0AF9C ensembl_havana 4.0 ENS0000816 ENS0000816 0AF9C +20 38748460 38772520 + protein_coding ENS0000263 3B866 ensembl_havana 10.0 ENS0000263 ENS0000263 3B866 +20 38761528 38761842 + misc_RNA ENS0000396 27C22 ensembl 1.0 ENS0000396 ENS0000396 27C22 +20 38805697 38923024 + protein_coding ENS0000536 6E548 ensembl_havana 10.0 ENS0000536 ENS0000536 6E548 +20 38872770 38873074 + misc_RNA ENS0000887 5C1E3 ensembl 3.0 ENS0000887 ENS0000887 5C1E3 +20 38926312 38953106 + protein_coding ENS0001210 3695B ensembl_havana 15.0 ENS0001210 ENS0001210 3695B +20 38955910 38956547 + lncRNA ENS0001299 DD04C havana 1.0 ENS0001299 ENS0001299 DD04C +20 38961925 38962111 - lncRNA ENS0001271 A8D7F havana 1.0 ENS0001271 ENS0001271 A8D7F +20 38962299 39039723 + protein_coding ENS0000327 68D68 ensembl_havana 15.0 ENS0000327 ENS0000327 68D68 +20 38962472 38962993 + lncRNA ENS0000639 ED997 havana 1.0 ENS0000639 ENS0000639 ED997 +20 38977711 38978422 - processed_pseudogene ENS0001493 5AF7B havana 1.0 ENS0001493 ENS0001493 5AF7B +20 39213777 39224748 - lncRNA ENS0000120 F031C havana 1.0 ENS0000120 ENS0000120 F031C +20 39231115 39233836 - lncRNA ENS0000089 53559 havana_tagene 1.0 ENS0000089 ENS0000089 53559 +20 39329014 39329924 - processed_pseudogene ENS0001278 50F86 havana 1.0 ENS0001278 ENS0001278 50F86 +20 39349094 39349392 - misc_RNA ENS0001432 8B900 ensembl 1.0 ENS0001432 ENS0001432 8B900 +20 39655325 39663552 - lncRNA ENS0000215 67B5F havana 1.0 ENS0000215 ENS0000215 67B5F +20 39757202 39815097 + lncRNA ENS0000131 C5DB2 havana 2.0 ENS0000131 ENS0000131 C5DB2 +20 39932922 39933239 - processed_pseudogene ENS0000358 D8C05 havana 1.0 ENS0000358 ENS0000358 D8C05 +20 40004216 40011323 + lncRNA ENS0000606 CB32B havana 2.0 ENS0000606 ENS0000606 CB32B +20 40031585 40043889 + lncRNA ENS0001452 5EC0A havana 1.0 ENS0001452 ENS0001452 5EC0A +20 40121041 40126357 + lncRNA ENS0001030 0395F havana 1.0 ENS0001030 ENS0001030 0395F +20 40685848 40689236 - protein_coding ENS0000282 8EFD1 ensembl_havana 4.0 ENS0000282 ENS0000282 8EFD1 +20 40696499 40698616 - lncRNA ENS0000132 4C2AA havana 2.0 ENS0000132 ENS0000132 4C2AA +20 40717257 40758636 + lncRNA ENS0001097 41570 havana_tagene 1.0 ENS0001097 ENS0001097 41570 +20 40854119 40854229 - rRNA_pseudogene ENS0000424 2BB72 ensembl 1.0 ENS0000424 ENS0000424 2BB72 +20 41024205 41024394 + snRNA ENS0000005 BB4B0 ensembl 1.0 ENS0000005 ENS0000005 BB4B0 +20 41028822 41124487 + protein_coding ENS0000485 0C4A3 ensembl_havana 6.0 ENS0000485 ENS0000485 0C4A3 +20 41098019 41138003 - lncRNA ENS0000453 64B83 havana 2.0 ENS0000453 ENS0000453 64B83 +20 41136960 41196801 + protein_coding ENS0000593 0B56F ensembl_havana 14.0 ENS0000593 ENS0000593 0B56F +20 41169023 41169078 + miRNA ENS0000809 02406 mirbase 1.0 ENS0000809 ENS0000809 02406 +20 41178448 41317672 - protein_coding ENS0001543 C9C09 ensembl_havana 21.0 ENS0001543 ENS0001543 C9C09 +20 41196647 41196938 - transcribed_processed_pseudogene ENS0000961 7D0D7 havana 1.0 ENS0000961 ENS0000961 7D0D7 +20 41196691 41197157 + transcribed_processed_pseudogene ENS0000749 060C6 havana 1.0 ENS0000749 ENS0000749 060C6 +20 41290280 41290577 - misc_RNA ENS0000347 20B49 ensembl 2.0 ENS0000347 ENS0000347 20B49 +20 41331123 41331660 + processed_pseudogene ENS0001307 E7D7E havana 1.0 ENS0001307 ENS0001307 E7D7E +20 41340821 41360582 + protein_coding ENS0001265 002BA ensembl_havana 12.0 ENS0001265 ENS0001265 002BA +20 41359962 41366818 - protein_coding ENS0001296 23D4F ensembl_havana 5.0 ENS0001296 ENS0001296 23D4F +20 41382383 41383107 + processed_pseudogene ENS0000518 065F0 havana 1.0 ENS0000518 ENS0000518 065F0 +20 41402083 41618384 - protein_coding ENS0001295 BF287 ensembl_havana 15.0 ENS0001295 ENS0001295 BF287 +20 41459354 41461944 - processed_pseudogene ENS0001344 E2432 havana 1.0 ENS0001344 ENS0001344 E2432 +20 41485571 41486225 - lncRNA ENS0001460 FAE2D havana 1.0 ENS0001460 ENS0001460 FAE2D +20 41547422 41547922 + processed_pseudogene ENS0001243 A04EA havana 2.0 ENS0001243 ENS0001243 A04EA +20 41684986 41685091 - snRNA ENS0001230 5DB48 ensembl 1.0 ENS0001230 ENS0001230 5DB48 +20 41712479 41712600 - snoRNA ENS0000768 C7081 ensembl 1.0 ENS0000768 ENS0000768 C7081 +20 41991140 42063969 + lncRNA ENS0001008 9E3D2 havana 1.0 ENS0001008 ENS0001008 9E3D2 +20 41997131 41999397 - processed_pseudogene ENS0000617 2A55D havana 1.0 ENS0000617 ENS0000617 2A55D +20 42014034 42015478 - lncRNA ENS0001404 A5F7A havana 1.0 ENS0001404 ENS0001404 A5F7A +20 42072752 43189970 - protein_coding ENS0001133 0D79C ensembl_havana 12.0 ENS0001133 ENS0001133 0D79C +20 42606694 42607559 + processed_pseudogene ENS0000126 A7193 havana 1.0 ENS0000126 ENS0000126 A7193 +20 42685404 42688562 + lncRNA ENS0000491 7C55B havana 2.0 ENS0000491 ENS0000491 7C55B +20 42968743 42971492 + lncRNA ENS0000270 58CA9 havana 1.0 ENS0000270 ENS0000270 58CA9 +20 43063326 43063609 + misc_RNA ENS0000307 DB163 ensembl 1.0 ENS0000307 ENS0000307 DB163 +20 43189998 43202599 + lncRNA ENS0000482 C1A63 havana 3.0 ENS0000482 ENS0000482 C1A63 +20 43222206 43222509 + misc_RNA ENS0000648 F5680 ensembl 3.0 ENS0000648 ENS0000648 F5680 +20 43230760 43231260 + processed_pseudogene ENS0000953 7EF4A havana 5.0 ENS0000953 ENS0000953 7EF4A +20 43304555 43304679 + scaRNA ENS0000012 A09B6 ensembl 1.0 ENS0000012 ENS0000012 A09B6 +20 43388642 43389041 - processed_pseudogene ENS0000916 FB15E havana 1.0 ENS0000916 ENS0000916 FB15E +20 43405473 43405578 + snRNA ENS0000405 786E8 ensembl 1.0 ENS0000405 ENS0000405 786E8 +20 43423196 43423538 - processed_pseudogene ENS0000567 A4F16 havana 1.0 ENS0000567 ENS0000567 A4F16 +20 43451159 43451260 - misc_RNA ENS0000748 CEDC3 ensembl 1.0 ENS0000748 ENS0000748 CEDC3 +20 43457893 43466046 + protein_coding ENS0001355 C047E ensembl_havana 16.0 ENS0001355 ENS0001355 C047E +20 43458054 43541881 + protein_coding ENS0000211 6E9D8 havana 1.0 ENS0000211 ENS0000211 6E9D8 +20 43473013 43473119 + snRNA ENS0001051 BCC91 ensembl 1.0 ENS0001051 ENS0001051 BCC91 +20 43489442 43550954 + protein_coding ENS0001253 ED4DB ensembl_havana 17.0 ENS0001253 ENS0001253 ED4DB +20 43523261 43523572 - processed_pseudogene ENS0000205 F93C0 havana 1.0 ENS0000205 ENS0000205 F93C0 +20 43540171 43569498 + protein_coding ENS0000049 92EAA havana 1.0 ENS0000049 ENS0000049 92EAA +20 43549389 43550949 - lncRNA ENS0000926 087CC havana 1.0 ENS0000926 ENS0000926 087CC +20 43558968 43588237 + protein_coding ENS0000248 5C1D4 ensembl_havana 17.0 ENS0000248 ENS0000248 5C1D4 +20 43590009 43590596 - lncRNA ENS0000533 3226F havana 1.0 ENS0000533 ENS0000533 3226F +20 43590937 43647299 + protein_coding ENS0001106 08E91 ensembl_havana 13.0 ENS0001106 ENS0001106 08E91 +20 43652523 43653291 + processed_pseudogene ENS0001458 79870 havana 2.0 ENS0001458 ENS0001458 79870 +20 43667019 43716495 + protein_coding ENS0001166 E35E6 ensembl_havana 16.0 ENS0001166 ENS0001166 E35E6 +20 43726164 43727002 - protein_coding ENS0000291 A8BD3 ensembl_havana 6.0 ENS0000291 ENS0000291 A8BD3 +20 43796437 43796543 - snRNA ENS0000495 4DEFE ensembl 1.0 ENS0000495 ENS0000495 4DEFE +20 43894720 43895468 + lncRNA ENS0000024 99E39 havana 1.0 ENS0000024 ENS0000024 99E39 +20 43914852 44069616 + protein_coding ENS0000451 7BDFA ensembl_havana 18.0 ENS0000451 ENS0000451 7BDFA +20 44007248 44007537 + misc_RNA ENS0000096 5806A ensembl 3.0 ENS0000096 ENS0000096 5806A +20 44106590 44187188 - protein_coding ENS0000769 720B9 ensembl_havana 7.0 ENS0000769 ENS0000769 720B9 +20 44181111 44182712 + lncRNA ENS0000086 9AAB9 havana_tagene 1.0 ENS0000086 ENS0000086 9AAB9 +20 44195939 44210771 - protein_coding ENS0001529 6FDF5 ensembl_havana 11.0 ENS0001529 ENS0001529 6FDF5 +20 44210907 44226027 + lncRNA ENS0001353 D9FBA havana 6.0 ENS0001353 ENS0001353 D9FBA +20 44247099 44280947 + protein_coding ENS0000050 5E499 ensembl_havana 17.0 ENS0000050 ENS0000050 5E499 +20 44302840 44311202 - protein_coding ENS0000637 BEA57 ensembl_havana 6.0 ENS0000637 ENS0000637 BEA57 +20 44337043 44351238 + protein_coding ENS0000429 97BED ensembl_havana 5.0 ENS0000429 ENS0000429 97BED +20 44344839 44344933 + misc_RNA ENS0001076 2F48A ensembl 2.0 ENS0001076 ENS0001076 2F48A +20 44347552 44355185 - lncRNA ENS0000715 77B8B havana 2.0 ENS0000715 ENS0000715 77B8B +20 44355700 44434596 + protein_coding ENS0000517 8B96C ensembl_havana 17.0 ENS0000517 ENS0000517 8B96C +20 44372746 44395706 - lncRNA ENS0001119 DB1DD havana 2.0 ENS0001119 ENS0001119 DB1DD +20 44389624 44391537 - lncRNA ENS0000148 22CE1 havana 1.0 ENS0000148 ENS0000148 22CE1 +20 44408120 44408203 + miRNA ENS0000904 3EBB0 mirbase 1.0 ENS0000904 ENS0000904 3EBB0 +20 44436172 44465349 - lncRNA ENS0000038 3D4E0 ensembl_havana 8.0 ENS0000038 ENS0000038 3D4E0 +20 44466564 44466842 - processed_pseudogene ENS0001473 9E500 havana 1.0 ENS0001473 ENS0001473 9E500 +20 44475874 44494603 + protein_coding ENS0001084 58C7F ensembl_havana 11.0 ENS0001084 ENS0001084 58C7F +20 44496221 44522085 - protein_coding ENS0000354 C9EE1 ensembl_havana 14.0 ENS0000354 ENS0000354 C9EE1 +20 44531785 44624247 + protein_coding ENS0001088 C73D3 ensembl_havana 14.0 ENS0001088 ENS0001088 C73D3 +20 44619522 44652233 - protein_coding ENS0001193 8FE6B ensembl_havana 13.0 ENS0001193 ENS0001193 8FE6B +20 44656451 44663498 - lncRNA ENS0001499 832CE havana 1.0 ENS0001499 ENS0001499 832CE +20 44656451 44696096 - lncRNA ENS0001533 287D7 havana 10.0 ENS0001533 ENS0001533 287D7 +20 44694892 44746021 - lncRNA ENS0000702 19DB6 havana 6.0 ENS0000702 ENS0000702 19DB6 +20 44714844 44728509 + protein_coding ENS0001451 FB478 ensembl_havana 10.0 ENS0001451 ENS0001451 FB478 +20 44745865 44752313 + protein_coding ENS0000866 E0D4B ensembl_havana 7.0 ENS0000866 ENS0000866 E0D4B +20 44746642 44747201 + lncRNA ENS0000547 CDDB2 havana 1.0 ENS0000547 ENS0000547 CDDB2 +20 44751808 44810546 - protein_coding ENS0000317 9C12A ensembl_havana 13.0 ENS0000317 ENS0000317 9C12A +20 44841925 44842539 - unprocessed_pseudogene ENS0001080 77BF1 havana 1.0 ENS0001080 ENS0001080 77BF1 +20 44864435 44885604 - lncRNA ENS0000870 20FCB havana 1.0 ENS0000870 ENS0000870 20FCB +20 44885702 44908532 + protein_coding ENS0001433 50D66 ensembl_havana 13.0 ENS0001433 ENS0001433 50D66 +20 44910060 44959035 + protein_coding ENS0000468 0CF64 ensembl_havana 13.0 ENS0000468 ENS0000468 0CF64 +20 44942130 44960397 - protein_coding ENS0000139 4F16D ensembl_havana 8.0 ENS0000139 ENS0000139 4F16D +20 44963794 44966402 - lncRNA ENS0000393 D004C havana 1.0 ENS0000393 ENS0000393 D004C +20 44966479 45080021 + protein_coding ENS0000314 AAAAC ensembl_havana 12.0 ENS0000314 ENS0000314 AAAAC +20 45004348 45004454 + misc_RNA ENS0001234 FA825 ensembl 1.0 ENS0001234 ENS0001234 FA825 +20 45091214 45101127 - protein_coding ENS0001268 D13A9 ensembl_havana 9.0 ENS0001268 ENS0001268 D13A9 +20 45109452 45115172 - protein_coding ENS0000993 09836 ensembl_havana 11.0 ENS0000993 ENS0000993 09836 +20 45123425 45124465 - protein_coding ENS0000362 46543 ensembl_havana 5.0 ENS0000362 ENS0000362 46543 +20 45137706 45138976 - transcribed_unprocessed_pseudogene ENS0000591 C6432 havana 1.0 ENS0000591 ENS0000591 C6432 +20 45174902 45176544 + protein_coding ENS0001444 9413A ensembl_havana 5.0 ENS0001444 ENS0001444 9413A +20 45207033 45209768 + protein_coding ENS0000771 A0BBD ensembl_havana 12.0 ENS0000771 ENS0000771 A0BBD +20 45221373 45224458 + protein_coding ENS0000889 ED9EA ensembl_havana 7.0 ENS0000889 ENS0000889 ED9EA +20 45229817 45230191 + unprocessed_pseudogene ENS0000630 F4F38 havana 1.0 ENS0000630 ENS0000630 F4F38 +20 45230868 45233331 + transcribed_unprocessed_pseudogene ENS0001501 11A50 havana 3.0 ENS0001501 ENS0001501 11A50 +20 45252239 45254564 - protein_coding ENS0000808 75E9C ensembl_havana 5.0 ENS0000808 ENS0000808 75E9C +20 45293445 45308529 - protein_coding ENS0000501 4E8F7 ensembl_havana 15.0 ENS0000501 ENS0000501 4E8F7 +20 45306840 45317824 + protein_coding ENS0001485 C4645 ensembl_havana 11.0 ENS0001485 ENS0001485 C4645 +20 45325288 45348424 - protein_coding ENS0000929 DC215 ensembl_havana 6.0 ENS0000929 ENS0000929 DC215 +20 45345115 45345823 - lncRNA ENS0000042 C720C havana 1.0 ENS0000042 ENS0000042 C720C +20 45361937 45376798 + protein_coding ENS0001280 9DA61 ensembl_havana 10.0 ENS0001280 ENS0001280 9DA61 +20 45363200 45410610 + protein_coding ENS0001338 F1E23 havana 5.0 ENS0001338 ENS0001338 F1E23 +20 45372557 45407889 - protein_coding ENS0000710 1A526 ensembl_havana 11.0 ENS0000710 ENS0000710 1A526 +20 45388347 45388705 - processed_pseudogene ENS0000062 38896 havana 1.0 ENS0000062 ENS0000062 38896 +20 45406057 45410610 + protein_coding ENS0000798 80E02 ensembl_havana 8.0 ENS0000798 ENS0000798 80E02 +20 45416084 45456934 + protein_coding ENS0000240 84430 ensembl_havana 18.0 ENS0000240 ENS0000240 84430 +20 45425510 45425573 + miRNA ENS0000990 E8715 mirbase 1.0 ENS0000990 ENS0000990 E8715 +20 45435272 45448325 - lncRNA ENS0000897 B049C havana 1.0 ENS0000897 ENS0000897 B049C +20 45469753 45481532 + protein_coding ENS0000834 ABDE2 ensembl_havana 18.0 ENS0000834 ENS0000834 ABDE2 +20 45487613 45490248 + lncRNA ENS0001194 66435 havana 1.0 ENS0001194 ENS0001194 66435 +20 45490818 45491671 - processed_pseudogene ENS0000348 6390D havana 1.0 ENS0000348 ENS0000348 6390D +20 45512461 45515622 - protein_coding ENS0001545 C9D35 ensembl_havana 8.0 ENS0001545 ENS0001545 C9D35 +20 45534196 45547662 - protein_coding ENS0000873 4800B ensembl_havana 2.0 ENS0000873 ENS0000873 4800B +20 45534196 45539482 - protein_coding ENS0000185 256DC ensembl_havana 9.0 ENS0000185 ENS0000185 256DC +20 45536796 45537014 - processed_pseudogene ENS0000665 A7C8B havana 1.0 ENS0000665 ENS0000665 A7C8B +20 45539983 45540227 - processed_pseudogene ENS0000129 1F49A havana 1.0 ENS0000129 ENS0000129 1F49A +20 45540626 45547752 - protein_coding ENS0001441 259B5 ensembl_havana 14.0 ENS0001441 ENS0001441 259B5 +20 45551153 45579326 - protein_coding ENS0000214 AB567 ensembl_havana 12.0 ENS0000214 ENS0000214 AB567 +20 45559758 45559876 - rRNA_pseudogene ENS0000596 7B3C9 ensembl 1.0 ENS0000596 ENS0000596 7B3C9 +20 45567990 45568807 - processed_pseudogene ENS0000862 EBF20 havana 1.0 ENS0000862 ENS0000862 EBF20 +20 45607939 45631284 - protein_coding ENS0000245 9B1D9 ensembl_havana 4.0 ENS0000245 ENS0000245 9B1D9 +20 45629739 45631196 + protein_coding ENS0001302 C0082 ensembl_havana 5.0 ENS0001302 ENS0001302 C0082 +20 45640486 45641370 - processed_pseudogene ENS0001395 716F7 havana 3.0 ENS0001395 ENS0001395 716F7 +20 45648563 45670270 - protein_coding ENS0000922 868F5 ensembl_havana 11.0 ENS0000922 ENS0000922 868F5 +20 45684653 45705019 - protein_coding ENS0001237 F0BE2 ensembl_havana 9.0 ENS0001237 ENS0001237 F0BE2 +20 45702038 45708817 + protein_coding ENS0001042 4D160 ensembl_havana 5.0 ENS0001042 ENS0001042 4D160 +20 45705102 45705180 - miRNA ENS0000138 B3C4A mirbase 1.0 ENS0000138 ENS0000138 B3C4A +20 45722347 45725830 + protein_coding ENS0000172 CDFA6 ensembl_havana 4.0 ENS0000172 ENS0000172 CDFA6 +20 45745737 45746287 - processed_pseudogene ENS0000386 F7E9E havana 2.0 ENS0000386 ENS0000386 F7E9E +20 45747944 45791932 - protein_coding ENS0000649 25D8D ensembl_havana 19.0 ENS0000649 ENS0000649 25D8D +20 45749920 45750074 + processed_pseudogene ENS0001136 95506 havana 3.0 ENS0001136 ENS0001136 95506 +20 45778698 45778823 + snRNA ENS0000250 052C1 ensembl 1.0 ENS0000250 ENS0000250 052C1 +20 45791954 45811427 + protein_coding ENS0001427 D30F5 ensembl_havana 13.0 ENS0001427 ENS0001427 D30F5 +20 45812576 45816957 + protein_coding ENS0001034 F4C50 ensembl_havana 17.0 ENS0001034 ENS0001034 F4C50 +20 45823214 45833745 - protein_coding ENS0001286 BBD76 ensembl_havana 10.0 ENS0001286 ENS0001286 BBD76 +20 45833799 45843276 + protein_coding ENS0001114 7877A ensembl_havana 19.0 ENS0001114 ENS0001114 7877A +20 45841721 45857405 - protein_coding ENS0000027 FB25D ensembl_havana 17.0 ENS0000027 ENS0000027 FB25D +20 45847630 45847940 - misc_RNA ENS0000383 F10BA ensembl 1.0 ENS0000383 ENS0000383 F10BA +20 45857614 45879122 + protein_coding ENS0001048 D4FEC ensembl_havana 7.0 ENS0001048 ENS0001048 D4FEC +20 45881227 45885266 + protein_coding ENS0000733 BBC16 ensembl_havana 4.0 ENS0000733 ENS0000733 BBC16 +20 45886489 45887635 - protein_coding ENS0001536 50008 ensembl_havana 4.0 ENS0001536 ENS0001536 50008 +20 45888625 45891287 - protein_coding ENS0000232 A7821 ensembl_havana 6.0 ENS0000232 ENS0000232 A7821 +20 45890144 45898824 + protein_coding ENS0000863 DC2D8 ensembl_havana 20.0 ENS0000863 ENS0000863 DC2D8 +20 45892694 45893419 - lncRNA ENS0000980 85217 havana 1.0 ENS0000980 ENS0000980 85217 +20 45898621 45912155 - protein_coding ENS0000650 E990A ensembl_havana 15.0 ENS0000650 ENS0000650 E990A +20 45926518 45935055 - lncRNA ENS0000865 B4AE0 havana 1.0 ENS0000865 ENS0000865 B4AE0 +20 45934683 45948023 + protein_coding ENS0000448 624F6 ensembl_havana 12.0 ENS0000448 ENS0000448 624F6 +20 45948660 45972203 - protein_coding ENS0000797 2AAF3 ensembl_havana 8.0 ENS0000797 ENS0000797 2AAF3 +20 45975249 45976075 - processed_pseudogene ENS0000597 26B73 havana 1.0 ENS0000597 ENS0000597 26B73 +20 46008908 46016561 + protein_coding ENS0000851 80650 ensembl_havana 7.0 ENS0000851 ENS0000851 80650 +20 46013500 46022073 - lncRNA ENS0000365 CE08C havana 6.0 ENS0000365 ENS0000365 CE08C +20 46021690 46060150 + protein_coding ENS0000642 366F5 ensembl_havana 14.0 ENS0000642 ENS0000642 366F5 +20 46060991 46089962 - protein_coding ENS0001125 9547B ensembl_havana 12.0 ENS0001125 ENS0001125 9547B +20 46099518 46100176 + processed_pseudogene ENS0001494 F54C2 havana 3.0 ENS0001494 ENS0001494 F54C2 +20 46118278 46129863 + protein_coding ENS0000140 C2622 ensembl_havana 14.0 ENS0000140 ENS0000140 C2622 +20 46168404 46171237 - lncRNA ENS0000266 65FD1 havana 1.0 ENS0000266 ENS0000266 65FD1 +20 46173739 46308498 - protein_coding ENS0001304 62810 ensembl_havana 11.0 ENS0001304 ENS0001304 62810 +20 46345980 46364458 - protein_coding ENS0000030 F686C ensembl_havana 15.0 ENS0000030 ENS0000030 F686C +20 46364551 46397994 + lncRNA ENS0000443 4DC7E havana 2.0 ENS0000443 ENS0000443 4DC7E +20 46366050 46432985 - protein_coding ENS0000676 52CD3 ensembl_havana 18.0 ENS0000676 ENS0000676 52CD3 +20 46414228 46459276 - transcribed_unitary_pseudogene ENS0000102 CD504 ensembl_havana 5.0 ENS0000102 ENS0000102 CD504 +20 46455033 46496731 - lncRNA ENS0000837 FAAF3 havana 1.0 ENS0000837 ENS0000837 FAAF3 +20 46463719 46477232 - transcribed_processed_pseudogene ENS0000032 4AD49 havana 2.0 ENS0000032 ENS0000032 4AD49 +20 46484461 46492640 + unprocessed_pseudogene ENS0000777 2F76A havana 10.0 ENS0000777 ENS0000777 2F76A +20 46499630 46513559 - protein_coding ENS0000911 AF720 ensembl_havana 11.0 ENS0000911 ENS0000911 AF720 +20 46540946 46550654 - protein_coding ENS0001124 F281B ensembl_havana 3.0 ENS0001124 ENS0001124 F281B +20 46557823 46684467 - protein_coding ENS0001108 4E8D0 ensembl_havana 14.0 ENS0001108 ENS0001108 4E8D0 +20 46681676 46682375 - lncRNA ENS0001224 96D7E havana 1.0 ENS0001224 ENS0001224 96D7E +20 46684365 46689444 - protein_coding ENS0000428 A128B ensembl_havana 6.0 ENS0000428 ENS0000428 A128B +20 46689659 46690289 + lncRNA ENS0000198 E485A havana 1.0 ENS0000198 ENS0000198 E485A +20 46709649 46736347 + protein_coding ENS0000411 1347B ensembl_havana 6.0 ENS0000411 ENS0000411 1347B +20 46859333 46859672 + misc_RNA ENS0000052 144C3 ensembl 1.0 ENS0000052 ENS0000052 144C3 +20 46863563 46864104 - processed_pseudogene ENS0001325 7D1C9 havana 1.0 ENS0001325 ENS0001325 7D1C9 +20 46894624 47188844 + protein_coding ENS0000311 2B299 ensembl_havana 19.0 ENS0000311 ENS0000311 2B299 +20 46901143 46901726 - lncRNA ENS0000564 96D05 havana 1.0 ENS0000564 ENS0000564 96D05 +20 46997596 46998488 + processed_pseudogene ENS0001175 5C8AD havana 1.0 ENS0001175 ENS0001175 5C8AD +20 47071865 47072236 + processed_pseudogene ENS0000410 35F12 havana 1.0 ENS0000410 ENS0000410 35F12 +20 47160838 47161700 - processed_pseudogene ENS0000119 6DDAB havana 2.0 ENS0000119 ENS0000119 6DDAB +20 47166967 47167058 + miRNA ENS0001190 8DB1B mirbase 1.0 ENS0001190 ENS0001190 8DB1B +20 47209214 47356889 - protein_coding ENS0000802 A0104 ensembl_havana 19.0 ENS0000802 ENS0000802 A0104 +20 47318502 47320754 + lncRNA ENS0000693 F1E27 havana 2.0 ENS0000693 ENS0000693 F1E27 +20 47348137 47349142 - lncRNA ENS0000595 AEE1B havana 1.0 ENS0000595 ENS0000595 AEE1B +20 47352561 47354633 + lncRNA ENS0000318 E1080 havana 2.0 ENS0000318 ENS0000318 E1080 +20 47391925 47412327 - lncRNA ENS0000079 45179 havana 2.0 ENS0000079 ENS0000079 45179 +20 47478214 47479010 - processed_pseudogene ENS0001312 5686E havana 1.0 ENS0001312 ENS0001312 5686E +20 47484278 47484381 - snRNA ENS0000628 8D8FE ensembl 1.0 ENS0000628 ENS0000628 8D8FE +20 47492978 47493085 - snRNA ENS0001439 D276D ensembl 1.0 ENS0001439 ENS0001439 D276D +20 47501887 47656877 + protein_coding ENS0000388 3D5CB ensembl_havana 19.0 ENS0000388 ENS0000388 3D5CB +20 47630266 47631438 - processed_pseudogene ENS0000509 EFD8B havana 1.0 ENS0000509 ENS0000509 EFD8B +20 47656348 47786616 - protein_coding ENS0001465 3624A ensembl_havana 14.0 ENS0001465 ENS0001465 3624A +20 47677901 47686297 + lncRNA ENS0000208 7CC84 havana 2.0 ENS0000208 ENS0000208 7CC84 +20 47826969 47827044 - snRNA ENS0001130 947D5 ensembl 1.0 ENS0001130 ENS0001130 947D5 +20 47859788 47860672 + processed_pseudogene ENS0000167 4F5B1 havana 3.0 ENS0000167 ENS0000167 4F5B1 +20 47873191 47873273 + rRNA_pseudogene ENS0000339 13E55 ensembl 1.0 ENS0000339 ENS0000339 13E55 +20 47904348 47904423 + snRNA ENS0001333 D4CE6 ensembl 1.0 ENS0001333 ENS0001333 D4CE6 +20 47950797 47954808 + lncRNA ENS0000427 DEEED havana 2.0 ENS0000427 ENS0000427 DEEED +20 47958022 47958603 + lncRNA ENS0001071 71C4E havana 1.0 ENS0001071 ENS0001071 71C4E +20 47974137 47977584 + lncRNA ENS0000852 83DA1 havana_tagene 1.0 ENS0000852 ENS0000852 83DA1 +20 47976739 47990127 - lncRNA ENS0000744 FF7E0 havana 3.0 ENS0000744 ENS0000744 FF7E0 +20 47983404 47984313 + lncRNA ENS0000867 11112 havana 1.0 ENS0000867 ENS0000867 11112 +20 48007875 48022237 - lncRNA ENS0001461 EB39B havana 1.0 ENS0001461 ENS0001461 EB39B +20 48025245 48060271 + lncRNA ENS0000118 18133 havana_tagene 1.0 ENS0000118 ENS0000118 18133 +20 48073869 48074188 + lncRNA ENS0001267 D9559 havana 1.0 ENS0001267 ENS0001267 D9559 +20 48120269 48136844 + lncRNA ENS0000224 7602B havana_tagene 1.0 ENS0000224 ENS0000224 7602B +20 48264969 48275392 - lncRNA ENS0000389 E6BDB havana_tagene 1.0 ENS0000389 ENS0000389 E6BDB +20 48324812 48367976 - lncRNA ENS0001218 E7835 havana 1.0 ENS0001218 ENS0001218 E7835 +20 48359884 48370636 + lncRNA ENS0000220 9E872 ensembl_havana 9.0 ENS0000220 ENS0000220 9E872 +20 48383323 48384895 - lncRNA ENS0001031 56C7F havana 1.0 ENS0001031 ENS0001031 56C7F +20 48406777 48410716 + lncRNA ENS0000489 A2411 havana 2.0 ENS0000489 ENS0000489 A2411 +20 48471308 48472414 + lncRNA ENS0001038 D60EC havana 1.0 ENS0001038 ENS0001038 D60EC +20 48476999 48477553 + lncRNA ENS0001208 AC3BF havana 1.0 ENS0001208 ENS0001208 AC3BF +20 48494234 48494309 - snRNA ENS0000513 3EA92 ensembl 1.0 ENS0000513 ENS0000513 3EA92 +20 48624252 48827999 - protein_coding ENS0001123 F8B24 ensembl_havana 14.0 ENS0001123 ENS0001123 F8B24 +20 48657073 48659119 + lncRNA ENS0000180 19669 havana 1.0 ENS0000180 ENS0000180 19669 +20 48821688 48849458 - lncRNA ENS0000643 F662B havana 1.0 ENS0000643 ENS0000643 F662B +20 48921711 49036693 + protein_coding ENS0000144 7C256 ensembl_havana 9.0 ENS0000144 ENS0000144 7C256 +20 49038357 49038602 - processed_pseudogene ENS0000312 B246A havana 1.0 ENS0000312 ENS0000312 B246A +20 49040463 49046168 - lncRNA ENS0000882 BAEA8 havana 6.0 ENS0000882 ENS0000882 BAEA8 +20 49046246 49096960 + protein_coding ENS0000655 D3620 ensembl_havana 17.0 ENS0000655 ENS0000655 D3620 +20 49113339 49188367 - protein_coding ENS0001539 3DAE2 ensembl_havana 20.0 ENS0001539 ENS0001539 3DAE2 +20 49134480 49135012 + processed_pseudogene ENS0000265 2FDC8 havana 1.0 ENS0000265 ENS0000265 2FDC8 +20 49219295 49244077 + protein_coding ENS0000760 14997 ensembl_havana 14.0 ENS0000760 ENS0000760 14997 +20 49237946 49278426 - protein_coding ENS0000621 494BD ensembl_havana 15.0 ENS0000621 ENS0000621 494BD +20 49278178 49299600 + lncRNA ENS0000101 61FFF ensembl_havana 13.0 ENS0000101 ENS0000101 61FFF +20 49278427 49278624 + misc_RNA ENS0000359 F9641 ensembl 1.0 ENS0000359 ENS0000359 F9641 +20 49278940 49279028 + snoRNA ENS0001549 6BCC1 ensembl 1.0 ENS0001549 ENS0001549 6BCC1 +20 49279116 49279208 + misc_RNA ENS0001061 3A3E0 ensembl 1.0 ENS0001061 ENS0001061 3A3E0 +20 49280319 49280409 + snoRNA ENS0000879 D7230 ensembl 1.0 ENS0000879 ENS0000879 D7230 +20 49280485 49280571 + misc_RNA ENS0000467 8ABEA ensembl 1.0 ENS0000467 ENS0000467 8ABEA +20 49280683 49280772 + snoRNA ENS0000519 83636 ensembl 1.0 ENS0000519 ENS0000519 83636 +20 49293394 49484297 - protein_coding ENS0001249 3572C ensembl_havana 10.0 ENS0001249 ENS0001249 3572C +20 49318337 49441654 + transcribed_unprocessed_pseudogene ENS0001260 DF83D havana 1.0 ENS0001260 ENS0001260 DF83D +20 49503874 49568137 - protein_coding ENS0001170 B03A4 ensembl_havana 6.0 ENS0001170 ENS0001170 B03A4 +20 49632945 49713878 - protein_coding ENS0000909 3995A ensembl_havana 5.0 ENS0000909 ENS0000909 3995A +20 49721949 49722247 + misc_RNA ENS0001472 BDB1D ensembl 1.0 ENS0001472 ENS0001472 BDB1D +20 49729591 49729842 - processed_pseudogene ENS0000579 C3B50 havana 1.0 ENS0000579 ENS0000579 C3B50 +20 49794811 49794914 + snRNA ENS0000651 1C804 ensembl 1.0 ENS0000651 ENS0000651 1C804 +20 49812713 49892242 + protein_coding ENS0000738 D945B ensembl_havana 12.0 ENS0000738 ENS0000738 D945B +20 49829125 49831085 + processed_pseudogene ENS0000614 EFC57 havana 1.0 ENS0000614 ENS0000614 EFC57 +20 49903391 49915529 - protein_coding ENS0000565 2AD96 ensembl_havana 11.0 ENS0000565 ENS0000565 2AD96 +20 49920597 49920694 - misc_RNA ENS0000939 5AA96 ensembl 1.0 ENS0000939 ENS0000939 5AA96 +20 49932795 49933078 + misc_RNA ENS0000729 17B66 ensembl 2.0 ENS0000729 ENS0000729 17B66 +20 49936336 49953885 + protein_coding ENS0001140 D7916 ensembl_havana 11.0 ENS0001140 ENS0001140 D7916 +20 49956745 49958032 + processed_pseudogene ENS0000525 62637 havana 1.0 ENS0000525 ENS0000525 62637 +20 49972106 49972212 + snRNA ENS0000487 41AA7 ensembl 1.0 ENS0000487 ENS0000487 41AA7 +20 49982980 49988886 + protein_coding ENS0000673 431AF ensembl_havana 4.0 ENS0000673 ENS0000673 431AF +20 50040716 50041504 - lncRNA ENS0000900 44B21 havana 1.0 ENS0000900 ENS0000900 44B21 +20 50081124 50153637 - protein_coding ENS0000918 E2CFF ensembl_havana 16.0 ENS0000918 ENS0000918 E2CFF +20 50081124 50115959 - protein_coding ENS0000077 AB7C6 ensembl_havana 12.0 ENS0000077 ENS0000077 AB7C6 +20 50118254 50153734 - protein_coding ENS0000699 C85FB ensembl_havana 11.0 ENS0000699 ENS0000699 C85FB +20 50162765 50166216 - lncRNA ENS0001310 1D1A4 havana 5.0 ENS0001310 ENS0001310 1D1A4 +20 50166362 50171742 + lncRNA ENS0000610 16421 havana 1.0 ENS0000610 ENS0000610 16421 +20 50171809 50176676 + lncRNA ENS0000435 070D8 havana_tagene 6.0 ENS0000435 ENS0000435 070D8 +20 50184598 50191498 - lncRNA ENS0001172 4CD30 havana 1.0 ENS0001172 ENS0001172 4CD30 +20 50190830 50192668 + protein_coding ENS0000570 335B5 ensembl_havana 6.0 ENS0000570 ENS0000570 335B5 +20 50247643 50283250 + lncRNA ENS0000960 FD65A ensembl_havana 7.0 ENS0000960 ENS0000960 FD65A +20 50292709 50315488 + lncRNA ENS0001385 98A5C havana 9.0 ENS0001385 ENS0001385 98A5C +20 50310711 50321342 - lncRNA ENS0000349 6D0F8 havana 1.0 ENS0000349 ENS0000349 6D0F8 +20 50429190 50429483 - misc_RNA ENS0000476 99B80 ensembl 3.0 ENS0000476 ENS0000476 99B80 +20 50479767 50479991 + processed_pseudogene ENS0001331 4D7F7 havana 1.0 ENS0001331 ENS0001331 4D7F7 +20 50510321 50585241 + protein_coding ENS0000590 4F09D ensembl_havana 10.0 ENS0000590 ENS0000590 4F09D +20 50557682 50557998 - misc_RNA ENS0000974 D36DE ensembl 3.0 ENS0000974 ENS0000974 D36DE +20 50563009 50563110 + misc_RNA ENS0000356 2972B ensembl 1.0 ENS0000356 ENS0000356 2972B +20 50570975 50578041 - lncRNA ENS0000236 449D7 havana 1.0 ENS0000236 ENS0000236 449D7 +20 50585786 50585879 + miRNA ENS0001107 0B0A3 mirbase 1.0 ENS0001107 ENS0001107 0B0A3 +20 50586108 50691542 - protein_coding ENS0001537 2258A ensembl_havana 13.0 ENS0001537 ENS0001537 2258A +20 50614636 50614785 - miRNA ENS0000765 D57A2 mirbase 1.0 ENS0000765 ENS0000765 D57A2 +20 50620763 50621077 + processed_pseudogene ENS0000105 70991 havana 1.0 ENS0000105 ENS0000105 70991 +20 50645471 50662275 + lncRNA ENS0000455 FE1A9 havana 1.0 ENS0000455 ENS0000455 FE1A9 +20 50731580 50756795 + protein_coding ENS0000447 B837E ensembl_havana 9.0 ENS0000447 ENS0000447 B837E +20 50794894 50882676 + protein_coding ENS0000222 362E6 ensembl_havana 17.0 ENS0000222 ENS0000222 362E6 +20 50840615 50840749 - processed_pseudogene ENS0000735 D3744 havana 1.0 ENS0000735 ENS0000735 D3744 +20 50888916 50931437 - protein_coding ENS0000391 94AC8 ensembl_havana 18.0 ENS0000391 ENS0000391 94AC8 +20 50913731 50914411 + processed_pseudogene ENS0001066 7FAFF havana 1.0 ENS0001066 ENS0001066 7FAFF +20 50930984 50945148 + lncRNA ENS0000461 DAF78 havana 4.0 ENS0000461 ENS0000461 DAF78 +20 50934867 50958555 - protein_coding ENS0001054 C63F0 ensembl_havana 13.0 ENS0001054 ENS0001054 C63F0 +20 50958818 50963929 + protein_coding ENS0000342 7143F ensembl_havana 5.0 ENS0000342 ENS0000342 7143F +20 50999370 51010019 + lncRNA ENS0000201 B5556 havana 1.0 ENS0000201 ENS0000201 B5556 +20 51003656 51023107 - protein_coding ENS0000828 6DAC1 ensembl_havana 14.0 ENS0000828 ENS0000828 6DAC1 +20 51098138 51098845 + TEC ENS0000704 312E8 havana 1.0 ENS0000704 ENS0000704 312E8 +20 51130788 51131667 - processed_pseudogene ENS0000007 B033F havana 2.0 ENS0000007 ENS0000007 B033F +20 51386957 51562831 - protein_coding ENS0001258 4CB90 ensembl_havana 20.0 ENS0001258 ENS0001258 4CB90 +20 51452905 51452977 - miRNA ENS0000337 4BB15 mirbase 1.0 ENS0000337 ENS0000337 4BB15 +20 51596514 51768390 - protein_coding ENS0000633 A2BD7 ensembl_havana 14.0 ENS0000633 ENS0000633 A2BD7 +20 51734533 51735067 + processed_pseudogene ENS0000987 C8AEA havana 2.0 ENS0000987 ENS0000987 C8AEA +20 51782331 51802521 - protein_coding ENS0000357 94E19 ensembl_havana 13.0 ENS0000357 ENS0000357 94E19 +20 51831797 51862912 - lncRNA ENS0000827 97A7D havana 1.0 ENS0000827 ENS0000827 97A7D +20 51902291 51902395 + snRNA ENS0000301 6C580 ensembl 1.0 ENS0000301 ENS0000301 6C580 +20 51904969 51905030 + snRNA ENS0001469 82697 ensembl 1.0 ENS0001469 ENS0001469 82697 +20 52051663 52204308 - protein_coding ENS0000110 FC149 ensembl_havana 20.0 ENS0000110 ENS0000110 FC149 +20 52072685 52096480 + lncRNA ENS0001347 E239D havana_tagene 1.0 ENS0001347 ENS0001347 E239D +20 52110344 52111049 - processed_pseudogene ENS0000672 DCB1E havana 1.0 ENS0000672 ENS0000672 DCB1E +20 52192159 52670578 + lncRNA ENS0000054 A370D havana 2.0 ENS0000054 ENS0000054 A370D +20 52451464 52455619 - lncRNA ENS0001327 B6827 havana 1.0 ENS0001327 ENS0001327 B6827 +20 52482801 52482991 - misc_RNA ENS0000608 581DB ensembl 1.0 ENS0000608 ENS0000608 581DB +20 52487922 52500591 - lncRNA ENS0000663 CD46F havana 1.0 ENS0000663 ENS0000663 CD46F +20 52511252 52511568 + processed_pseudogene ENS0000776 FD23B havana 1.0 ENS0000776 ENS0000776 FD23B +20 52671735 52699472 + lncRNA ENS0001201 73805 havana 6.0 ENS0001201 ENS0001201 73805 +20 52840936 52843775 - lncRNA ENS0001311 7B7DE havana 1.0 ENS0001311 ENS0001311 7B7DE +20 52858338 52862855 - lncRNA ENS0000868 502A1 havana 1.0 ENS0000868 ENS0000868 502A1 +20 52890162 52890386 + processed_pseudogene ENS0001392 79334 havana 1.0 ENS0001392 ENS0001392 79334 +20 52972358 53495330 + protein_coding ENS0000481 4FE29 ensembl_havana 16.0 ENS0000481 ENS0000481 4FE29 +20 53097652 53097984 + misc_RNA ENS0000217 E008E ensembl 1.0 ENS0000217 ENS0000217 E008E +20 53137293 53141169 - lncRNA ENS0000701 B8EEC havana_tagene 1.0 ENS0000701 ENS0000701 B8EEC +20 53168643 53179550 - lncRNA ENS0000436 C4CD3 havana 1.0 ENS0000436 ENS0000436 C4CD3 +20 53196229 53208705 - lncRNA ENS0000419 ABC1B havana 1.0 ENS0000419 ENS0000419 ABC1B +20 53255979 53487274 - lncRNA ENS0001098 47902 havana 1.0 ENS0001098 ENS0001098 47902 +20 53397661 53412910 - lncRNA ENS0000666 07346 havana 2.0 ENS0000666 ENS0000666 07346 +20 53421199 53421656 + processed_pseudogene ENS0001471 BBA89 havana 1.0 ENS0001471 ENS0001471 BBA89 +20 53453058 53504314 - lncRNA ENS0000035 DBB0F havana 1.0 ENS0000035 ENS0000035 DBB0F +20 53544615 53544725 + misc_RNA ENS0001375 1A227 ensembl 1.0 ENS0001375 ENS0001375 1A227 +20 53552770 53575863 + lncRNA ENS0000943 ABC14 havana 6.0 ENS0000943 ENS0000943 ABC14 +20 53567065 53609907 - protein_coding ENS0001256 DE38A ensembl_havana 13.0 ENS0001256 ENS0001256 DE38A +20 53608354 53634590 + lncRNA ENS0000823 A350E havana 1.0 ENS0000823 ENS0000823 A350E +20 53668697 53668758 - snRNA ENS0000351 DD63E ensembl 1.0 ENS0000351 ENS0000351 DD63E +20 53738728 53741797 + lncRNA ENS0000856 2ED59 havana_tagene 1.0 ENS0000856 ENS0000856 2ED59 +20 53799823 53827297 - lncRNA ENS0000092 E67FF havana 2.0 ENS0000092 ENS0000092 E67FF +20 53875252 53875557 - processed_pseudogene ENS0000709 D8879 ensembl_havana 1.0 ENS0000709 ENS0000709 D8879 +20 53936777 54070594 - protein_coding ENS0000440 12B4E ensembl_havana 13.0 ENS0000440 ENS0000440 12B4E +20 53940160 53942508 + lncRNA ENS0000758 2ED17 havana 1.0 ENS0000758 ENS0000758 2ED17 +20 54068408 54068485 - miRNA ENS0000881 E745B mirbase 1.0 ENS0000881 ENS0000881 E745B +20 54153446 54173986 - protein_coding ENS0000022 3A3A1 ensembl_havana 10.0 ENS0000022 ENS0000022 3A3A1 +20 54178414 54183129 + lncRNA ENS0001438 789BB havana_tagene 1.0 ENS0001438 ENS0001438 789BB +20 54208087 54228052 + protein_coding ENS0000813 0F235 ensembl_havana 10.0 ENS0000813 ENS0000813 0F235 +20 54232758 54269542 + lncRNA ENS0001016 460BC havana 1.0 ENS0001016 ENS0001016 460BC +20 54475593 54651169 + protein_coding ENS0000099 53DA9 ensembl_havana 12.0 ENS0000099 ENS0000099 53DA9 +20 54859688 54859812 - snRNA ENS0001506 3AE72 ensembl 1.0 ENS0001506 ENS0001506 3AE72 +20 55074644 55075140 + processed_pseudogene ENS0001135 0F9B7 havana 10.0 ENS0001135 ENS0001135 0F9B7 +20 55408898 55684915 + lncRNA ENS0000653 3B0DE havana 8.0 ENS0000653 ENS0000653 3B0DE +20 55420336 55427197 - lncRNA ENS0000555 0B07F havana 1.0 ENS0000555 ENS0000555 0B07F +20 55504906 55505215 - lncRNA ENS0001470 85BAF havana 1.0 ENS0001470 ENS0001470 85BAF +20 55607852 55744938 - lncRNA ENS0000226 580F9 havana_tagene 1.0 ENS0000226 ENS0000226 580F9 +20 55775176 55781231 + lncRNA ENS0000313 40A4F havana 2.0 ENS0000313 ENS0000313 40A4F +20 55997357 56005519 - protein_coding ENS0000071 D085F ensembl_havana 4.0 ENS0000071 ENS0000071 D085F +20 56205052 56205173 - rRNA_pseudogene ENS0000559 D643E ensembl 1.0 ENS0000559 ENS0000559 D643E +20 56248732 56249815 + protein_coding ENS0001523 CC6C9 ensembl_havana 4.0 ENS0001523 ENS0001523 CC6C9 +20 56267872 56269231 + processed_pseudogene ENS0000465 16E39 havana 1.0 ENS0000465 ENS0000465 16E39 +20 56295967 56299160 - lncRNA ENS0000200 CAFD4 havana 1.0 ENS0000200 ENS0000200 CAFD4 +20 56358974 56368663 + protein_coding ENS0000018 39E49 ensembl_havana 10.0 ENS0000018 ENS0000018 39E49 +20 56369389 56392337 - protein_coding ENS0000034 6A669 ensembl_havana 18.0 ENS0000034 ENS0000034 6A669 +20 56392371 56406362 + protein_coding ENS0000516 5DA03 ensembl_havana 12.0 ENS0000516 ENS0000516 5DA03 +20 56412112 56460387 + protein_coding ENS0001068 1FDBC ensembl_havana 16.0 ENS0001068 ENS0001068 1FDBC +20 56459428 56459583 - processed_pseudogene ENS0001386 2C3E8 havana 1.0 ENS0001386 ENS0001386 2C3E8 +20 56468585 56519449 + protein_coding ENS0000691 C40A5 ensembl_havana 13.0 ENS0000691 ENS0000691 C40A5 +20 56491492 56525925 - protein_coding ENS0001222 A58AC ensembl_havana 9.0 ENS0001222 ENS0001222 A58AC +20 56517187 56526152 + protein_coding ENS0000123 9110A ensembl_havana 10.0 ENS0000123 ENS0000123 9110A +20 56522252 56523499 + TEC ENS0001413 8FB0E havana 1.0 ENS0001413 ENS0001413 8FB0E +20 56533246 56536520 + protein_coding ENS0000188 C0D54 ensembl_havana 1.0 ENS0000188 ENS0000188 C0D54 +20 56550605 56555579 + lncRNA ENS0000577 127D7 havana_tagene 1.0 ENS0000577 ENS0000577 127D7 +20 56577563 56598621 + lncRNA ENS0001077 B8322 havana 2.0 ENS0001077 ENS0001077 B8322 +20 56601701 56602113 + processed_pseudogene ENS0001229 68578 havana 3.0 ENS0001229 ENS0001229 68578 +20 56629306 56639283 + protein_coding ENS0000791 BFCCB ensembl_havana 7.0 ENS0000791 ENS0000791 BFCCB +20 56667430 56667531 + snRNA ENS0000306 ED6B1 ensembl 1.0 ENS0000306 ENS0000306 ED6B1 +20 56688401 56688695 + misc_RNA ENS0000992 931F4 ensembl 3.0 ENS0000992 ENS0000992 931F4 +20 56700837 56701514 - processed_pseudogene ENS0000575 77FCD havana 1.0 ENS0000575 ENS0000575 77FCD +20 56730397 56731460 + lncRNA ENS0001293 69B20 havana 2.0 ENS0001293 ENS0001293 69B20 +20 56785981 56786087 - snRNA ENS0000463 3324E ensembl 1.0 ENS0000463 ENS0000463 3324E +20 57105741 57107128 + lncRNA ENS0000640 908DF havana 1.0 ENS0000640 ENS0000640 908DF +20 57110640 57122745 + lncRNA ENS0001351 11E4E havana 3.0 ENS0001351 ENS0001351 11E4E +20 57116566 57119543 - lncRNA ENS0000589 49251 havana_tagene 1.0 ENS0000589 ENS0000589 49251 +20 57161822 57163775 + TEC ENS0000142 CF324 havana 1.0 ENS0000142 ENS0000142 CF324 +20 57168753 57266641 - protein_coding ENS0000082 88FAD ensembl_havana 13.0 ENS0000082 ENS0000082 88FAD +20 57214872 57215866 + lncRNA ENS0000763 E14C8 havana 1.0 ENS0000763 ENS0000763 E14C8 +20 57266797 57282998 + lncRNA ENS0001324 8CF5F havana 1.0 ENS0001324 ENS0001324 8CF5F +20 57311854 57311958 + misc_RNA ENS0000810 B78EC ensembl 1.0 ENS0000810 ENS0000810 B78EC +20 57321502 57321591 - miRNA ENS0000472 2F1A5 mirbase 1.0 ENS0000472 ENS0000472 2F1A5 +20 57328546 57328696 + processed_pseudogene ENS0001065 F30F0 havana 1.0 ENS0001065 ENS0001065 F30F0 +20 57329803 57343994 + protein_coding ENS0000195 E061F ensembl_havana 13.0 ENS0000195 ENS0000195 E061F +20 57351223 57379211 + protein_coding ENS0001382 54E07 ensembl_havana 13.0 ENS0001382 ENS0001382 54E07 +20 57357426 57358369 - processed_pseudogene ENS0000033 5F29C havana 1.0 ENS0000033 ENS0000033 5F29C +20 57358447 57359498 - protein_coding ENS0001242 88548 ensembl_havana 3.0 ENS0001242 ENS0001242 88548 +20 57377140 57393062 - lncRNA ENS0000790 522D5 havana 3.0 ENS0000790 ENS0000790 522D5 +20 57391396 57409333 + protein_coding ENS0000857 DC97D ensembl_havana 17.0 ENS0000857 ENS0000857 DC97D +20 57442742 57458298 + lncRNA ENS0000260 64B31 havana 1.0 ENS0000260 ENS0000260 64B31 +20 57488392 57489027 - processed_pseudogene ENS0001520 53276 havana 7.0 ENS0001520 ENS0001520 53276 +20 57495966 57525652 - protein_coding ENS0001120 95528 ensembl_havana 12.0 ENS0001120 ENS0001120 95528 +20 57561080 57568121 + protein_coding ENS0000583 050C6 ensembl_havana 11.0 ENS0000583 ENS0000583 050C6 +20 57599695 57601200 - lncRNA ENS0000160 4583D havana 1.0 ENS0000160 ENS0000160 4583D +20 57603846 57620576 - protein_coding ENS0000962 ACAC6 ensembl_havana 15.0 ENS0000962 ENS0000962 ACAC6 +20 57648392 57711536 - protein_coding ENS0001544 57496 ensembl_havana 16.0 ENS0001544 ENS0001544 57496 +20 57710156 57712780 + lncRNA ENS0000473 FFB14 havana 2.0 ENS0000473 ENS0000473 FFB14 +20 57895394 57895444 + miRNA ENS0001188 91B40 mirbase 1.0 ENS0001188 ENS0001188 91B40 +20 57957126 57960176 - lncRNA ENS0001073 6B3B6 havana 2.0 ENS0001073 ENS0001073 6B3B6 +20 58003904 58004648 + lncRNA ENS0000478 510EB havana 1.0 ENS0000478 ENS0000478 510EB +20 58069054 58072070 - lncRNA ENS0000662 995A8 havana 6.0 ENS0000662 ENS0000662 995A8 +20 58097267 58097591 + processed_pseudogene ENS0000895 6D5B8 havana 1.0 ENS0000895 ENS0000895 6D5B8 +20 58150902 58161150 + protein_coding ENS0001052 060A2 ensembl_havana 6.0 ENS0001052 ENS0001052 060A2 +20 58218495 58228653 - protein_coding ENS0000836 2414D ensembl_havana 5.0 ENS0000836 ENS0000836 2414D +20 58228940 58309451 - transcribed_unprocessed_pseudogene ENS0000056 70ECE havana 17.0 ENS0000056 ENS0000056 70ECE +20 58309715 58367507 + protein_coding ENS0000364 DEC53 ensembl_havana 4.0 ENS0000364 ENS0000364 DEC53 +20 58389229 58451101 + protein_coding ENS0001138 E1E26 ensembl_havana 16.0 ENS0001138 ENS0001138 E1E26 +20 58459101 58515399 - protein_coding ENS0001484 152C3 ensembl_havana 11.0 ENS0001484 ENS0001484 152C3 +20 58515379 58619888 + lncRNA ENS0001477 D84A8 havana 6.0 ENS0001477 ENS0001477 D84A8 +20 58594417 58603973 + lncRNA ENS0000228 2E6C4 havana 1.0 ENS0000228 ENS0000228 2E6C4 +20 58634772 58635738 + lncRNA ENS0000289 F3B7B havana 2.0 ENS0000289 ENS0000289 F3B7B +20 58651253 58679526 + protein_coding ENS0001407 3086F ensembl_havana 22.0 ENS0001407 ENS0001407 3086F +20 58651434 58715410 + protein_coding ENS0000286 DB953 havana 4.0 ENS0000286 ENS0000286 DB953 +20 58689131 58719238 + protein_coding ENS0000644 44B47 ensembl_havana 12.0 ENS0000644 ENS0000644 44B47 +20 58710795 58711633 + lncRNA ENS0000213 57610 havana 1.0 ENS0000213 ENS0000213 57610 +20 58740532 58783821 - unprocessed_pseudogene ENS0000574 27E39 havana 1.0 ENS0000574 ENS0000574 27E39 +20 58754351 58756838 + lncRNA ENS0001171 85C3C havana 1.0 ENS0001171 ENS0001171 85C3C +20 58817132 58817725 - lncRNA ENS0001482 F188D havana 5.0 ENS0001482 ENS0001482 F188D +20 58817615 58817694 - miRNA ENS0000269 E9EE0 mirbase 1.0 ENS0000269 ENS0000269 E9EE0 +20 58818226 58818313 - miRNA ENS0001004 EF9AE mirbase 1.0 ENS0001004 ENS0001004 EF9AE +20 58818919 58850903 - lncRNA ENS0000444 02202 havana 7.0 ENS0000444 ENS0000444 02202 +20 58833809 58836529 - lncRNA ENS0000669 46A3D havana 1.0 ENS0000669 ENS0000669 46A3D +20 58839718 58911192 + protein_coding ENS0000187 B6141 ensembl_havana 27.0 ENS0000187 ENS0000187 B6141 +20 58842030 58842132 + misc_RNA ENS0001152 96831 ensembl 1.0 ENS0001152 ENS0001152 96831 +20 58842433 58842546 + misc_RNA ENS0000093 C28FB ensembl 1.0 ENS0000093 ENS0000093 C28FB +20 58850530 58850641 + misc_RNA ENS0000812 782CC ensembl 1.0 ENS0000812 ENS0000812 782CC +20 58850783 58850903 + misc_RNA ENS0001183 A1743 ensembl 1.0 ENS0001183 ENS0001183 A1743 +20 58863528 58888809 - lncRNA ENS0000846 51ECE havana 8.0 ENS0000846 ENS0000846 51ECE +20 58876592 58876981 - lncRNA ENS0000905 68E09 havana 1.0 ENS0000905 ENS0000905 68E09 +20 58916000 58981169 - lncRNA ENS0000721 590CA havana 1.0 ENS0000721 ENS0000721 590CA +20 58948089 58948267 - unprocessed_pseudogene ENS0001315 2BA89 havana 1.0 ENS0001315 ENS0001315 2BA89 +20 58981208 58995133 + protein_coding ENS0000229 D5500 ensembl_havana 15.0 ENS0000229 ENS0000229 D5500 +20 58995185 59007254 - protein_coding ENS0000275 98CF6 ensembl_havana 14.0 ENS0000275 ENS0000275 98CF6 +20 59019429 59026654 + protein_coding ENS0001443 A919F ensembl_havana 4.0 ENS0001443 ENS0001443 A919F +20 59025475 59032345 - protein_coding ENS0000784 ADAF2 ensembl_havana 10.0 ENS0000784 ENS0000784 ADAF2 +20 59033145 59042809 - protein_coding ENS0001530 7D270 ensembl_havana 16.0 ENS0001530 ENS0001530 7D270 +20 59113224 59113631 + processed_pseudogene ENS0001161 12DD5 havana 2.0 ENS0001161 ENS0001161 12DD5 +20 59123381 59259113 + protein_coding ENS0001372 178B8 ensembl_havana 6.0 ENS0001372 ENS0001372 178B8 +20 59300443 59325992 + protein_coding ENS0000774 B569C ensembl_havana 18.0 ENS0000774 ENS0000774 B569C +20 59352195 59357774 - lncRNA ENS0000373 55A9E havana 1.0 ENS0000373 ENS0000373 55A9E +20 59360927 59364773 - lncRNA ENS0000624 7662B havana 1.0 ENS0000624 ENS0000624 7662B +20 59395496 59397109 - lncRNA ENS0000015 DDB58 havana_tagene 1.0 ENS0000015 ENS0000015 DDB58 +20 59460619 59460837 - processed_pseudogene ENS0000956 D7910 havana 3.0 ENS0000956 ENS0000956 D7910 +20 59467755 59469853 - lncRNA ENS0000876 1BA72 havana_tagene 1.0 ENS0000876 ENS0000876 1BA72 +20 59515191 59516039 + lncRNA ENS0001127 7D838 havana 1.0 ENS0001127 ENS0001127 7D838 +20 59577509 59847711 + protein_coding ENS0001069 38378 ensembl_havana 17.0 ENS0001069 ENS0001069 38378 +20 59624904 59628295 - lncRNA ENS0001454 DB026 havana 2.0 ENS0001454 ENS0001454 DB026 +20 59852667 59852728 - snRNA ENS0000692 96A9F ensembl 1.0 ENS0000692 ENS0000692 96A9F +20 59863564 59933655 - protein_coding ENS0000363 36B86 ensembl_havana 13.0 ENS0000363 ENS0000363 36B86 +20 59933764 59948680 + protein_coding ENS0000408 8E8D3 ensembl_havana 11.0 ENS0000408 ENS0000408 8E8D3 +20 59936663 59940305 - protein_coding ENS0000983 E5B2A ensembl_havana 7.0 ENS0000983 ENS0000983 E5B2A +20 59958423 60034011 + protein_coding ENS0000036 59599 ensembl_havana 17.0 ENS0000036 ENS0000036 59599 +20 60055853 60072956 + lncRNA ENS0000766 C719D ensembl_havana 9.0 ENS0000766 ENS0000766 C719D +20 60083360 60083872 - lncRNA ENS0000847 F8C24 havana 1.0 ENS0000847 ENS0000847 F8C24 +20 60087826 60653784 + lncRNA ENS0000264 55A49 havana 6.0 ENS0000264 ENS0000264 55A49 +20 60308474 60308567 + miRNA ENS0000726 AD0DF mirbase 1.0 ENS0000726 ENS0000726 AD0DF +20 60414396 60414872 - processed_pseudogene ENS0001070 215FF havana 1.0 ENS0001070 ENS0001070 215FF +20 60478111 60478181 + miRNA ENS0000741 DB379 mirbase 1.0 ENS0000741 ENS0000741 DB379 +20 60564562 60564625 + miRNA ENS0000194 7CC04 mirbase 1.0 ENS0000194 ENS0000194 7CC04 +20 60755001 60767279 + lncRNA ENS0001436 EB124 havana 3.0 ENS0001436 ENS0001436 EB124 +20 60812004 60814211 - lncRNA ENS0000091 2E4A3 havana 1.0 ENS0000091 ENS0000091 2E4A3 +20 61077224 61077816 - lncRNA ENS0001397 6613E havana 1.0 ENS0001397 ENS0001397 6613E +20 61079049 61083750 + lncRNA ENS0001418 A5785 havana 2.0 ENS0001418 ENS0001418 A5785 +20 61252261 61940617 + protein_coding ENS0000261 CEE08 ensembl_havana 16.0 ENS0000261 ENS0000261 CEE08 +20 61267525 61271827 - lncRNA ENS0001231 8DFED havana 1.0 ENS0001231 ENS0001231 8DFED +20 61369879 61370855 + lncRNA ENS0000566 BB307 havana 1.0 ENS0000566 ENS0000566 BB307 +20 61432868 61437630 - lncRNA ENS0001153 FCCFA havana 2.0 ENS0001153 ENS0001153 FCCFA +20 61717506 61719748 - lncRNA ENS0001369 D6888 havana 3.0 ENS0001369 ENS0001369 D6888 +20 61738219 61755056 - lncRNA ENS0000316 B296E havana 1.0 ENS0000316 ENS0000316 B296E +20 61953469 62065810 - protein_coding ENS0001428 C0858 ensembl_havana 18.0 ENS0001428 ENS0001428 C0858 +20 61953546 61953662 - miRNA ENS0000940 EC4B0 mirbase 1.0 ENS0000940 ENS0000940 EC4B0 +20 62037429 62037956 - processed_pseudogene ENS0000256 7B588 havana 1.0 ENS0000256 ENS0000256 7B588 +20 62064802 62064885 + miRNA ENS0001179 3ECBE mirbase 1.0 ENS0001179 ENS0001179 3ECBE +20 62066830 62068437 + lncRNA ENS0000376 C183E havana 1.0 ENS0000376 ENS0000376 C183E +20 62122461 62135374 + protein_coding ENS0001022 D422F ensembl_havana 20.0 ENS0001022 ENS0001022 D422F +20 62136733 62143440 - protein_coding ENS0000272 1A3F4 ensembl_havana 15.0 ENS0000272 ENS0000272 1A3F4 +20 62143769 62182514 + protein_coding ENS0001505 C6ECE ensembl_havana 15.0 ENS0001505 ENS0001505 C6ECE +20 62183029 62203568 + protein_coding ENS0000136 E1A37 ensembl_havana 17.0 ENS0000136 ENS0000136 E1A37 +20 62214960 62220278 - protein_coding ENS0000060 9DD5D ensembl_havana 16.0 ENS0000060 ENS0000060 9DD5D +20 62231922 62296213 + protein_coding ENS0000542 9351B ensembl_havana 17.0 ENS0000542 ENS0000542 9351B +20 62302093 62308862 + protein_coding ENS0000397 0B386 ensembl_havana 13.0 ENS0000397 ENS0000397 0B386 +20 62305432 62306325 - lncRNA ENS0000792 9A296 havana 2.0 ENS0000792 ENS0000792 9A296 +20 62307955 62367312 - protein_coding ENS0000858 E318B ensembl_havana 15.0 ENS0000858 ENS0000858 E318B +20 62332487 62332557 - miRNA ENS0000442 215A8 mirbase 1.0 ENS0000442 ENS0000442 215A8 +20 62352995 62356480 + lncRNA ENS0000066 2C85D havana 7.0 ENS0000066 ENS0000066 2C85D +20 62386303 62386970 - lncRNA ENS0000273 E77A8 havana 1.0 ENS0000273 ENS0000273 E77A8 +20 62387103 62388520 + protein_coding ENS0001128 7598B ensembl_havana 18.0 ENS0001128 ENS0001128 7598B +20 62388632 62407285 - protein_coding ENS0000618 AE93E ensembl_havana 11.0 ENS0000618 ENS0000618 AE93E +20 62402236 62405935 - lncRNA ENS0000958 974FB havana 1.0 ENS0000958 ENS0000958 974FB +20 62410237 62427539 - protein_coding ENS0001511 D039C ensembl_havana 4.0 ENS0001511 ENS0001511 D039C +20 62427827 62447677 + lncRNA ENS0000737 A801B havana 3.0 ENS0000737 ENS0000737 A801B +20 62463497 62475995 - protein_coding ENS0001455 02660 ensembl_havana 7.0 ENS0001455 ENS0001455 02660 +20 62477870 62478594 + lncRNA ENS0001027 5CA33 havana 1.0 ENS0001027 ENS0001027 5CA33 +20 62513909 62516096 - lncRNA ENS0000545 1A808 havana 1.0 ENS0000545 ENS0000545 1A808 +20 62544343 62551526 - lncRNA ENS0000652 7E22F ensembl_havana 16.0 ENS0000652 ENS0000652 7E22F +20 62550453 62570764 + lncRNA ENS0000981 43C54 ensembl_havana 13.0 ENS0000981 ENS0000981 43C54 +20 62554306 62554376 + miRNA ENS0001367 167E0 mirbase 2.0 ENS0001367 ENS0001367 167E0 +20 62564912 62565013 + miRNA ENS0000986 1ADD0 mirbase 1.0 ENS0000986 ENS0000986 1ADD0 +20 62572671 62573384 + processed_pseudogene ENS0001226 6A85E havana 1.0 ENS0001226 ENS0001226 6A85E +20 62575590 62577507 + lncRNA ENS0000800 CF1CC havana 3.0 ENS0000800 ENS0000800 CF1CC +20 62596732 62603355 - lncRNA ENS0001019 3E283 havana 3.0 ENS0001019 ENS0001019 3E283 +20 62633681 62635504 - lncRNA ENS0000346 5B4D4 havana 1.0 ENS0000346 ENS0000346 5B4D4 +20 62633772 62635862 + lncRNA ENS0000029 3EA48 havana 1.0 ENS0000029 ENS0000029 3EA48 +20 62640719 62643304 - lncRNA ENS0000149 C12E9 havana 4.0 ENS0000149 ENS0000149 C12E9 +20 62642503 62685785 + protein_coding ENS0001247 8E276 ensembl_havana 16.0 ENS0001247 ENS0001247 8E276 +20 62648961 62650767 - lncRNA ENS0000689 E7933 havana 1.0 ENS0000689 ENS0000689 E7933 +20 62651272 62652186 - lncRNA ENS0001167 24CC7 havana 1.0 ENS0001167 ENS0001167 24CC7 +20 62663019 62666724 - lncRNA ENS0001192 7F756 ensembl_havana 1.0 ENS0001192 ENS0001192 7F756 +20 62695024 62700480 - lncRNA ENS0000740 F1681 havana 2.0 ENS0000740 ENS0000740 F1681 +20 62708836 62762771 + protein_coding ENS0001526 34C18 ensembl_havana 5.0 ENS0001526 ENS0001526 34C18 +20 62732566 62735347 - lncRNA ENS0001498 84F90 havana 1.0 ENS0001498 ENS0001498 84F90 +20 62774121 62776996 - lncRNA ENS0000285 39FBF havana_tagene 2.0 ENS0000285 ENS0000285 39FBF +20 62796473 62801729 + protein_coding ENS0000074 43BBC ensembl_havana 7.0 ENS0000074 ENS0000074 43BBC +20 62800627 62805587 - lncRNA ENS0001476 60E89 havana 1.0 ENS0001476 ENS0001476 60E89 +20 62804835 62814000 + protein_coding ENS0001464 2E6D2 ensembl_havana 16.0 ENS0001464 ENS0001464 2E6D2 +20 62816244 62841159 + protein_coding ENS0000287 19300 ensembl_havana 18.0 ENS0000287 ENS0000287 19300 +20 62841005 62861822 - protein_coding ENS0001010 36861 ensembl_havana 13.0 ENS0001010 ENS0001010 36861 +20 62845664 62845912 + processed_pseudogene ENS0000896 4B84B havana 5.0 ENS0000896 ENS0000896 4B84B +20 62872250 62873076 - processed_pseudogene ENS0001359 9E5DB havana 1.0 ENS0001359 ENS0001359 9E5DB +20 62877738 62937952 - protein_coding ENS0001388 6ECD8 ensembl_havana 17.0 ENS0001388 ENS0001388 6ECD8 +20 62928621 62929297 - lncRNA ENS0000635 29D21 havana 1.0 ENS0000635 ENS0000635 29D21 +20 62938147 62948475 + protein_coding ENS0000023 AD44A ensembl_havana 8.0 ENS0000023 ENS0000023 AD44A +20 62952707 62969585 + protein_coding ENS0000309 1C6D2 ensembl_havana 18.0 ENS0000309 ENS0000309 1C6D2 +20 63005927 63007035 - protein_coding ENS0001420 26B57 ensembl_havana 6.0 ENS0001420 ENS0001420 26B57 +20 63009383 63085071 + lncRNA ENS0001165 68CB9 havana 5.0 ENS0001165 ENS0001165 68CB9 +20 63034217 63037028 - lncRNA ENS0000627 CB7AF havana 9.0 ENS0000627 ENS0000627 CB7AF +20 63038011 63053863 + lncRNA ENS0000155 8F883 havana 1.0 ENS0000155 ENS0000155 8F883 +20 63090806 63102631 - lncRNA ENS0000804 645F3 havana 8.0 ENS0000804 ENS0000804 645F3 +20 63102142 63102259 + misc_RNA ENS0001440 DF218 ensembl 1.0 ENS0001440 ENS0001440 DF218 +20 63102205 63104386 + lncRNA ENS0000989 BA0DE havana 3.0 ENS0000989 ENS0000989 BA0DE +20 63116218 63117205 + lncRNA ENS0001111 67ACD havana_tagene 1.0 ENS0001111 ENS0001111 67ACD +20 63127495 63129459 - lncRNA ENS0001423 38EAA havana 1.0 ENS0001423 ENS0001423 38EAA +20 63144311 63152087 - lncRNA ENS0000277 39EB6 havana_tagene 1.0 ENS0000277 ENS0000277 39EB6 +20 63166797 63180903 - lncRNA ENS0001502 ECD23 havana 1.0 ENS0001502 ENS0001502 ECD23 +20 63178500 63178586 + miRNA ENS0001364 D916D mirbase 1.0 ENS0001364 ENS0001364 D916D +20 63195429 63216139 - protein_coding ENS0000586 B1F33 ensembl_havana 18.0 ENS0000586 ENS0000586 B1F33 +20 63218041 63218502 + lncRNA ENS0000480 58514 havana 1.0 ENS0000480 ENS0000480 58514 +20 63235883 63240495 + protein_coding ENS0000025 0890B ensembl_havana 13.0 ENS0000025 ENS0000025 0890B +20 63238779 63238842 + miRNA ENS0001334 D8BE1 mirbase 1.0 ENS0001334 ENS0001334 D8BE1 +20 63240784 63272694 - protein_coding ENS0001149 0A087 ensembl_havana 15.0 ENS0001149 ENS0001149 0A087 +20 63253978 63261615 + lncRNA ENS0000474 B67B9 havana 1.0 ENS0000474 ENS0000474 B67B9 +20 63272785 63289790 + protein_coding ENS0000955 59376 ensembl_havana 13.0 ENS0000955 ENS0000955 59376 +20 63286808 63286866 + miRNA ENS0000742 3E6B1 mirbase 1.0 ENS0000742 ENS0000742 3E6B1 +20 63293186 63334851 + protein_coding ENS0001500 E9EF1 ensembl_havana 16.0 ENS0001500 ENS0001500 E9EF1 +20 63337339 63337449 + snRNA ENS0001419 E2D00 ensembl 1.0 ENS0001419 ENS0001419 E2D00 +20 63343223 63378401 - protein_coding ENS0001221 FB84B ensembl_havana 18.0 ENS0001221 ENS0001221 FB84B +20 63359988 63371177 + lncRNA ENS0001345 F313D havana 2.0 ENS0001345 ENS0001345 F313D +20 63400208 63472677 - protein_coding ENS0000924 31E8B ensembl_havana 19.0 ENS0000924 ENS0000924 31E8B +20 63422269 63424555 + lncRNA ENS0001181 8182C havana 1.0 ENS0001181 ENS0001181 8182C +20 63448051 63449329 + lncRNA ENS0001178 AF383 havana 1.0 ENS0001178 ENS0001178 AF383 +20 63488013 63499239 - protein_coding ENS0000969 16DE6 ensembl_havana 13.0 ENS0000969 ENS0000969 16DE6 +20 63502287 63505111 + lncRNA ENS0001095 027A6 havana 1.0 ENS0001095 ENS0001095 027A6 +20 63520765 63522206 + protein_coding ENS0001075 67A5D ensembl_havana 10.0 ENS0001075 ENS0001075 67A5D +20 63528001 63537376 - protein_coding ENS0001109 62229 ensembl_havana 7.0 ENS0001109 ENS0001109 62229 +20 63540810 63547504 - protein_coding ENS0000894 0AE3C ensembl_havana 3.0 ENS0000894 ENS0000894 0AE3C +20 63543411 63543738 + lncRNA ENS0000409 313AB havana 1.0 ENS0000409 ENS0000409 313AB +20 63547891 63556695 + protein_coding ENS0001000 CC0CE ensembl_havana 7.0 ENS0001000 ENS0001000 CC0CE +20 63558086 63574239 - protein_coding ENS0000520 9188C ensembl_havana 16.0 ENS0000520 ENS0000520 9188C +20 63587602 63627101 - protein_coding ENS0000371 079CE ensembl_havana 11.0 ENS0000371 ENS0000371 079CE +20 63627227 63628824 + lncRNA ENS0000971 12AA9 havana 1.0 ENS0000971 ENS0000971 12AA9 +20 63639705 63657682 - protein_coding ENS0001115 25268 ensembl_havana 10.0 ENS0001115 ENS0001115 25268 +20 63657810 63696253 + protein_coding ENS0001491 59569 ensembl_havana 9.0 ENS0001491 ENS0001491 59569 +20 63659300 63698684 + protein_coding ENS0000991 408E8 havana 22.0 ENS0000991 ENS0000991 408E8 +20 63696652 63698684 + protein_coding ENS0000257 933FD havana 6.0 ENS0000257 ENS0000257 933FD +20 63698642 63708025 - protein_coding ENS0000173 F0049 ensembl_havana 20.0 ENS0000173 ENS0000173 F0049 +20 63707465 63736142 + protein_coding ENS0000796 6A5EF ensembl_havana 12.0 ENS0000796 ENS0000796 6A5EF +20 63708864 63739103 + protein_coding ENS0000003 C33E6 havana 3.0 ENS0000003 ENS0000003 C33E6 +20 63730072 63730377 - lncRNA ENS0000192 A89D8 havana 1.0 ENS0000192 ENS0000192 A89D8 +20 63736283 63739103 + protein_coding ENS0001262 DC82E ensembl_havana 10.0 ENS0001262 ENS0001262 DC82E +20 63738270 63740398 + protein_coding ENS0001394 D68A0 havana 1.0 ENS0001394 ENS0001394 D68A0 +20 63739776 63744050 + protein_coding ENS0000977 DD5FE ensembl_havana 14.0 ENS0000977 ENS0000977 DD5FE +20 63743668 63832038 - protein_coding ENS0000727 4282C ensembl_havana 12.0 ENS0000727 ENS0000727 4282C +20 63744689 63745958 + lncRNA ENS0000197 4345C havana 2.0 ENS0000197 ENS0000197 4345C +20 63808076 63816521 + lncRNA ENS0000303 6199B havana 4.0 ENS0000303 ENS0000303 6199B +20 63843436 63843915 - TEC ENS0001083 C901C havana 1.0 ENS0001083 ENS0001083 C901C +20 63861212 63864293 - lncRNA ENS0000175 414C3 havana 2.0 ENS0000175 ENS0000175 414C3 +20 63861498 63862988 + protein_coding ENS0001282 B3CA1 ensembl_havana 7.0 ENS0001282 ENS0001282 B3CA1 +20 63865228 63891545 + protein_coding ENS0000421 76828 ensembl_havana 18.0 ENS0000421 ENS0000421 76828 +20 63895126 63936031 + protein_coding ENS0001551 7E57E ensembl_havana 11.0 ENS0001551 ENS0001551 7E57E +20 63908955 63909119 + snRNA ENS0000825 E2F76 ensembl 1.0 ENS0000825 ENS0000825 E2F76 +20 63919449 63919520 + miRNA ENS0000207 BC6EE mirbase 1.0 ENS0000207 ENS0000207 BC6EE +20 63919505 63919576 + miRNA ENS0001405 6683D mirbase 3.0 ENS0001405 ENS0001405 6683D +20 63919561 63919632 + miRNA ENS0000500 1194A mirbase 1.0 ENS0000500 ENS0000500 1194A +20 63919756 63919827 + miRNA ENS0001211 872ED mirbase 3.0 ENS0001211 ENS0001211 872ED +20 63919868 63919939 + miRNA ENS0000418 BB9E0 mirbase 1.0 ENS0000418 ENS0000418 BB9E0 +20 63939829 63956416 - protein_coding ENS0001301 76DEA ensembl_havana 16.0 ENS0001301 ENS0001301 76DEA +20 63941465 63941544 - miRNA ENS0000944 CED31 mirbase 1.0 ENS0000944 ENS0000944 CED31 +20 63942631 63942726 - miRNA ENS0000147 6AAE7 mirbase 1.0 ENS0000147 ENS0000147 6AAE7 +20 63953384 63956985 + lncRNA ENS0000414 05335 havana 1.0 ENS0000414 ENS0000414 05335 +20 63956704 63969930 - protein_coding ENS0000498 ED07A havana 9.0 ENS0000498 ENS0000498 ED07A +20 63974116 63980008 - protein_coding ENS0000906 C8B0A ensembl_havana 14.0 ENS0000906 ENS0000906 C8B0A +20 63981132 64033100 + protein_coding ENS0000714 02950 ensembl_havana 8.0 ENS0000714 ENS0000714 02950 +20 64034344 64039962 + protein_coding ENS0001326 73E80 havana 8.0 ENS0001326 ENS0001326 73E80 +20 64047582 64049639 - protein_coding ENS0000745 72969 ensembl_havana 7.0 ENS0000745 ENS0000745 72969 +20 64049836 64072347 + protein_coding ENS0000225 214F7 ensembl_havana 17.0 ENS0000225 ENS0000225 214F7 +20 64056803 64057084 + lncRNA ENS0000874 AFA21 havana 1.0 ENS0000874 ENS0000874 AFA21 +20 64073181 64079988 - protein_coding ENS0000778 C9495 ensembl_havana 14.0 ENS0000778 ENS0000778 C9495 +20 64076955 64077010 - miRNA ENS0001137 D6AC8 mirbase 1.0 ENS0001137 ENS0001137 D6AC8 +20 64080082 64100643 + protein_coding ENS0000404 2AE69 ensembl_havana 18.0 ENS0000404 ENS0000404 2AE69 +20 64083380 64084359 - protein_coding ENS0000611 F9EFD ensembl_havana 10.0 ENS0000611 ENS0000611 F9EFD +20 64097208 64101162 - lncRNA ENS0001447 7A0ED havana_tagene 1.0 ENS0001447 ENS0001447 7A0ED +20 64102394 64242253 + protein_coding ENS0000360 316ED ensembl_havana 14.0 ENS0000360 ENS0000360 316ED +20 64105820 64107171 - protein_coding ENS0000539 7A206 ensembl_havana 3.0 ENS0000539 ENS0000539 7A206 +20 64255695 64287821 + protein_coding ENS0000321 D1CA1 ensembl_havana 12.0 ENS0000321 ENS0000321 D1CA1 +20 64278306 64278561 + processed_pseudogene ENS0000381 F2279 havana 1.0 ENS0000381 ENS0000381 F2279 +20 64290187 64292998 - processed_pseudogene ENS0000746 0AB8D havana 1.0 ENS0000746 ENS0000746 0AB8D +20 64290385 64313132 + lncRNA ENS0000833 63F32 ensembl_havana 8.0 ENS0000833 ENS0000833 63F32 +20 64327418 64327972 - processed_pseudogene ENS0001096 92037 havana 1.0 ENS0001096 ENS0001096 92037 \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/barcodes.tsv b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/barcodes.tsv new file mode 100644 index 00000000..cacd8f41 --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/barcodes.tsv @@ -0,0 +1 @@ +AACAAGGTAC diff --git a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/features.tsv b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/features.tsv new file mode 100644 index 00000000..ca5527cf --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/features.tsv @@ -0,0 +1,25 @@ +ENS0001140 209E3 Gene Expression +ENS0001058 A2B9A Gene Expression +ENS0000508 CF168 Gene Expression +ENS0001457 3BA5A Gene Expression +ENS0001431 1C968 Gene Expression +ENS0000821 E5192 Gene Expression +ENS0001040 1821B Gene Expression +ENS0000443 5AD11 Gene Expression +ENS0000441 3F0FF Gene Expression +ENS0001387 265F2 Gene Expression +ENS0001223 28A43 Gene Expression +ENS0001208 58E28 Gene Expression +ENS0001396 6E614 Gene Expression +ENS0001199 EA941 Gene Expression +ENS0001369 99DDC Gene Expression +ENS0000770 AFCC0 Gene Expression +ENS0000389 B58E5 Gene Expression +ENS0000071 7A6C3 Gene Expression +ENS0000114 65424 Gene Expression +ENS0000355 077A2 Gene Expression +ENS0001429 22A4F Gene Expression +ENS0000477 981E6 Gene Expression +ENS0000042 E2D99 Gene Expression +ENS0000122 D90E9 Gene Expression +ENS0000221 97B0F Gene Expression \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/matrix.mtx b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/matrix.mtx new file mode 100644 index 00000000..9e621e30 --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC/Solo.out/Gene/raw/matrix.mtx @@ -0,0 +1,13 @@ +%%MatrixMarket matrix coordinate integer general +% +25 1 10 +8 1 1 +9 1 2 +12 1 16 +13 1 8 +14 1 1 +15 1 5 +16 1 5 +18 1 12 +19 1 2 +24 1 1 \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/barcodes.tsv b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/barcodes.tsv new file mode 100644 index 00000000..87f62bdc --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/barcodes.tsv @@ -0,0 +1 @@ +ACGCCTTCGT diff --git a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/features.tsv b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/features.tsv new file mode 100644 index 00000000..ca5527cf --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/features.tsv @@ -0,0 +1,25 @@ +ENS0001140 209E3 Gene Expression +ENS0001058 A2B9A Gene Expression +ENS0000508 CF168 Gene Expression +ENS0001457 3BA5A Gene Expression +ENS0001431 1C968 Gene Expression +ENS0000821 E5192 Gene Expression +ENS0001040 1821B Gene Expression +ENS0000443 5AD11 Gene Expression +ENS0000441 3F0FF Gene Expression +ENS0001387 265F2 Gene Expression +ENS0001223 28A43 Gene Expression +ENS0001208 58E28 Gene Expression +ENS0001396 6E614 Gene Expression +ENS0001199 EA941 Gene Expression +ENS0001369 99DDC Gene Expression +ENS0000770 AFCC0 Gene Expression +ENS0000389 B58E5 Gene Expression +ENS0000071 7A6C3 Gene Expression +ENS0000114 65424 Gene Expression +ENS0000355 077A2 Gene Expression +ENS0001429 22A4F Gene Expression +ENS0000477 981E6 Gene Expression +ENS0000042 E2D99 Gene Expression +ENS0000122 D90E9 Gene Expression +ENS0000221 97B0F Gene Expression \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/matrix.mtx b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/matrix.mtx new file mode 100644 index 00000000..cd68663e --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT/Solo.out/Gene/raw/matrix.mtx @@ -0,0 +1,12 @@ +%%MatrixMarket matrix coordinate integer general +% +25 1 9 +9 1 3 +12 1 13 +13 1 2 +14 1 2 +15 1 2 +16 1 3 +18 1 13 +19 1 2 +25 1 40 \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/barcodes.tsv b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/barcodes.tsv new file mode 100644 index 00000000..ac50a9e8 --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/barcodes.tsv @@ -0,0 +1 @@ +CCCCCCCCCC diff --git a/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/features.tsv b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/features.tsv new file mode 100644 index 00000000..ca5527cf --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/features.tsv @@ -0,0 +1,25 @@ +ENS0001140 209E3 Gene Expression +ENS0001058 A2B9A Gene Expression +ENS0000508 CF168 Gene Expression +ENS0001457 3BA5A Gene Expression +ENS0001431 1C968 Gene Expression +ENS0000821 E5192 Gene Expression +ENS0001040 1821B Gene Expression +ENS0000443 5AD11 Gene Expression +ENS0000441 3F0FF Gene Expression +ENS0001387 265F2 Gene Expression +ENS0001223 28A43 Gene Expression +ENS0001208 58E28 Gene Expression +ENS0001396 6E614 Gene Expression +ENS0001199 EA941 Gene Expression +ENS0001369 99DDC Gene Expression +ENS0000770 AFCC0 Gene Expression +ENS0000389 B58E5 Gene Expression +ENS0000071 7A6C3 Gene Expression +ENS0000114 65424 Gene Expression +ENS0000355 077A2 Gene Expression +ENS0001429 22A4F Gene Expression +ENS0000477 981E6 Gene Expression +ENS0000042 E2D99 Gene Expression +ENS0000122 D90E9 Gene Expression +ENS0000221 97B0F Gene Expression \ No newline at end of file diff --git a/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/matrix.mtx b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/matrix.mtx new file mode 100644 index 00000000..107afd81 --- /dev/null +++ b/src/eset/create_eset/test_data/mapping_dir/EMPTY/Solo.out/Gene/raw/matrix.mtx @@ -0,0 +1,3 @@ +%%MatrixMarket matrix coordinate integer general +% +25 1 0 \ No newline at end of file diff --git a/src/eset/create_eset/test_data/pData.tsv b/src/eset/create_eset/test_data/pData.tsv new file mode 100644 index 00000000..3bfc64e0 --- /dev/null +++ b/src/eset/create_eset/test_data/pData.tsv @@ -0,0 +1,3 @@ +WellBC WellID NumberOfMTReads pctMT NumberOfERCCReads pctERCC NumberOfChromReads pctChrom NumberOfInputReads NumberOfMappedReads PctMappedReads NumberOfReadsMappedToMultipleLoci PectOfReadsMappedToMultipleLoci NumberOfReadsMappedToTooManyLoci PectOfReadsMappedToTooManyLoci NumberOfReadsUnmappedTooManyMismatches PectOfReadsUnmappedTooManyMismatches NumberOfReadsUnmappedTooShort PectOfReadsUnmappedTooShort NumberOfReadsUnmappedOther PectOfReadsUnmappedOther ReadsWithValidBarcodes SequencingSaturation Q30BasesInCB+UMI ReadsMappedToTranscriptome:Unique+MultipeGenes EstimatedNumberOfCells FractionOfReadsInCells MeanReadsPerCell NumberOfUMIs NumberOfGenes NumberOfCountedReads +AACAAGGTAC A1 0 0 0 0 8542 100 141303 23749 16.81 0 0 8458 5.99 0 0 109035 77.16 61 0.04 0.999816 0.0698056 0.979965 0.0618175 1 1 8538 7942 408 9535 +ACGCCTTCGT B2 0 0 0 0 5863 100 96430 16869 17.49 0 0 6124 6.35 0 0 73375 76.09 62 0.06 0.999782 0.0665302 0.980077 0.0620969 1 1 5862 5472 377 6463 diff --git a/src/eset/create_fdata/config.vsh.yaml b/src/eset/create_fdata/config.vsh.yaml new file mode 100644 index 00000000..9a5651d4 --- /dev/null +++ b/src/eset/create_fdata/config.vsh.yaml @@ -0,0 +1,46 @@ +name: create_fdata +namespace: eset +description: | + Create a fdata file +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ contributor ] +arguments: +- name: "--gtf" + type: file + description: "Genome annotation file in GTF format." + required: true +- name: "--output" + description: | + Tab-delimited text file containing information about the 'gene' or 'transcript' + entries from the input GTF file. The 'transcript' entries are used in case the source + of the GTF was 'refGene' or 'ncbiRefSeq'. + type: file + direction: output + default: fData.$id.txt +resources: +- type: python_script + path: create_fdata.py +test_resources: +- type: python_script + path: test.py +- path: test_annotation.gtf +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pandas + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/eset/create_fdata/create_fdata.py b/src/eset/create_fdata/create_fdata.py new file mode 100644 index 00000000..175007a4 --- /dev/null +++ b/src/eset/create_fdata/create_fdata.py @@ -0,0 +1,137 @@ +import logging +import pandas as pd +import numpy as np +from textwrap import fill + + +### VIASH START +meta = { + "name": "create_fdata", +} + +par = { + "gtf": "src/eset/create_fdata/test_annotation.gtf", + "output": "fData.tsv" +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def read_gtf(gtf_path: str) -> pd.DataFrame: + logger.info("Reading %s", gtf_path) + result = pd.read_csv(gtf_path, sep="\t", + header=None, names=("seqname", "source", + "feature", "start", "end", + "score", "strand", "frame", + "attribute"), + dtype={ + "seqname": pd.StringDtype(), + "source": pd.StringDtype(), + "feature": pd.StringDtype(), + "start": pd.Int64Dtype(), + "end": pd.Int64Dtype(), + "score": pd.StringDtype(), + "strand": pd.CategoricalDtype(categories=["+", "-"], + ordered=False), + "frame": pd.StringDtype(), + "attribute": pd.StringDtype(), + }, + comment='#' + ) + logger.info("Done reading %s. Found %d GTF entries ", par["gtf"], result.shape[0]) + logger.info("GTF file is providing information for the following chromosomes: \n%s", + fill(", ".join(result['seqname'].unique()), width=100)) + logger.info("The following sources were specified in the GTF file:\n%s", + ", ".join(result["source"].unique())) + return result + + +def parse_attributes(attributes_series: pd.Series): + attribute_dict = dict() + attributes_list = [attr.strip().split(" ") + for attr in attributes_series["attribute"].strip(";").split(";")] + for (attr_name, attr_value) in attributes_list: + attribute_dict.setdefault(attr_name, []).append(attr_value.strip('"')) + attribute_dict = {attr_name: "|".join(attr_value) + for attr_name, attr_value in attribute_dict.items()} + return pd.Series(attribute_dict) + + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()] + logger.info("Parameters:\n%s", "".join(parameters_str).rstrip()) + gtf_file = read_gtf(par["gtf"]) + sources = set(source for source in gtf_file["source"].unique() if source != "ERCC") + specific_gtf = False + feature = "gene" + if len(sources) == 1 and (source := sources[0]) \ + and (source == "refGene" or source == "ncbiRefSeq"): + feature = "transcript" + specific_gtf = True + logger.info("Found specific GTF from %s, forcing filtering on feature type %s", source, feature) + logger.info("Filtering GTF entries for feature type '%s'.", feature) + gtf_file = gtf_file[gtf_file["feature"] == feature] + logger.info("After filtering %d entries are left.", gtf_file.shape[0]) + logger.info("Parsing the GTF attributes") + annotation = gtf_file[["attribute"]].apply(parse_attributes, result_type="expand", axis=1) + logger.info("Found the following attributes in the GTF:\n%s", ", ".join(annotation.columns)) + annotation = pd.concat([gtf_file.drop(["attribute"], axis=1), annotation], axis=1) + if specific_gtf: + logger.info("Because the source of the GTF is either 'ncbiRefSeq' or 'refGene', which" + "caused forced filtering based on %s, the duplicate genes still need to be dropped.", + feature) + annotation = annotation.drop_duplicates(subset=("gene_id", "gene_name"), keep=False) + logger.info("After dropping duplicates, %d entries are left", annotation.shape[0]) + + # detect ensembl ids + # some GTF files contain version in ENSEMBL, e.g. ENS00000000046319.1 + # we remove the version, because the annotation packages don't contain the version + if "gene_id" in annotation.columns: + logger.info("'gene_id' column was detected in attributes. Performing extra parsing of ENSEMBL ids.") + annotation["ENSEMBL_with_version"] = annotation["gene_id"].where(annotation["gene_id"].str.startswith("ENS")) + annotation["ENSEMBL"] = annotation["ENSEMBL_with_version"].str.replace(r"\.\d+$", "", regex=True) + annotation["gene_id"] = annotation["gene_id"].str.replace(r"\.\d+$", "", regex=True) + + possible_name_columns = ("Name", "name", "gene_name") + found_columns = list(filter(lambda col_name: col_name in annotation, possible_name_columns)) + # The following code allows to select a value for the SYMBOL column based on the first non-na column + if found_columns: + logger.info("Found one the following columns: %s; which can be used to populate the SYMBOL column", + ", ".join(possible_name_columns)) + # For each row (gtf entry), get the name of the first column that actually holds a value. + column_to_get = annotation.loc[:,found_columns].apply(pd.Series.first_valid_index, axis=1) + counts_per_column = column_to_get.value_counts(dropna=False).to_dict() + counts_per_column_str = [f'\t{col}: {counts}\n' for col, counts in counts_per_column.items()] + logger.info("Frequencies of the origin for the entries in the SYMBOL column:\n%s", + "".join(counts_per_column_str).rstrip()) + # If all columns hold NA for a certain row, first_valid_index will return None. + # Just use the name of the first column. + column_to_get = column_to_get.fillna(found_columns[0]) + # We now have a list one column name per row, use it so select the values + # Loc cannot be used here because 1 value per row is required, + # and loc will select for each row all the columns in columns_to_get + idx, cols = pd.factorize(column_to_get) + symbol_values = annotation.reindex(cols, axis=1).to_numpy()[np.arange(len(annotation)), idx] + annotation["SYMBOL"] = symbol_values + logger.info("Dropping unused columns") + annotation = annotation.drop(["score", "source", "frame", "feature"], axis=1) + logger.info("Looking for duplicate rows and removing them. Starting with %i entries", annotation.shape[0]) + annotation = annotation.drop_duplicates(keep="first", ignore_index=True) + logger.info("After removing duplicates: %i entries", annotation.shape[0]) + logger.info("Writing to %s", par["output"]) + annotation.to_csv(par["output"], sep="\t", header=True, index=False, na_rep="NA") + # Do these checks *after* writing the csv in order to be able to check the data + logger.info("Checking for unique gene IDs") + if not annotation["gene_id"].is_unique: + raise ValueError("Values from the 'gene_id' column are not unique after processing!") + logger.info("%s finished", meta['name']) + + +if __name__ == "__main__": + main(par) diff --git a/src/eset/create_fdata/test.py b/src/eset/create_fdata/test.py new file mode 100644 index 00000000..b80ebfac --- /dev/null +++ b/src/eset/create_fdata/test.py @@ -0,0 +1,102 @@ +import pytest +import sys +import pandas as pd +from pathlib import Path +from uuid import uuid4 +from shutil import copyfile + +### VIASH START +meta = { + "resources_dir": "./src/eset/create_fdata/", + "executable": "target/executable/eset/create_fdata/create_fdata", + "config": "src/eset/create_fdata/config.vsh.yaml" +} +### VIASH END + +@pytest.fixture +def test_annotation_path(): + return Path(meta["resources_dir"]) / "test_annotation.gtf" + + +@pytest.fixture +def random_path(tmp_path): + def wrapper(extension=None): + extension = "" if not extension else f".{extension}" + return tmp_path / f"{uuid4()}{extension}" + return wrapper + + +def test_create_fdata(run_component, test_annotation_path, random_path): + output_path = random_path("tsv") + run_component([ + "--gtf", test_annotation_path, + "--output", output_path + ]) + assert output_path.is_file() + result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + + expected_dict = { + "seqname": ["20", "20", "20", "21"], + "start": ["87250", "142590", "157454", "297570"], + "end": ["97094", "145751", "159163", "300321"], + "strand": ["+", "+", "+", "+"], + "gene_id": ["ENSG00000178591", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "gene_version": ["7", "6", "5", "4"], + "gene_name": ["DEFB125", "DEFB126", "DEFB127", pd.NA], + "gene_source": ["ensembl_havana", "ensembl_havana", + "ensembl_havana", "havana"], + "gene_biotype": ["protein_coding", "protein_coding", + "protein_coding", "protein_coding"], + "ENSEMBL_with_version": ["ENSG00000178591.7", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "ENSEMBL": ["ENSG00000178591", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "SYMBOL": ["DEFB125", "DEFB126", "DEFB127", pd.NA] + } + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(expected, result, check_like=True) + + +def test_make_unique(run_component, test_annotation_path, random_path): + gtf_with_duplicate_entry_path = random_path("gtf") + output_path = random_path("tsv") + entry_to_add = ( + "\n20 ensembl_havana gene 87250 97094 . + . gene_id " + + "\"ENSG00000178591.7\"; gene_version \"7\"; gene_name \"DEFB125\"; " + + "gene_source \"ensembl_havana\"; gene_biotype \"protein_coding\";\n" + ) + copyfile(test_annotation_path, gtf_with_duplicate_entry_path) + with gtf_with_duplicate_entry_path.open("a") as open_gtf: + open_gtf.write(entry_to_add) + run_component([ + "--gtf", gtf_with_duplicate_entry_path, + "--output", output_path + ]) + assert output_path.is_file() + result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + expected_dict = { + "seqname": ["20", "20", "20", "21"], + "start": ["87250", "142590", "157454", "297570"], + "end": ["97094", "145751", "159163", "300321"], + "strand": ["+", "+", "+", "+"], + "gene_id": ["ENSG00000178591", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "gene_version": ["7", "6", "5", "4"], + "gene_name": ["DEFB125", "DEFB126", "DEFB127", pd.NA], + "gene_source": ["ensembl_havana", "ensembl_havana", + "ensembl_havana", "havana"], + "gene_biotype": ["protein_coding", "protein_coding", + "protein_coding", "protein_coding"], + "ENSEMBL_with_version": ["ENSG00000178591.7", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "ENSEMBL": ["ENSG00000178591", "ENSG00000125788", + "ENSG00000088782", "ENSG00000247315"], + "SYMBOL": ["DEFB125", "DEFB126", "DEFB127", pd.NA] + } + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(expected, result, check_like=True) + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/eset/create_fdata/test_annotation.gtf b/src/eset/create_fdata/test_annotation.gtf new file mode 100644 index 00000000..b5417cea --- /dev/null +++ b/src/eset/create_fdata/test_annotation.gtf @@ -0,0 +1,45 @@ +20 ensembl_havana gene 87250 97094 . + . gene_id "ENSG00000178591.7"; gene_version "7"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +20 havana transcript 87250 97094 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "2"; +20 havana exon 87250 87359 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003702629"; exon_version "1"; transcript_support_level "2"; +20 havana exon 96005 97094 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000608838"; transcript_version "1"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003705060"; exon_version "1"; transcript_support_level "2"; +20 ensembl_havana transcript 87672 97094 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana exon 87672 87767 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; exon_id "ENSE00001491993"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana CDS 87710 87767 . + 0 gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; protein_id "ENSP00000371847"; protein_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana start_codon 87710 87712 . + 0 gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "1"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana exon 96005 97094 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; exon_id "ENSE00001491984"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana CDS 96005 96414 . + 2 gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; protein_id "ENSP00000371847"; protein_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana stop_codon 96415 96417 . + 0 gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; exon_number "2"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana five_prime_utr 87672 87709 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana three_prime_utr 96418 97094 . + . gene_id "ENSG00000178591"; gene_version "7"; transcript_id "ENST00000382410"; transcript_version "3"; gene_name "DEFB125"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB125-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12989"; tag "basic"; transcript_support_level "1 (assigned to previous version 2)"; +20 ensembl_havana gene 142590 145751 . + . gene_id "ENSG00000125788"; gene_version "6"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +20 ensembl_havana transcript 142590 145751 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana exon 142590 142686 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; exon_id "ENSE00001491976"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana CDS 142629 142686 . + 0 gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; protein_id "ENSP00000371835"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana start_codon 142629 142631 . + 0 gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana exon 145415 145751 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; exon_id "ENSE00000858522"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana CDS 145415 145689 . + 2 gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; protein_id "ENSP00000371835"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana stop_codon 145690 145692 . + 0 gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana five_prime_utr 142590 142628 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana three_prime_utr 145693 145751 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000382398"; transcript_version "4"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12990"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 havana transcript 142634 145749 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; tag "mRNA_start_NF"; transcript_support_level "3"; +20 havana exon 142634 142686 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "1"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002285856"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3"; +20 havana exon 145415 145488 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "2"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002303512"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3"; +20 havana exon 145579 145749 . + . gene_id "ENSG00000125788"; gene_version "6"; transcript_id "ENST00000542572"; transcript_version "1"; exon_number "3"; gene_name "DEFB126"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB126-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00002217818"; exon_version "1"; tag "mRNA_start_NF"; transcript_support_level "3"; +20 ensembl_havana gene 157454 159163 . + . gene_id "ENSG00000088782"; gene_version "5"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; +20 ensembl_havana transcript 157454 159163 . + . gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana exon 157454 157593 . + . gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; exon_id "ENSE00001491947"; exon_version "4"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana CDS 157545 157593 . + 0 gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; protein_id "ENSP00000371825"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana start_codon 157545 157547 . + 0 gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "1"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana exon 158774 159163 . + . gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; exon_id "ENSE00001166560"; exon_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana CDS 158774 159021 . + 2 gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; protein_id "ENSP00000371825"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana stop_codon 159022 159024 . + 0 gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; exon_number "2"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana five_prime_utr 157454 157544 . + . gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +20 ensembl_havana three_prime_utr 159025 159163 . + . gene_id "ENSG00000088782"; gene_version "5"; transcript_id "ENST00000382388"; transcript_version "4"; gene_name "DEFB127"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "DEFB127-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS12991"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; +21 havana gene 297570 300321 . + . gene_id "ENSG00000247315"; gene_version "4"; gene_source "havana"; gene_biotype "protein_coding"; +21 havana transcript 297570 300321 . + . gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana exon 297570 300321 . + . gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; exon_id "ENSE00001977652"; exon_version "4"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana CDS 297587 298795 . + 0 gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; protein_id "ENSP00000484056"; protein_version "1"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana start_codon 297587 297589 . + 0 gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana stop_codon 298796 298798 . + 0 gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; exon_number "1"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana five_prime_utr 297570 297586 . + . gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; +21 havana three_prime_utr 298799 300321 . + . gene_id "ENSG00000247315"; gene_version "4"; transcript_id "ENST00000500893"; transcript_version "4"; gene_source "havana"; gene_biotype "protein_coding"; transcript_name "ZCCHC3-201"; transcript_source "havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS42844"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; \ No newline at end of file diff --git a/src/eset/create_pdata/config.vsh.yaml b/src/eset/create_pdata/config.vsh.yaml new file mode 100644 index 00000000..48933e37 --- /dev/null +++ b/src/eset/create_pdata/config.vsh.yaml @@ -0,0 +1,55 @@ +name: create_pdata +namespace: eset +description: | + Create a pdata file by combining the mapping statistics +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ contributor ] +arguments: +- name: "--star_stats_file" + type: file + description: | + Tab-delimited text file containing statistics (per column) that were generated + from the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab). + Each entry (row) in the file describes the values for one well (barcode). + required: true +- name: "--nrReadsNrGenesPerChromPool" + type: file + description: | + Pivot table in tsv format of the combined nrReadsNrGenesPerChrom files from STAR. + Describes per chromosome (as columns) the number of reads, as well as the total number + of reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial + reads. + required: true +- name: "--output" + type: file + direction: output + default: pData.$id.txt +resources: +- type: python_script + path: create_pdata.py +test_resources: +- type: python_script + path: test.py +- path: nrReadsNrGenesPerChromPool.txt +- path: starLogs.txt + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pandas + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/eset/create_pdata/create_pdata.py b/src/eset/create_pdata/create_pdata.py new file mode 100644 index 00000000..f316e216 --- /dev/null +++ b/src/eset/create_pdata/create_pdata.py @@ -0,0 +1,63 @@ +from itertools import batched +import pandas as pd +import logging + +### VIASH START +meta = { + "name": "create_pdata", +} + +par = { + "star_stats_file": "src/eset/create_pdata/starLogs.txt", + "nrReadsNrGenesPerChromPool": "src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt", + "output": "pData.tsv" +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()] + logger.info("Parameters:\n%s", "".join(parameters_str).rstrip()) + logger.info("Reading %s", par["star_stats_file"]) + star_log_stats = pd.read_csv(par["star_stats_file"], sep="\t", index_col=0) + logger.info("STAR log statics file contains information for the following barcodes: %s", + ", ".join(star_log_stats.index)) + logger.info("Reading %s", par["nrReadsNrGenesPerChromPool"]) + reads_and_genes_per_chr_stats = pd.read_csv(par["nrReadsNrGenesPerChromPool"], sep="\t", index_col=0) + logger.info("Reads per gene and chromosome table contains information for the following barcodes: %s", + ", ".join(reads_and_genes_per_chr_stats.index)) + logger.info("Filtering mapping statistics file columns.") + cols_to_keep = ("WellID", "NumberOfMTReads", "pctMT", "NumberOfERCCReads", + "pctERCC", "NumberOfChromReads", "pctChrom") + try: + reads_and_genes_per_chr_stats = reads_and_genes_per_chr_stats.loc[:,cols_to_keep] + except KeyError as e: + raise KeyError("When trying to subset the reads per genes and chromosomes file, " + "a column was missing. Available columns in the file: " + f"{', '.join(reads_and_genes_per_chr_stats.columns)}.") from e + # Each barcode should be present. An alternative approach could be to just + # do the concatenation and check for NA values that are filled for non-overlapping + # index values, but there are already NA values present in the dataframes + if not star_log_stats.index.sort_values().equals(reads_and_genes_per_chr_stats.index.sort_values()): + raise ValueError("Error while combining two log files. It seems that the entries (barcodes) " + f"do not fully overlap. Barcodes in '{par['star_stats_file']}: " + f"{', '.join(reads_and_genes_per_chr_stats.index)}. Barcodes in " + f"'{par['nrReadsNrGenesPerChromPool']}': " + f"{', '.join(star_log_stats.index)}") + combined_stats = pd.concat([reads_and_genes_per_chr_stats, star_log_stats], axis=1) + logger.info("Summary of final output:\n%s\n", + "\n".join(repr(combined_stats.loc[:,columns].describe()) + for columns in batched(combined_stats.columns, 3))) + logger.info("Writing to %s", par["output"]) + combined_stats.reset_index("WellBC").to_csv(par["output"], sep="\t", header=True, index=False) + logger.info("Finished %s.", meta["name"]) + + +if __name__ == "__main__": + main(par) \ No newline at end of file diff --git a/src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt b/src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt new file mode 100644 index 00000000..0b678058 --- /dev/null +++ b/src/eset/create_pdata/nrReadsNrGenesPerChromPool.txt @@ -0,0 +1,8 @@ +WellBC WellID 20 pctChrom pctMT pctERCC SumReads NumberOfGenes NumberOfERCCReads NumberOfChromReads NumberOfMTReads +AACAAGGTAC A1 8542 100 0 0 8542 408 0 8542 0 +ACGCCTTCGT A2 5863 100 0 0 5863 377 0 5863 0 +CCATACTGAC A3 7396 100 0 0 7396 391 0 7396 0 +GCAAGCGAAT B1 10092 100 0 0 10092 420 0 10092 0 +GTCTCGAGTG C5 470 100 0 0 470 150 0 470 0 +TGCGCTCATT D6 7650 100 0 0 7650 407 0 7650 0 +TTGTGTTCGA E19 9422 100 0 0 9422 420 0 9422 0 diff --git a/src/eset/create_pdata/starLogs.txt b/src/eset/create_pdata/starLogs.txt new file mode 100644 index 00000000..665f2adf --- /dev/null +++ b/src/eset/create_pdata/starLogs.txt @@ -0,0 +1,8 @@ +WellBC NumberOfInputReads NumberOfMappedReads PctMappedReads NumberOfReadsMappedToMultipleLoci PectOfReadsMappedToMultipleLoci NumberOfReadsMappedToTooManyLoci PectOfReadsMappedToTooManyLoci NumberOfReadsUnmappedTooManyMismatches PectOfReadsUnmappedTooManyMismatches NumberOfReadsUnmappedTooShort PectOfReadsUnmappedTooShort NumberOfReadsUnmappedOther PectOfReadsUnmappedOther ReadsWithValidBarcodes SequencingSaturation Q30BasesInCB+UMI ReadsMappedToTranscriptome:Unique+MultipeGenes EstimatedNumberOfCells FractionOfReadsInCells MeanReadsPerCell NumberOfUMIs NumberOfGenes NumberOfCountedReads +ACGCCTTCGT 96430 16869 17.49 0 0 6124 6.35 0 0 73375 76.09 62 0.06 0.999782 0.0665302 0.980077 0.0620969 1 1 5862 5472 377 6463 +GTCTCGAGTG 10158 1902 18.72 0 0 967 9.52 0 0 7280 71.67 9 0.09 0.999803 0.0553191 0.984451 0.0476472 1 1 470 444 150 533 +GCAAGCGAAT 156134 24005 15.37 0 0 7961 5.1 0 0 124096 79.48 72 0.05 0.999744 0.0680872 0.982779 0.0658665 1 1 10090 9403 420 11273 +CCATACTGAC 113577 17319 15.25 0 0 5905 5.2 0 0 90292 79.5 61 0.05 0.999859 0.0717282 0.982313 0.066554 1 1 7389 6859 391 8299 +TGCGCTCATT 126989 19272 15.18 0 0 7141 5.62 0 0 100515 79.15 61 0.05 0.999843 0.0667974 0.986581 0.0616668 1 1 7650 7139 407 8444 +TTGTGTTCGA 142560 22129 15.52 0 0 7045 4.94 0 0 113324 79.49 62 0.04 0.999783 0.060828 0.986622 0.0676838 1 1 9420 8847 420 10383 +AACAAGGTAC 141303 23749 16.81 0 0 8458 5.99 0 0 109035 77.16 61 0.04 0.999816 0.0698056 0.979965 0.0618175 1 1 8538 7942 408 9535 diff --git a/src/eset/create_pdata/test.py b/src/eset/create_pdata/test.py new file mode 100644 index 00000000..22fa78c1 --- /dev/null +++ b/src/eset/create_pdata/test.py @@ -0,0 +1,173 @@ +import pytest +import sys +import pandas as pd +from pathlib import Path +from uuid import uuid4 + +### VIASH START +meta = { + "resources_dir": "./src/eset/create_pdata/", + "executable": "target/executable/eset/create_pdata/create_pdata", + "config": "src/eset/create_pdata/config.vsh.yaml" +} +### VIASH END + +@pytest.fixture +def test_reads_and_genes_per_chr_path(): + return Path(meta["resources_dir"]) / "nrReadsNrGenesPerChromPool.txt" + + +@pytest.fixture +def test_star_logs_summary_path(): + return Path(meta["resources_dir"]) / "starLogs.txt" + + +@pytest.fixture +def random_path(tmp_path): + def wrapper(extension=None): + extension = "" if not extension else f".{extension}" + return tmp_path / f"{uuid4()}{extension}" + return wrapper + + +def test_create_pdata(run_component, test_reads_and_genes_per_chr_path, + test_star_logs_summary_path, random_path): + output_path = random_path("tsv") + run_component([ + "--star_stats_file", test_star_logs_summary_path, + "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path, + "--output", output_path + ]) + assert output_path.is_file() + result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + expected_dict = { + 'WellBC': ['AACAAGGTAC', 'ACGCCTTCGT', 'CCATACTGAC', 'GCAAGCGAAT', + 'GTCTCGAGTG', 'TGCGCTCATT', 'TTGTGTTCGA'], + 'WellID': ['A1', 'A2', 'A3', 'B1', 'C5', 'D6', 'E19'], + 'NumberOfMTReads': ['0', '0', '0', '0', '0', '0', '0'], + 'pctMT': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfERCCReads': ['0', '0', '0', '0', '0', '0', '0'], + 'pctERCC': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfChromReads': ['8542', '5863', '7396', '10092', '470', + '7650', '9422'], + 'pctChrom': ['100', '100', '100', '100', '100', '100', '100'], + 'NumberOfInputReads': ['141303', '96430', '113577', '156134', '10158', + '126989', '142560'], + 'NumberOfMappedReads': ['23749', '16869', '17319', '24005', '1902', + '19272', '22129'], + 'PctMappedReads': ['16.81', '17.49', '15.25', '15.37', '18.72', + '15.18', '15.52'], + 'NumberOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'], + 'PectOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfReadsMappedToTooManyLoci': ['8458', '6124', '5905', '7961', '967', + '7141', '7045'], + 'PectOfReadsMappedToTooManyLoci': ['5.99', '6.35', '5.2', '5.1', '9.52', + '5.62', '4.94'], + 'NumberOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'], + 'PectOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfReadsUnmappedTooShort': ['109035', '73375', '90292', '124096', + '7280', '100515', '113324'], + 'PectOfReadsUnmappedTooShort': ['77.16', '76.09', '79.5', '79.48', + '71.67', '79.15', '79.49'], + 'NumberOfReadsUnmappedOther': ['61', '62', '61', '72', '9', '61', '62'], + 'PectOfReadsUnmappedOther': ['0.04', '0.06', '0.05', '0.05', + '0.09', '0.05', '0.04'], + 'ReadsWithValidBarcodes': ['0.999816', '0.999782', '0.999859', '0.999744', + '0.999803', '0.999843', '0.999783'], + 'SequencingSaturation': ['0.0698056', '0.0665302', '0.0717282', '0.0680872', + '0.0553191', '0.0667974', '0.060828'], + 'Q30BasesInCB+UMI': ['0.979965', '0.980077', '0.982313', '0.982779', + '0.984451', '0.986581', '0.986622'], + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': ['0.0618175', '0.0620969', + '0.066554', '0.0658665', + '0.0476472', '0.0616668', + '0.0676838'], + 'EstimatedNumberOfCells': ['1', '1', '1', '1', '1', '1', '1'], + 'FractionOfReadsInCells': ['1', '1', '1', '1', '1', '1', '1'], + 'MeanReadsPerCell': ['8538', '5862', '7389', + '10090', '470', '7650', '9420'], + 'NumberOfUMIs': ['7942', '5472', '6859', '9403', + '444', '7139', '8847'], + 'NumberOfGenes': ['408', '377', '391', '420', '150', '407', '420'], + 'NumberOfCountedReads': ['9535', '6463', '8299', '11273', + '533', '8444', '10383'] + } + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(result, expected, check_like=True) + +def test_na(run_component, test_reads_and_genes_per_chr_path, + test_star_logs_summary_path, random_path): + """ + The star log summary can contain NA values. + """ + output_path = random_path("tsv") + summary_with_na_path = random_path("txt") + original_summary = pd.read_csv(test_star_logs_summary_path, + sep="\t", index_col=0) + original_summary.loc["GTCTCGAGTG", "FractionOfReadsInCells"] = pd.NA + original_summary.reset_index("WellBC").to_csv(summary_with_na_path, sep="\t", + header=True, index=False) + run_component([ + "--star_stats_file", summary_with_na_path, + "--nrReadsNrGenesPerChromPool", test_reads_and_genes_per_chr_path, + "--output", output_path + ]) + expected_dict = { + 'WellBC': ['AACAAGGTAC', 'ACGCCTTCGT', 'CCATACTGAC', 'GCAAGCGAAT', + 'GTCTCGAGTG', 'TGCGCTCATT', 'TTGTGTTCGA'], + 'WellID': ['A1', 'A2', 'A3', 'B1', 'C5', 'D6', 'E19'], + 'NumberOfMTReads': ['0', '0', '0', '0', '0', '0', '0'], + 'pctMT': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfERCCReads': ['0', '0', '0', '0', '0', '0', '0'], + 'pctERCC': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfChromReads': ['8542', '5863', '7396', '10092', '470', + '7650', '9422'], + 'pctChrom': ['100', '100', '100', '100', '100', '100', '100'], + 'NumberOfInputReads': ['141303', '96430', '113577', '156134', '10158', + '126989', '142560'], + 'NumberOfMappedReads': ['23749', '16869', '17319', '24005', '1902', + '19272', '22129'], + 'PctMappedReads': ['16.81', '17.49', '15.25', '15.37', '18.72', + '15.18', '15.52'], + 'NumberOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'], + 'PectOfReadsMappedToMultipleLoci': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfReadsMappedToTooManyLoci': ['8458', '6124', '5905', '7961', '967', + '7141', '7045'], + 'PectOfReadsMappedToTooManyLoci': ['5.99', '6.35', '5.2', '5.1', '9.52', + '5.62', '4.94'], + 'NumberOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'], + 'PectOfReadsUnmappedTooManyMismatches': ['0', '0', '0', '0', '0', '0', '0'], + 'NumberOfReadsUnmappedTooShort': ['109035', '73375', '90292', '124096', + '7280', '100515', '113324'], + 'PectOfReadsUnmappedTooShort': ['77.16', '76.09', '79.5', '79.48', + '71.67', '79.15', '79.49'], + 'NumberOfReadsUnmappedOther': ['61', '62', '61', '72', '9', '61', '62'], + 'PectOfReadsUnmappedOther': ['0.04', '0.06', '0.05', '0.05', + '0.09', '0.05', '0.04'], + 'ReadsWithValidBarcodes': ['0.999816', '0.999782', '0.999859', '0.999744', + '0.999803', '0.999843', '0.999783'], + 'SequencingSaturation': ['0.0698056', '0.0665302', '0.0717282', '0.0680872', + '0.0553191', '0.0667974', '0.060828'], + 'Q30BasesInCB+UMI': ['0.979965', '0.980077', '0.982313', '0.982779', + '0.984451', '0.986581', '0.986622'], + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': ['0.0618175', '0.0620969', + '0.066554', '0.0658665', + '0.0476472', '0.0616668', + '0.0676838'], + 'EstimatedNumberOfCells': ['1', '1', '1', '1', '1', '1', '1'], + 'FractionOfReadsInCells': ['1.0', '1.0', '1.0', '1.0', pd.NA, '1.0', '1.0'], + 'MeanReadsPerCell': ['8538', '5862', '7389', + '10090', '470', '7650', '9420'], + 'NumberOfUMIs': ['7942', '5472', '6859', '9403', + '444', '7139', '8847'], + 'NumberOfGenes': ['408', '377', '391', '420', '150', '407', '420'], + 'NumberOfCountedReads': ['9535', '6463', '8299', '11273', + '533', '8444', '10383'] + } + result = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(result, expected, check_like=True) + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml b/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml new file mode 100644 index 00000000..9c5b6d94 --- /dev/null +++ b/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml @@ -0,0 +1,34 @@ +name: "check_eset" +namespace: "integration_test_components/htrnaseq" +description: "This component test the ExpressionSet object as output by the main pipeline." +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--eset" + type: file + required: true + description: Path to an ExpressionSet object. + example: eset.rds + - name: "--star_output" + type: file + required: true + multiple: true +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: bioconductor/bioconductor_docker:3.19 + setup: + - type: r + cran: + - bit64 + bioc: + - Biobase + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/integration_test_components/htrnaseq/check_eset/script.R b/src/integration_test_components/htrnaseq/check_eset/script.R new file mode 100644 index 00000000..a5be5613 --- /dev/null +++ b/src/integration_test_components/htrnaseq/check_eset/script.R @@ -0,0 +1,198 @@ +library(Biobase) +library(testthat) +library(Matrix) + +sample_1_result <- readRDS(par$eset) +expected_sample_names <- c( + "sample_one_AACAAGGTAC", "sample_one_AACAATCAGG", "sample_one_AACACCTAGT", + "sample_one_AACAGGCAAT", "sample_one_AACATGGAGA", "sample_one_AACATTACCG", + "sample_one_AACCAGCCAG", "sample_one_AACCAGTTGA", "sample_one_AACCGCGACT", + "sample_one_AACCGGAAGG", "sample_one_AACCGGCGTA", "sample_one_AACCTAGTCC", + "sample_one_AACCTCATAG", "sample_one_AACGTAAGCT", "sample_one_AACTCTACAC", + "sample_one_AACTGTGTCA", "sample_one_AAGACGGATT", "sample_one_AAGATCGGCG", + "sample_one_AAGATGTCCA", "sample_one_AAGCATATGG", "sample_one_AAGCGATGTT", + "sample_one_AAGCGTTCAG", "sample_one_AAGCTCACCT", "sample_one_AAGGCATGCG", + "sample_one_AAGGTCTGGA", "sample_one_AAGTTAGCGC", "sample_one_AAGTTCCTTG", + "sample_one_AATACCGGTA", "sample_one_AATAGCCACA", "sample_one_AATCACGCGA", + "sample_one_AATCCATCTG", "sample_one_AATCCGCTCC", "sample_one_AATCCTACCA", + "sample_one_AATCGTCCGC", "sample_one_AATGAACACG", "sample_one_AATGACCTTC", + "sample_one_AATGAGAGCA", "sample_one_AATGTCAGTG", "sample_one_AATTAGGCCG", + "sample_one_AATTGCGATG", "sample_one_ACAACAGTCG", "sample_one_ACAACCATAC", + "sample_one_ACAACGGAGC", "sample_one_ACAAGCGCGA", "sample_one_ACACAATCTC", + "sample_one_ACACAGTGAA", "sample_one_ACACCGAATT", "sample_one_ACACGCAGTA", + "sample_one_ACACGGTCCT", "sample_one_ACACTTGCTG", "sample_one_ACAGTGCCAA", + "sample_one_ACATGTGTGC", "sample_one_ACCAGGACCA", "sample_one_ACCATAACAC", + "sample_one_ACCGAACCGT", "sample_one_ACCGAGAGTC", "sample_one_ACCGGTACAG", + "sample_one_ACCGTACTTC", "sample_one_ACCTCCGACA", "sample_one_ACCTCTCTCC", + "sample_one_ACCTGTCCGA", "sample_one_ACCTTATGTG", "sample_one_ACGAATGACA", + "sample_one_ACGCCTCAAC", "sample_one_ACGCCTTCGT", "sample_one_ACGCTGGATA", + "sample_one_ACGGTCCGTT", "sample_one_ACGTAGGCAC", "sample_one_ACGTGCTGAT", + "sample_one_ACTCCAAGCC", "sample_one_ACTGGCGCAT", "sample_one_ACTGGCTTCC", + "sample_one_ACTTAACTGC", "sample_one_ACTTCATCAC", "sample_one_ACTTCGTTGA", + "sample_one_ACTTCTCCTG", "sample_one_ACTTGAGGAA", "sample_one_ACTTGTAAGG", + "sample_one_AGAACCACGG", "sample_one_AGAAGCAATC", "sample_one_AGACCGTTAT", + "sample_one_AGACTAGCAT", "sample_one_AGAGATGCAG", "sample_one_AGAGCTTACA", + "sample_one_AGAGTGTAAC", "sample_one_AGAGTTCTGC", "sample_one_AGATAGTGCT", + "sample_one_AGCAATGCGC", "sample_one_AGCATGTCAT", "sample_one_AGCCACTAGC", + "sample_one_AGCCAGAATA", "sample_one_AGCCAGCTCT", "sample_one_AGCGATAACG", + "sample_one_AGCGTACAAT", "sample_one_AGCTATTCCA", "sample_one_AGCTCCTCAG", + "sample_one_AGGAGGCATA", "sample_one_AGGCGTCTGT", "sample_one_AGTAACTCAC", + "sample_one_AGTAAGCGTT", "sample_one_AGTCTGTACG", "sample_one_AGTGCAATGT", + "sample_one_ATAAGGTGCA", "sample_one_ATACACGACA", "sample_one_ATAGGCCATT", + "sample_one_ATATCCGCAT", "sample_one_ATCAGCACTT", "sample_one_ATCAGCGAGG", + "sample_one_ATCCAATACG", "sample_one_ATCCGCTGTG", "sample_one_ATCCGTCCAT", + "sample_one_ATCGACGGCT", "sample_one_ATCGCGATTA", "sample_one_ATCGGTAGGC", + "sample_one_ATCTAAGGAG", "sample_one_ATGACGGTAA", "sample_one_ATGACTCAGT", + "sample_one_ATGCACCGGA", "sample_one_ATGCGGACTG", "sample_one_ATGCTTCCTA", + "sample_one_ATGGACCAAC", "sample_one_ATGGTCTTAG", "sample_one_ATGGTGAGCG", + "sample_one_ATGTGGAAGC", "sample_one_ATTATCGGAC", "sample_one_ATTCGGAACA", + "sample_one_CAACAATCCA", "sample_one_CAAGAAGCAT", "sample_one_CAAGATGAGG", + "sample_one_CAAGCCAACG", "sample_one_CAAGTGGATC", "sample_one_CACAGTTCAT", + "sample_one_CACGAGTCTG", "sample_one_CACGCTCCAA", "sample_one_CACTGAGCAC", + "sample_one_CAGATCAATG", "sample_one_CAGTGCTCTT", "sample_one_CAGTTAAGCA", + "sample_one_CATAGCTATC", "sample_one_CATCACCACC", "sample_one_CATGTACGCC", + "sample_one_CATTACACTG", "sample_one_CATTCGACGA", "sample_one_CCAACTATGG", + "sample_one_CCAAGGAGTT", "sample_one_CCAATTGTTC", "sample_one_CCACAAGTGC", + "sample_one_CCAGCTTAGT", "sample_one_CCATAACTTG", "sample_one_CCATACTGAC", + "sample_one_CCATAGATCA", "sample_one_CCATGTGCTT", "sample_one_CCATTCAGCG", + "sample_one_CCGAACAAGC", "sample_one_CCGAACCTAA", "sample_one_CCGAAGACCT", + "sample_one_CCGAATAGTG", "sample_one_CCGACTTCTC", "sample_one_CCGATCCACT", + "sample_one_CCGATGATAC", "sample_one_CCGCGTTATG", "sample_one_CCGCTAGCTT", + "sample_one_CCGGAGTATC", "sample_one_CCGGCCAATT", "sample_one_CCGGTCTCTA", + "sample_one_CCGTACGATG", "sample_one_CCGTCAGAAC", "sample_one_CCTAGACACG", + "sample_one_CCTAGTTGAG", "sample_one_CCTATTCTGT", "sample_one_CCTCAACCGA", + "sample_one_CCTCCATAAG", "sample_one_CCTGATGCCA", "sample_one_CCTGCAATAC", + "sample_one_CCTTGTATTC", "sample_one_CGAGATCTCT", "sample_one_CGAGGAACAA", + "sample_one_CGATAACCGC", "sample_one_CGATCCTGTG", "sample_one_CGCCAACCAT", + "sample_one_CGCCAGTGTT", "sample_one_CGCCTTGTAC", "sample_one_CGCGGATTCA", + "sample_one_CGCTTAAGGC", "sample_one_CGCTTACTAA", "sample_one_CGCTTCTTGG", + "sample_one_CGGAAGCTGT", "sample_one_CGGAATACAC", "sample_one_CGGAGATTGG", + "sample_one_CGGAGCTCAA", "sample_one_CGGATCGGTA", "sample_one_CGGATTCTAG", + "sample_one_CGGCAACTTA", "sample_one_CGGCTCATCA", "sample_one_CGGTCGTATT", + "sample_one_CGGTGACATC", "sample_one_CGTAACGGAT", "sample_one_CGTAAGATTC", + "sample_one_CGTACTGTAA", "sample_one_CGTAGAAGAC", "sample_one_CGTCCTAGGA", + "sample_one_CGTCGGCAAT", "sample_one_CGTGAGTTAT", "sample_one_CGTGTCAAGC", + "sample_one_CTAACTTCAG", "sample_one_CTAATAGCGT", "sample_one_CTACACCAGG", + "sample_one_CTAGCACAAT", "sample_one_CTATGAACGG", "sample_one_CTCAAGGACC", + "sample_one_CTCACCTGTC", "sample_one_CTCCTATTGT", "sample_one_CTCGCAACGT", + "sample_one_CTCGTGCCTA", "sample_one_CTGGATTGAC", "sample_one_CTGTAGTCAG", + "sample_one_CTGTCGCTTC", "sample_one_CTGTCTGTGT", "sample_one_CTTCATATCG", + "sample_one_CTTGCTGACG", "sample_one_GAAGGATTAG", "sample_one_GAATCGAGCC", + "sample_one_GACCATCTAA", "sample_one_GACGACCACA", "sample_one_GAGACATCTT", + "sample_one_GAGCGAGTCA", "sample_one_GAGTAGACCA", "sample_one_GATACGCTTA", + "sample_one_GATAGACTGT", "sample_one_GATAGAGGCG", "sample_one_GATAGGTCAA", + "sample_one_GATATCAGGA", "sample_one_GATCTCATTC", "sample_one_GATCTGGTCG", + "sample_one_GATGAGTGAC", "sample_one_GATGGATACA", "sample_one_GATGTGACAG", + "sample_one_GATTAAGTCC", "sample_one_GATTGCACGC", "sample_one_GCAAGCGAAT", + "sample_one_GCAATGTAAG", "sample_one_GCACACTATA", "sample_one_GCACTCGGAA", + "sample_one_GCACTGCGTT", "sample_one_GCACTTAATC", "sample_one_GCAGGAGATG", + "sample_one_GCAGTACTGG", "sample_one_GCATATGAGT", "sample_one_GCATCCGATC", + "sample_one_GCCAAGTACA", "sample_one_GCCACGATTC", "sample_one_GCCATAGGTT", + "sample_one_GCCATATCGA", "sample_one_GCCGTCAATA", "sample_one_GCCTGGACAT", + "sample_one_GCGTAATTAC", "sample_one_GCTATTATCC", "sample_one_GCTCAGTAAT", + "sample_one_GCTGCTTATA", "sample_one_GGAATAAGCA", "sample_one_GGACGATGCT", + "sample_one_GGCATCGTGA", "sample_one_GGCATTATTG", "sample_one_GGCCGAGATT", + "sample_one_GGCGCTATAA", "sample_one_GGCGTTAAGT", "sample_one_GGCTATTGAT", + "sample_one_GGCTGCTACT", "sample_one_GGTAATGTGT", "sample_one_GGTGGTTGGA", + "sample_one_GGTGTTCACC", "sample_one_GGTTAGATCT", "sample_one_GGTTATGGCG", + "sample_one_GGTTCACTGG", "sample_one_GGTTGTGCAA", "sample_one_GTAACCAGTA", + "sample_one_GTAACCTTGG", "sample_one_GTAAGAACCT", "sample_one_GTAAGGCTCC", + "sample_one_GTAATCCACG", "sample_one_GTATTGTGGA", "sample_one_GTCCGCATCA", + "sample_one_GTCCTTCGGT", "sample_one_GTCGCTCTCT", "sample_one_GTCGGTGACA", + "sample_one_GTCTCGAGTG", "sample_one_GTCTCTTAAG", "sample_one_GTCTTCCGAG", + "sample_one_GTGACTATAC", "sample_one_GTGGTTAATG", "sample_one_GTGTGCCTGT", + "sample_one_GTGTGTGTCC", "sample_one_GTTCATTGCC", "sample_one_GTTCCGGTGA", + "sample_one_GTTCGTCGAA", "sample_one_GTTGAATTGG", "sample_one_GTTGATCCGC", + "sample_one_GTTGTATGCT", "sample_one_TAACCGTAGC", "sample_one_TAACGTCGAT", + "sample_one_TAAGGTACGG", "sample_one_TACGGACATA", "sample_one_TACTACCGCC", + "sample_one_TACTGTCAAG", "sample_one_TAGCGAACGC", "sample_one_TAGCGCCAAC", + "sample_one_TAGGACGCCT", "sample_one_TAGGTTGCAA", "sample_one_TAGTAGTCTC", + "sample_one_TAGTCCGCTG", "sample_one_TAGTGGAACT", "sample_one_TATCATGCAG", + "sample_one_TATCGTTACG", "sample_one_TCAAGTGCAG", "sample_one_TCACAGATAC", + "sample_one_TCACCGCCTA", "sample_one_TCACGCCACT", "sample_one_TCACGTTGGC", + "sample_one_TCATTGTCCA", "sample_one_TCCACACTAG", "sample_one_TCCACGGTCA", + "sample_one_TCCACTCGCT", "sample_one_TCCGACTAAC", "sample_one_TCCGTTATCT", + "sample_one_TCCTAAGAGA", "sample_one_TCCTCTAGTA", "sample_one_TCGAAGCATT", + "sample_one_TCGAGAGAGC", "sample_one_TCGCACTTGA", "sample_one_TCGCCTACTG", + "sample_one_TCGCGTAGCA", "sample_one_TCGGCGTTAA", "sample_one_TCTACATCCG", + "sample_one_TCTCCACATT", "sample_one_TCTCTCCTAT", "sample_one_TCTTGCTCGG", + "sample_one_TGAACTAACC", "sample_one_TGAAGAAGGT", "sample_one_TGAGCGTTCC", + "sample_one_TGAGTACGTA", "sample_one_TGGAATGGAG", "sample_one_TGTCATTCGC", + "sample_one_TGTGCTTCAG", "sample_one_TGTTCAGGAT", "sample_one_TTACACACGT", + "sample_one_TTACTGTGAC", "sample_one_TTATAGGAGG", "sample_one_TTATCGCGTT", + "sample_one_TTATGCCGCG", "sample_one_TTCACGGAAG", "sample_one_TTCAGGAGTA", + "sample_one_TTCCATCGAG", "sample_one_TTCGAGTGAT", "sample_one_TTCTGTACCT", + "sample_one_TTGGCAATTC", "sample_one_TTGGCTCCAC", "sample_one_TTGGTAACAG", + "sample_one_TTGGTCAGTA", "sample_one_TTGTCGGCCA", "sample_one_TTGTGTTCGA" +) +stopifnot(identical(sampleNames(sample_1_result), expected_sample_names)) + +expected_var_labels <- c( + "WellBC", + "WellID", + "NumberOfMTReads", + "pctMT", + "NumberOfERCCReads", + "pctERCC", + "NumberOfChromReads", + "pctChrom", + "NumberOfInputReads", + "NumberOfMappedReads", + "PctMappedReads", + "NumberOfReadsMappedToMultipleLoci", + "PectOfReadsMappedToMultipleLoci", + "NumberOfReadsMappedToTooManyLoci", + "PectOfReadsMappedToTooManyLoci", + "NumberOfReadsUnmappedTooManyMismatches", + "PectOfReadsUnmappedTooManyMismatches", + "NumberOfReadsUnmappedTooShort", + "PectOfReadsUnmappedTooShort", + "NumberOfReadsUnmappedOther", + "PectOfReadsUnmappedOther", + "ReadsWithValidBarcodes", + "SequencingSaturation", + "Q30BasesInCB.UMI", + "ReadsMappedToTranscriptome.Unique.MultipeGenes", + "EstimatedNumberOfCells", + "FractionOfReadsInCells", + "MeanReadsPerCell", + "NumberOfUMIs", + "NumberOfGenes", + "NumberOfCountedReads", + "PoolName" +) +stopifnot(identical(varLabels(sample_1_result), expected_var_labels)) + +read_mm <- function(mapping_dir) { + market_matrix_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "matrix.mtx") + result <- readMM(market_matrix_file) + feature_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "features.tsv") + features <- read.table(feature_file, sep = "\t", header = FALSE, + col.names = c("ID", "Name", "Type"))$ID + rownames(result) <- gsub("\\.\\d+$", "", features) + barcodes_file <- file.path(mapping_dir, + "Solo.out", "Gene", "raw", "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + mapping_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", mapping_dir)) + } + colnames(result) <- paste0("sample_one_", barcodes) + return(result) +} +expected_matrices <- lapply(par$star_output, read_mm) +expected_matrix <- as.matrix(do.call(cbind, expected_matrices)) +result_counts <- exprs(sample_1_result) +stopifnot(length(setdiff(colnames(expected_matrix), + colnames(exprs(sample_1_result)))) == 0) +stopifnot(length(setdiff(rownames(expected_matrix), + rownames(exprs(sample_1_result)))) == 0) +expected_matrix_sorted <- expected_matrix[, colnames(exprs(sample_1_result))] +stopifnot(identical(exprs(sample_1_result), expected_matrix_sorted)) \ No newline at end of file diff --git a/src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml b/src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml new file mode 100644 index 00000000..5c142972 --- /dev/null +++ b/src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml @@ -0,0 +1,41 @@ +name: "check_cutadapt_output" +namespace: "integration_test_components/well_demultiplexing" +description: "This component test the cutadapt output from the well_demultiplex subworkflow." +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: Inputs + arguments: + - name: "--fastq_r1" + type: file + required: true + multiple: true + description: Path to the forward reads to test. + - name: "--fastq_r2" + type: file + required: true + multiple: true + description: Path to the reverse reads to test. + - name: "--ids" + type: string + description: "Well IDs for the corresponding fastq input" + required: true + multiple: true +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - dnaio + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/integration_test_components/well_demultiplexing/check_cutatapt_output/script.py b/src/integration_test_components/well_demultiplexing/check_cutatapt_output/script.py new file mode 100644 index 00000000..868024eb --- /dev/null +++ b/src/integration_test_components/well_demultiplexing/check_cutatapt_output/script.py @@ -0,0 +1,78 @@ +import dnaio +from operator import itemgetter +## VIASH START +par = { +} +## VIASH END + +def assert_number_of_reads(reads): + expected_number_of_reads = { + "SRR14730301__A1": 165, + "SRR14730301__B1": 194, + "SRR14730302__A1": 141, + "SRR14730302__B1": 213, + "SRR14730302__unknown": 99646, + "SRR14730301__unknown": 99641, + } + for input_id, expected_reads in expected_number_of_reads.items(): + num_reads = len(reads[input_id]) + assert num_reads == expected_reads, \ + f"Expected number of ouput reads for {input_id} to be {expected_reads}, was {num_reads}." + + +def string_difference(string1, string2): + result = 0 + for char1, char2 in zip(string1, string2, strict=True): + if char1.lower() != char2.lower(): + result += 1 + return result + + +def assert_barcodes_not_removed(reads): + barcodes = { + "SRR14730301__A1": "ACACCGAATT", + "SRR14730302__A1": "ACACCGAATT", + "SRR14730301__B1": "GGCTATTGAT", + "SRR14730302__B1": "GGCTATTGAT" + } + for sample_id, barcode in barcodes.items(): + sample_reads = reads[sample_id] + forward_reads = map(itemgetter(0), sample_reads) + for i, forward_read in enumerate(forward_reads): + read_sequence = forward_read.sequence + read_barcode_start = read_sequence[: len(barcode)] + # A 10% difference is allowed. + assert string_difference(read_barcode_start, barcode) <= (0.1 * len(barcode)), \ + (f"Expected barcode {barcode} to be present for sample {sample_id} " + f"in read {i}. Found {read_barcode_start}") + +def create_input_mapping(sample_ids, inputs_r1, inputs_r2): + return {sample_id: [input_r1, input_r2] + for sample_id, input_r1, input_r2 + in zip(sample_ids, inputs_r1, inputs_r2, strict=True)} + +def read_input_files(input_mapping): + expected_keys = {"SRR14730301__A1", "SRR14730301__B1", + "SRR14730302__A1", "SRR14730302__B1", + "SRR14730301__unknown", "SRR14730302__unknown"} + difference = set(input_mapping.keys()) - expected_keys + assert not difference, f"Found unexpected output id(s): {difference}" + result = {} + for input_id, input_files in input_mapping.items(): + input_r1, input_r2 = input_files + # This reads the files into memory, + # but they are reasonably small + with dnaio.open(input_r1) as r1_reads, dnaio.open(input_r2) as r2_reads: + for r1_read, r2_read in zip(r1_reads, r2_reads, strict=True): + result.setdefault(input_id, []).append((r1_read, r2_read)) + return result + + +def main(par): + inputs = create_input_mapping(par["ids"], par["fastq_r1"], par["fastq_r2"]) + reads = read_input_files(inputs) + assert_number_of_reads(reads) + assert_barcodes_not_removed(reads) + +if __name__ == "__main__": + main(par) \ No newline at end of file diff --git a/src/io/publish_fastqs/code.sh b/src/io/publish_fastqs/code.sh new file mode 100755 index 00000000..4ee3c658 --- /dev/null +++ b/src/io/publish_fastqs/code.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +echo "Publishing $par_input -> $par_output" + +echo +echo "Creating directory if it does not exist:" +mkdir -p "$par_output" && echo "$par_output created" + +echo +echo "Copying files..." +IFS=";" read -ra input <<<$par_input + +for i in "${input[@]}"; do + cp -rL "$i" "$par_output/" +done \ No newline at end of file diff --git a/src/io/publish_fastqs/config.vsh.yaml b/src/io/publish_fastqs/config.vsh.yaml new file mode 100644 index 00000000..580a80e9 --- /dev/null +++ b/src/io/publish_fastqs/config.vsh.yaml @@ -0,0 +1,34 @@ +name: "publish_fastqs" +namespace: "io" +description: "Publish the fastq files per well" +argument_groups: + - name: Input arguments + arguments: + - name: --input + description: Directory to write fastq data to + type: file + multiple: true + required: true + - name: Output arguments + arguments: + - name: --output + type: file + direction: output + # ID is the well barcode + default: "$id/" + +resources: + - type: bash_script + path: ./code.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: + - procps + +runners: + - type: executable + - type: nextflow diff --git a/src/io/publish_results/code.sh b/src/io/publish_results/code.sh new file mode 100755 index 00000000..62349343 --- /dev/null +++ b/src/io/publish_results/code.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +set -eo pipefail + +echo "Publishing results to multiple output directories" + +# Create output directories for multiple files +echo "Creating output directories..." + +path_pars=( + par_star_output_dir + par_nrReadsNrGenesPerChrom_dir + par_star_qc_metrics_dir + par_eset_dir + par_f_data_dir + par_p_data_dir + par_html_report_output + par_run_params_output +) + +for par in ${path_pars[@]}; do + curr_val="${!par}" + new_value=$(realpath --canonicalize-missing "$curr_val") + declare -g "$par=$new_value" +done + +mkdir -p "$par_star_output_dir" && echo "$par_star_output_dir created" +mkdir -p "$par_nrReadsNrGenesPerChrom_dir" && echo "$par_nrReadsNrGenesPerChrom_dir created" +mkdir -p "$par_star_qc_metrics_dir" && echo "$par_star_qc_metrics_dir created" +mkdir -p "$par_eset_dir" && echo "$par_eset_dir created" +mkdir -p "$par_f_data_dir" && echo "$par_f_data_dir created" +mkdir -p "$par_p_data_dir" && echo "$par_p_data_dir created" + +echo +echo "Copying STAR output files..." +IFS=";" read -ra star_output <<<$par_star_output +for i in "${star_output[@]}"; do + echo "Copying $i to $par_star_output_dir/" + cp -rL "$i" "$par_star_output_dir/" +done + +echo +echo "Copying nrReadsNrGenesPerChrom files..." +IFS=";" read -ra nrReadsNrGenesPerChrom <<<$par_nrReadsNrGenesPerChrom +for i in "${nrReadsNrGenesPerChrom[@]}"; do + echo "Copying $i to $par_nrReadsNrGenesPerChrom_dir/" + cp -rL "$i" "$par_nrReadsNrGenesPerChrom_dir/" +done + +echo +echo "Copying STAR QC metrics files..." +IFS=";" read -ra star_qc_metrics <<<$par_star_qc_metrics +for i in "${star_qc_metrics[@]}"; do + echo "Copying $i to $par_star_qc_metrics_dir/" + cp -rL "$i" "$par_star_qc_metrics_dir/" +done + +echo +echo "Copying eset files..." +IFS=";" read -ra eset <<<$par_eset +for i in "${eset[@]}"; do + echo "Copying $i to $par_eset_dir/" + cp -rL "$i" "$par_eset_dir/" +done + +echo +echo "Copying f_data files..." +IFS=";" read -ra f_data <<<$par_f_data +for i in "${f_data[@]}"; do + echo "Copying $i to $par_f_data_dir/" + cp -rL "$i" "$par_f_data_dir/" +done + +echo +echo "Copying p_data files..." +IFS=";" read -ra p_data <<<$par_p_data +for i in "${p_data[@]}"; do + echo "Copying $i to $par_p_data_dir/" + cp -rL "$i" "$par_p_data_dir/" +done + +echo +echo "Copying single files directly..." +mkdir -p $(dirname "$par_html_report_output") +echo "Copying $par_html_report to $par_html_report_output" +cp -L "$par_html_report" "$par_html_report_output" + +echo "Copying $par_run_params to $par_run_params_output" +mkdir -p $(dirname "$par_run_params_output") +cp -L "$par_run_params" "$par_run_params_output" + +echo +echo "Publishing completed successfully!" diff --git a/src/io/publish_results/config.vsh.yaml b/src/io/publish_results/config.vsh.yaml new file mode 100644 index 00000000..6da4c312 --- /dev/null +++ b/src/io/publish_results/config.vsh.yaml @@ -0,0 +1,91 @@ +name: "publish_results" +namespace: "io" +description: "Publish the results" +argument_groups: + - name: Input arguments + arguments: + - name: --star_output + description: Output from mapping with STAR + type: file + multiple: true + required: true + - name: "--nrReadsNrGenesPerChrom" + type: file + multiple: true + required: true + - name: "--star_qc_metrics" + type: file + multiple: true + required: true + - name: "--eset" + type: file + multiple: true + required: true + - name: "--f_data" + type: file + multiple: true + required: true + - name: "--p_data" + type: file + multiple: true + required: true + - name: "--html_report" + type: file + required: true + - name: "--run_params" + type: file + required: true + - name: Output directory + description: | + Determines the name of output directories + arguments: + - name: --star_output_dir + type: file + direction: output + default: "star_output" + - name: --nrReadsNrGenesPerChrom_dir + type: file + direction: output + default: "nrReadsNrGenesPerChrom" + - name: --star_qc_metrics_dir + type: file + direction: output + default: "starLogs" + - name: --eset_dir + type: file + direction: output + default: "esets" + - name: --f_data_dir + type: file + direction: output + default: "fData" + - name: --p_data_dir + type: file + direction: output + default: "pData" + - name: "Output file arguments" + description: Determines the name of output files + arguments: + - name: "--run_params_output" + type: file + direction: output + - name: "--html_report_output" + type: file + direction: output + + +resources: + - type: bash_script + path: ./code.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: + - procps + +runners: + - type: executable + - type: nextflow diff --git a/src/parallel_map/STAR b/src/parallel_map/STAR new file mode 100755 index 00000000..331a838d Binary files /dev/null and b/src/parallel_map/STAR differ diff --git a/src/parallel_map/config.vsh.yaml b/src/parallel_map/config.vsh.yaml new file mode 100644 index 00000000..185c6c93 --- /dev/null +++ b/src/parallel_map/config.vsh.yaml @@ -0,0 +1,124 @@ +name: parallel_map +description: | + Map wells in batch, using STAR + Spliced Transcripts Alignment to a Reference (C) Alexander Dobin + https://github.com/alexdobin/STAR +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/toni_verbeiren.yaml + roles: [ author, maintainer ] +requirements: + commands: + - STAR + - file + - parallel +argument_groups: + - name: Input arguments + arguments: + - name: "--input_r1" + description: | + Input FASTQ files for the forward reads. All FASTQ file names must start with the prefix '{well_id}_R1', where + 'well_id' can be found as the sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument). + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided to the 'input_r2' argument, + meaning that their 'well_id' prefix must match. The number of items provided for 'input_r1' must be equal + to the number of items for 'input_r2'. + type: file + required: true + multiple: true + - name: "--input_r2" + description: | + Input FASTQ files for the reverse reads. All FASTQ file names must start with the prefix '{well_id}_R2', where + 'well_id' can be found as the sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument). + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided to the 'input_r1' argument, + meaning that their 'well_id' prefix must match. The number of items provided for 'input_r1' must be equal + to the number of items for 'input_r2'. + type: file + required: true + multiple: true + - name: "--genomeDir" + description: | + Reference genome to match to. Can be generated from genomic FASTA sequences and a genome annotation + by using STAR with '--runMode genomeGenerate'. + type: file + required: true + - name: "--barcodesFasta" + type: file + required: true + description: | + FASTA file where each entry specifies a unique barcode sequence present at the start of the forward input reads + (input_r1). The IDs of each barcode (the start of the FASTA headers up until the first whitespace character) must + match with the start of one input FASTQ pair. + - name: Barcode arguments + arguments: + - name: "--umiLength" + type: integer + required: true + description: | + Length of the Unique Molecular Identifiers (UMI). The UMI are expected to be located after the barcodes in the + forwards reads. + - name: "--limitBAMsortRAM" + type: string + default: "10000000000" + - name: Runtime arguments + arguments: + - name: "--runThreadN" + description: "Number of threads to use for a single STAR execution." + type: integer + default: 1 + - name: Output arguments + arguments: + - name: "--output" + type: file + description: | + A list of output folders which are the result of using STAR to map each input FASTQ pair STAR to the reference genome. + The order of the items DO NOT match with the order of the entries in the barcodes FASTA file or the input FASTQ pairs. + required: true + multiple: true + direction: output + default: './*' + - name: "--joblog" + type: file + description: Where to store the log file listing all the jobs. + required: false + direction: output + default: "execution_log.txt" + +resources: +- type: bash_script + path: script.sh +- path: STAR + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: debian:stable-slim + setup: + - type: apt + packages: + - procps + - wget + - automake + - make + - gcc + - g++ + - zlib1g-dev + - parallel + - file + - seqkit + - type: docker + build_args: + - STAR_V=2.7.6a + env: + - STAR_SOURCE="https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz" + - STAR_TARGET="/app/star-$STAR_V.tar.gz" + - STAR_INSTALL_DIR="/app/STAR-$STAR_V" + - STAR_BINARY=STAR + copy: + - STAR /usr/local/bin/$STAR_BINARY +runners: + - type: executable + - type: nextflow diff --git a/src/parallel_map/script.sh b/src/parallel_map/script.sh new file mode 100755 index 00000000..a2c489b0 --- /dev/null +++ b/src/parallel_map/script.sh @@ -0,0 +1,342 @@ +#!/bin/bash + +## VIASH START +par_input_r1="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r1_1/two__ACACCGAATT.concat_text_r1.output.txt" +par_input_r2="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r2_1/two__ACACCGAATT.concat_text_r2.output.txt" +par_barcodes="ACACCGAATT;GGCTATTGAT" +par_output="./*" +par_genomeDir="star" +par_umiLength=10 +par_limitBAMsortRAM="10000000000" +meta_cpus=2 +par_runThreadN=1 +## VIASH END + +set -eo pipefail + +# Check if wildcard character is present in output folder template +printf "Checking if output folder template ($par_output) contains a single wildcard character '*'. " +output_glob_character="${par_output//[^\*]}" +if [[ "${#output_glob_character}" -ne "1" ]]; then + echo "The value for --output must contain exactly one '*' character. Exiting..." + exit 1 +else + echo "Done, wildcard character found!" +fi + +# Split the delimited strings into arrays +IFS=';' read -r -a input_r1 <<< "$par_input_r1" +IFS=';' read -r -a input_r2 <<< "$par_input_r2" + +# Read barcodes FASTQ +# seqkit will make sure to take the leading non-whitespace as sequence identifier (ID) +# Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header. +readarray -t well_ids < <(seqkit seq --name "$par_barcodesFasta" ) +readarray -t barcodes < <(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "$par_barcodesFasta") + +# Function to test for unique values in array +function arrayContainsUniqueValues { + # Pass the argument by reference + local -n arr=$1 + # Create a temporary associative array + # in order to use its uniqueness of keys + # 'declare' in a function is automatically local + declare -A uniq_tmp + for item in "${arr[@]}"; do + uniq_tmp[$item]=0 # assigning a placeholder + done + local unique_array_values=(${!uniq_tmp[@]}) + if [ "${#unique_array_values[@]}" -eq "${#arr[@]}" ]; then + return + fi + false +} +arrayContainsUniqueValues barcodes +is_array_unique_exit_code=$? +if ! (exit $is_array_unique_exit_code); then + echo "The provided barcodes should be unique!" + echo "Values: $par_barcodes" + exit 1 +fi + +# Check that the number of values provided for the fastq files are the same. +num_r1_inputs="${#input_r1[@]}" +num_r2_inputs="${#input_r2[@]}" + +if [ ! "$num_r1_inputs" -eq "$num_r2_inputs" ]; then + echo "The number of values for arguments "\ + "'input_r1' ($num_r1_inputs) and 'input_r2' ($num_r2_inputs) "\ + "should be the same." + exit 1 +else + echo "Checked if the same as the number of R1 FASTQ ($num_r1_inputs) and R2 FASTQ files "\ + "($num_r2_inputs) were provided. Seems OK!" +fi + +# Loop over the well IDs and match them to the input FASTQ files +# The FASTQ file names should have the format {well_id}_R(1|2).fastq, +# which is the output format that the cutadapt component uses for demultiplexing. +# sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order +# of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file). +declare -a sorted_input_r1=() +declare -a sorted_input_r2=() +for barcode_index in "${!barcodes[@]}"; do + barcode="${barcodes[$barcode_index]}" + well_id="${well_ids[$barcode_index]}" + echo "Finding FASTQ files for barcode ${barcode}, well ID '${well_id}'." + # The FASTQ files for a particular barcode must match the following regex: + input_file_regex="^${well_id}_R[1-2]" + + for r1_index in "${!input_r1[@]}"; do + r1_file_path=${input_r1[$r1_index]} + r2_file_path=${input_r2[$r1_index]} + # Get the file names from the full path + r1_file_name=$(basename -- "$r1_file_path") + r2_file_name=$(basename -- "$r2_file_path") + + # Check if the file names match the regex + if [[ $r1_file_name =~ $input_file_regex ]]; then + echo "Matched with $r1_file_name and $r2_file_name." + # If the R1 FASTQ file matched the regex, + # the R2 file must have also been matched + if ! [[ $r2_file_name =~ $input_file_regex ]]; then + echo "File ${r1_file_name} matched with regex ${input_file_regex} "\ + "but ${r2_file_name} did not! Make sure that the order of "\ + "the R1 and R2 input files match." + exit 1 + fi + # Add the + sorted_input_r1+=("$r1_file_path") + sorted_input_r2+=("$r2_file_path") + # Do not continue looking for more files for this barcode + # '2' to affect the *outer* loop (which indeed loops barcodes)! + continue 2 + fi + done + echo "Did not find FASTQ files files for well ${well_id}! "\ + "Make sure that the input files have the correct file name format."\ + "Input files: ${input_r1[@]}" + exit 1 +done + + +# Define the function that will be used to run a single job +function _run() { + local par_UMIlength="$1" + local par_output="$2" + local par_genomeDir="$3" + local par_limitBAMsortRAM="$4" + local par_runThreadN="$5" + local barcode="$6" + local input_R1="$7" + local input_R2="$8" + + local barcode_length="${#barcode}" + local umi_start="$(($barcode_length + 1))" + + set -eo pipefail + + echo <<-EOF + Processing $barcode + For the following inputs (lanes): + "$star_readFilesIn + EOF + + echo "Writing barcode '$barcode' to $barcode.txt and using it as input". + # Note that there is no possible conflict between jobs here + # because the barcodes are unique (and the barcode is part of the name + # of the file). + echo "$barcode" > "$barcode.txt" + + local dir="${par_output//\*/$barcode}/" + echo "Setting output for barcode '$barcode' to '$dir'." + mkdir -p "$dir" + + # check if files are compressed + local TMPDIR=$(mktemp -d "$meta_temp_dir/parallel_map-$barcode-XXXXXX") + function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" + } + trap clean_up RETURN + + # Decompress the input files when needed + # NOTE: for some reason, using STAR's --readFilesCommand does not always work + # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html) + # and this requires a filesystem that supports this. Another cause might be that the input files + # are symlinks. When testing this, using '--readFilesCommand "zcat"' + # always produced empty BAM files, but also a succesfull exit code (0) so the problem is not reported. + # However, the logs showed the following error: "gzip -: unexpected end of file". + + function is_gzipped { + printf "Checking if input '$1' (barcode '$barcode') is gzipped... " + if file "$1" | grep -q 'gzip'; then + echo "Done, detected compressed file." + return + fi + echo "Done, file does not need decompression." + false + } + + # Resolve symbolic links to actual file paths + input_R1=$(realpath $input_R1) + input_R2=$(realpath $input_R2) + + if is_gzipped $input_R1; then + local compressed_file_name_r1="$(basename -- $input_R1)" + local uncompressed_file_r1="$TMPDIR/${compressed_file_name_r1%.gz}" + printf "Unpacking input to $uncompressed_file_r1... " + zcat "$input_R1" > "$uncompressed_file_r1" + echo "Decompression done." + else + local uncompressed_file_r1="$input_R1" + fi + + if is_gzipped $input_R2; then + local compressed_file_name_r2="$(basename -- $input_R2)" + local uncompressed_file_r2="$TMPDIR/${compressed_file_name_r2%.gz}" + printf "Unpacking input to $uncompressed_file_r2... " + zcat "$input_R2" > "$uncompressed_file_r2" + echo "Decompression done." + else + local uncompressed_file_r2="$input_R2" + fi + + local n_input_lines_r1=$(wc -l < "$uncompressed_file_r1") + local n_input_lines_r2=$(wc -l < "$uncompressed_file_r2") + + printf "Checking if length of input file mates match. " + if (( $n_input_lines_r1 != n_input_lines_r2 )); then + echo "The length of file $input_R1 ($n_input_lines_r1) does not match with $input_R2 ($n_input_lines_r2)" + return 1 + else + echo "Seems OK, $n_input_lines_r1 input lines." + fi + echo "Starting STAR for barcode '$barcode'" + # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length. + # By default in this mode, STAR will look for the cell barcode and the UMI int the last files specified with --readFilesIn + # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI. + # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires the clipping + # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do. + STAR \ + --readFilesIn "$uncompressed_file_r2" "$uncompressed_file_r1" \ + --soloType Droplet \ + --quantMode GeneCounts \ + --genomeLoad LoadAndKeep \ + --limitBAMsortRAM "$par_limitBAMsortRAM" \ + --runThreadN "$par_runThreadN" \ + --outFilterMultimapNmax 1 \ + --outSAMtype BAM SortedByCoordinate \ + --soloCBstart 1 \ + --readFilesType "Fastx" \ + --soloCBlen "$barcode_length" \ + --soloUMIstart "$umi_start" \ + --soloUMIlen "$par_UMIlength" \ + --soloBarcodeReadLength 0 \ + --soloStrand Unstranded \ + --soloFeatures Gene \ + --genomeDir "$par_genomeDir" \ + --outReadsUnmapped Fastx \ + --outSAMunmapped Within \ + --outSAMattributes NH HI nM AS CR UR CB UB GX GN \ + --soloCBwhitelist "$barcode.txt" \ + --outFileNamePrefix "$dir" \ + --outTmpDir "$TMPDIR/STARtemp/" + + printf "Done running STAR. " + # Check if the number of processed reads is equal to the number of input reads + local n_input_reads=$(($n_input_lines_r1 / 4)) + local nr_output_reads=$(grep -Po "Number\ of\ input\ reads \\|\W*\K\d+" "$dir/Log.final.out") + if (( $nr_output_reads != $n_input_reads )); then + echo "Not all input reads were processed for barcode $barcode." + return 1 + else + echo "Processed $nr_output_reads reads for barcode $barcode". + fi + + printf "Making sure that the output has the proper permissions." + find "$dir" -type d -exec chmod o+x {} \; + chmod -R o+r "$dir" + echo "Done" +} + +# Export the function - requires bash +export -f _run + +# Load reference genome +echo "Loading reference genome" +STAR --genomeLoad LoadAndExit --genomeDir "$par_genomeDir" + +# Run the concurrent jobs using GNU parallel + +# Make sure that parallel uses the correct shell +export PARALLEL_SHELL="/bin/bash" + +# Some notes: +# --halt now,fail=1: instruct parallel to exit when a job has failed and kill remaining running jobs. +# +# ::: is a special syntax for GNU parallel to delineate inputs +# If multiple ::: are given, each group will be treated as an input source, and all combinations of input +# sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c) +# The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input +# sources will be read. +parallel_cmd=("parallel" "--jobs" "80%" "--verbose" "--memfree" "2G" + "--tmpdir" "$meta_temp_dir" + "--retry-failed" "--retries" "4" "--halt" "soon,fail=1" + "--joblog" "$par_joblog" "_run" "{}") + +# Arguments for which there is one value, so these will not create extra jobs +parallel_cmd+=(":::" "$par_umiLength" ":::" "$par_output" ":::" "$par_genomeDir" ":::" "$par_limitBAMsortRAM" ":::" "$par_runThreadN") + +# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected +# Thus, these argument lists should have the same length. +parallel_cmd+=(":::" "${barcodes[@]}" ":::+" "${sorted_input_r1[@]}" ":::+" "${sorted_input_r2[@]}") + +set +eo pipefail +"${parallel_cmd[@]}" +exit_code=$? +set -eo pipefail + +echo "GNU parallel finished!" + +# Unload reference +printf "Unloading reference genome. " +STAR --genomeLoad Remove --genomeDir "$par_genomeDir" +echo "Done!" + +# Exit code from GNU parallel: +# If fail=1 is used, the exit status will be the exit status of the failing job. +echo "Checking exit code" +if ((exit_code>0)); then + # Note that the ending HERE must be indented with TAB characters (not spaces) + # in order to remove leading indentation + MESSAGE=$( + cat <<-HERE + ================================================================== + + !!! An error occurred for one of the jobs. + Exit code of the failing job: $exit_code. + + %s + + ================================================================== + + HERE + ) + printf "$MESSAGE" "$(<$par_joblog)" + exit 1 +else + cat <<-HERE + ================================================================== + + Mapping went fine (exit code '$exit_code'), zero errors occurred + + ================================================================== + HERE + +fi + + + + + + diff --git a/src/parallel_map/test.sh b/src/parallel_map/test.sh new file mode 100755 index 00000000..c1001b07 --- /dev/null +++ b/src/parallel_map/test.sh @@ -0,0 +1,466 @@ +set -eo pipefail + +## VIASH START +meta_executable=$(realpath "target/executable/parallel_map/parallel_map") +## VIASH END + +# Some helper functions +assert_directory_exists() { + [ -d "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} + +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} + +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} + +assert_file_contains_regex() { + grep -q -E "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} + +echo "> Prepare test data in $meta_temp_dir" +TMPDIR=$(mktemp -d --tmpdir="$meta_temp_dir") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" +} +trap clean_up EXIT + +# Sample 1, barcode ACAGTCACAG, UMI CTACGGATGA +cat > "$TMPDIR/sample1_R1.fastq" <<'EOF' +@SAMPLE_1_SEQ_ID1 +ACAGTCACAGCTACGGATGAGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@SAMPLE_1_SEQ_ID2 +ACAGTCACAGCTACGGATGAGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAG ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > "$TMPDIR/sample1_R2.fastq" <<'EOF' +@SAMPLE_1_SEQ_ID1 +CTCACAGAGAGTCACAACATAGGCGCGGATGTGTGAGGCTTATGAGGC ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@SAMPLE_1_SEQ_ID2 +CTCACAGAGAGTCACAACATAGGCGCGGATGTGTGAGGCTTATGAGGC ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +# Sample 2, barcode CGGGTTTACC, UMI GCTAGCTAGC +cat > "$TMPDIR/sample2_R1.fastq" << 'EOF' +@SAMPLE_2_SEQ_ID1 +CGGGTTTACCGCTAGCTAGCCACCACTATGGTTGGCCGGTTAGTAGTGT ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII +@SAMPLE_2_SEQ_ID2 +CGGGTTTACCGCTAGCTAGCCACCACTATGGTTGGCCGGTTAGTAGTGT ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > "$TMPDIR/sample2_R2.fastq" <<'EOF' +@SAMPLE_2_SEQ_ID1 +ACACTACTAACCGGCCAACCATAGTGGTG ++ +IIIIIIIIIIIIIIIIIIIIIIIIIIIII +@SAMPLE_2_SEQ_ID2 +ACACTACTAACCGGCCAACCATAGTGGTG ++ +!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > "$TMPDIR/barcodes.fasta" <<'EOF' +>sample1 +ACAGTCACAG +>sample2 +CGGGTTTACC +EOF + +# Note that there is a sjdbGTFchrPrefix argument for STAR: +# prefix for chromosome names in a GTF file (default: '-') +cat > "$TMPDIR/genome.fasta" <<'EOF' +>1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA +EOF + +cat > "$TMPDIR/genes.gtf" <<'EOF' +1 example_source gene 0 72 . + . gene_id "gene1"; gene_name: "GENE1; +1 example_source exon 20 71 . + . gene_id "gene1"; gene_name: "GENE1"; exon_id: gene1_exon1; +1 example_source gene 80 160 . + . gene_id "gene2"; gene_name: "GENE2; +1 example_source exon 80 159 . + . gene_id "gene2"; gene_name: "GENE2"; exon_id: gene2_exon1; + +EOF + +echo "> Generate index" +STAR \ + ${meta_cpus:+--runThreadN $meta_cpus} \ + --runMode genomeGenerate \ + --genomeDir "$TMPDIR/index/" \ + --genomeFastaFiles "$TMPDIR/genome.fasta" \ + --sjdbGTFfile "$TMPDIR/genes.gtf" \ + --genomeSAindexNbases 2 > /dev/null 2>&1 + + +echo "> Run test 1" +run_1_dir="$TMPDIR/run_1" +mkdir -p "$run_1_dir" +pushd "$run_1_dir" > /dev/null +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq;$TMPDIR/sample2_R1.fastq" \ + --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_*" +popd + +echo ">> Check if output directories exists" +sample1_out="$TMPDIR/output_ACAGTCACAG" +sample2_out="$TMPDIR/output_CGGGTTTACC" +assert_directory_exists "$sample1_out" +assert_directory_exists "$sample2_out" + +echo ">> Check if output files have been created" +for sample in "$sample1_out" "$sample2_out"; do + assert_file_exists "$sample/Aligned.sortedByCoord.out.bam" + assert_file_exists "$sample/Unmapped.out.mate1" + assert_file_exists "$sample/Unmapped.out.mate2" + assert_file_exists "$sample/Log.out" + assert_file_exists "$sample/Log.final.out" + assert_file_exists "$sample/ReadsPerGene.out.tab" +done + + +echo ">> Check if Solo output is present" +for sample in "$sample1_out" "$sample2_out"; do + assert_directory_exists "$sample1_out/Solo.out" + assert_directory_exists "$sample1_out/Solo.out/Gene" + assert_file_exists "$sample1_out/Solo.out/Barcodes.stats" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/matrix.mtx" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" +done + +echo ">> Check contents of output" +echo ">>> Sample 1" +assert_file_contains "$sample1_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample1_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample1_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +ACAGTCACAG +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +1 1 1 +EOF + +echo ">>> Sample 2" +assert_file_contains "$sample2_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample2_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample2_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +CGGGTTTACC +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +2 1 1 +EOF + +echo "> Run test 2 (compressed input)" +gzip -c "$TMPDIR/sample1_R1.fastq" > "$TMPDIR/sample1_R1.fastq.gz" +gzip -c "$TMPDIR/sample2_R1.fastq" > "$TMPDIR/sample2_R1.fastq.gz" +gzip -c "$TMPDIR/sample1_R2.fastq" > "$TMPDIR/sample1_R2.fastq.gz" +gzip -c "$TMPDIR/sample2_R2.fastq" > "$TMPDIR/sample2_R2.fastq.gz" + +run_2_dir="$TMPDIR/run_2" +mkdir -p "$run_2_dir" +pushd "$run_2_dir" > /dev/null +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \ + --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_gz_*" > /dev/null 2>&1 +popd > /dev/null + +echo ">> Check if output directories exists" +sample1_out="$TMPDIR/output_gz_ACAGTCACAG" +sample2_out="$TMPDIR/output_gz_CGGGTTTACC" +assert_directory_exists "$sample1_out" +assert_directory_exists "$sample2_out" + +echo ">> Check if output files have been created" +for sample in "$sample1_out" "$sample2_out"; do + assert_file_exists "$sample/Aligned.sortedByCoord.out.bam" + assert_file_exists "$sample/Unmapped.out.mate1" + assert_file_exists "$sample/Unmapped.out.mate2" + assert_file_exists "$sample/Log.out" + assert_file_exists "$sample/Log.final.out" + assert_file_exists "$sample/ReadsPerGene.out.tab" +done + + +echo ">> Check if Solo output is present" +for sample in "$sample1_out" "$sample2_out"; do + assert_directory_exists "$sample1_out/Solo.out" + assert_directory_exists "$sample1_out/Solo.out/Gene" + assert_file_exists "$sample1_out/Solo.out/Barcodes.stats" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/matrix.mtx" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" +done + +echo ">> Check contents of output" +echo ">>> Sample 1" +assert_file_contains "$sample1_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample1_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample1_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +ACAGTCACAG +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +1 1 1 +EOF + +echo ">>> Sample 2" +assert_file_contains "$sample2_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample2_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample2_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +CGGGTTTACC +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +2 1 1 +EOF + + +cat > "$TMPDIR/wrong_number_of_barcodes.fasta" <<'EOF' +>A1 +ACAGTCACAG +EOF + +echo "> Check that wrong number of barcodes are detected." +run_3_dir="$TMPDIR/run_3" +mkdir -p "$run_3_dir" +pushd "$run_3_dir" > /dev/null +set +eo pipefail +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \ + --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/wrong_number_of_barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_gz_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1 +set -eo pipefail +popd > /dev/null + +echo "> Check that missing wildcard character is detected." +run_4_dir="$TMPDIR/run_4" +mkdir -p "$run_4_dir" +pushd "$run_4_dir" > /dev/null +set +eo pipefail +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq.gz;$TMPDIR/sample2_R1.fastq.gz" \ + --input_r2 "$TMPDIR/sample1_R2.fastq.gz;$TMPDIR/sample2_R2.fastq.gz" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_run4" > /dev/null 2>&1 && echo "Expected non-zero exit code." && exit 1 +set -eo pipefail +popd > /dev/null + +echo "> Check that a mismatch in the length of the input mates is detected." +empty_input_file="$TMPDIR/empty.fastq" +touch "$empty_input_file" +run_5_dir="$TMPDIR/run_5" +mkdir -p "$run_5_dir" +pushd "$run_5_dir" > /dev/null +set +eo pipefail +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq;$empty_input_file" \ + --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_run5_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1 +set -eo pipefail +popd > /dev/null + +echo "> Check that wrong number of input files is detected." +run_6_dir="$TMPDIR/run_6" +mkdir -p "$run_6_dir" +pushd "$run_6_dir" > /dev/null +set +eo pipefail +"$meta_executable" \ + --input_r1 "$TMPDIR/sample1_R1.fastq" \ + --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_run_6_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1 +set -eo pipefail +popd > /dev/null + + +echo "> Check that wrong FASTQ order is detected." +run_6_dir="$TMPDIR/run_7" +mkdir -p "$run_6_dir" +pushd "$run_6_dir" > /dev/null +set +eo pipefail +"$meta_executable" \ + --input_r1 "$TMPDIR/sample2_R1.fastq.gz;$TMPDIR/sample1_R1.fastq.gz" \ + --input_r2 "$TMPDIR/sample1_R2.fastq;$TMPDIR/sample2_R2.fastq" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_run_6_*" > /dev/null 2>&1 && echo "Expected non-zero exit code " && exit 1 +set -eo pipefail +popd > /dev/null + + +echo "> Check that order of input FASTQ files must not match with the order of barcodes" +run_8_dir="$TMPDIR/run_8" +mkdir -p "$run_8_dir" +pushd "$run_8_dir" > /dev/null +"$meta_executable" \ + --input_r1 "$TMPDIR/sample2_R1.fastq;$TMPDIR/sample1_R1.fastq" \ + --input_r2 "$TMPDIR/sample2_R2.fastq;$TMPDIR/sample1_R2.fastq" \ + --genomeDir "$TMPDIR/index/" \ + --barcodesFasta "$TMPDIR/barcodes.fasta" \ + --umiLength 10 \ + --runThreadN 2 \ + --output "$TMPDIR/output_*" > /dev/null 2>&1 +popd + +echo ">> Check if output directories exists" +sample1_out="$TMPDIR/output_ACAGTCACAG" +sample2_out="$TMPDIR/output_CGGGTTTACC" +assert_directory_exists "$sample1_out" +assert_directory_exists "$sample2_out" + +echo ">> Check if output files have been created" +for sample in "$sample1_out" "$sample2_out"; do + assert_file_exists "$sample/Aligned.sortedByCoord.out.bam" + assert_file_exists "$sample/Unmapped.out.mate1" + assert_file_exists "$sample/Unmapped.out.mate2" + assert_file_exists "$sample/Log.out" + assert_file_exists "$sample/Log.final.out" + assert_file_exists "$sample/ReadsPerGene.out.tab" +done + + +echo ">> Check if Solo output is present" +for sample in "$sample1_out" "$sample2_out"; do + assert_directory_exists "$sample1_out/Solo.out" + assert_directory_exists "$sample1_out/Solo.out/Gene" + assert_file_exists "$sample1_out/Solo.out/Barcodes.stats" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/raw/matrix.mtx" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/features.tsv" + assert_file_exists "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" +done + +echo ">> Check contents of output" +echo ">>> Sample 1" +assert_file_contains "$sample1_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample1_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample1_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +ACAGTCACAG +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample1_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +1 1 1 +EOF + +echo ">>> Sample 2" +assert_file_contains "$sample2_out/Solo.out/Barcodes.stats" "nExactMatch 2" +assert_file_contains "$sample2_out/Log.final.out" "Uniquely mapped reads number | 2" +assert_file_contains "$sample2_out/Log.final.out" "Number of input reads | 2" + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/barcodes.tsv" || { echo "Barcodes file is different"; exit 1; } +CGGGTTTACC +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/features.tsv" || { echo "Features file is different"; exit 1; } +gene1 gene1 Gene Expression +gene2 gene2 Gene Expression +EOF + +cat << EOF | cmp -s "$sample2_out/Solo.out/Gene/filtered/matrix.mtx" || { echo "Matrix file is different"; exit 1; } +%%MatrixMarket matrix coordinate integer general +% +2 1 1 +2 1 1 +EOF + diff --git a/src/report/OutputSTARsolo.png b/src/report/OutputSTARsolo.png new file mode 100644 index 00000000..cb77d8e0 Binary files /dev/null and b/src/report/OutputSTARsolo.png differ diff --git a/src/report/config.vsh.yaml b/src/report/config.vsh.yaml new file mode 100644 index 00000000..c00ca403 --- /dev/null +++ b/src/report/config.vsh.yaml @@ -0,0 +1,77 @@ +name: create_report +namespace: "report" +description: | + Create a basic QC report in HTML format based on a number of esets. +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Arguments" + arguments: + - type: file + name: "--eset" + required: true + multiple: true + - type: file + name: "--output_report" + required: true + direction: output + example: report.html +resources: + - type: r_script + path: script.R + - type: r_script + path: template.Rmd + - type: r_script + path: plateLayouts.R + - path: OutputSTARsolo.png + type: file +test_resources: + - type: r_script + path: test.R + - path: ./test_data +engines: + - type: docker + image: rocker/r2u:24.04 + setup: + - type: apt + packages: + - procps + - pandoc + - type: r + script: + - install.packages("BiocManager") + - BiocManager::install(version = "3.21", type = "source", checkBuilt = TRUE) + - type: r + bioc: + - Biobase + - ComplexHeatmap + cran: + - ggplot2 + - knitr + - gridExtra + - RColorBrewer + - processx + - whisker + - rmarkdown + - bookdown + - data.table + - platetools + - htmltools + - DT + - logger + - bit64 + script: + - install.packages("oaStyle", repos = c(rdepot = "https://repos.openanalytics.eu/repo/public", getOption("repos"))) + test_setup: + - type: r + packages: + - testthat + - R.utils +runners: + - type: executable + - type: nextflow + + \ No newline at end of file diff --git a/src/report/plateLayouts.R b/src/report/plateLayouts.R new file mode 100644 index 00000000..a2909d4a --- /dev/null +++ b/src/report/plateLayouts.R @@ -0,0 +1,447 @@ + +#' Displays the annotation of the wells in a plateLayout +#' @param plateData a data.table object containing the information +#' of the plate. This must contain a "WellID". +#' @param plateName The plate name +#' @param valueVariable The name of the variable in 'plateData' to +#' be visualized in a plate layout. +#' @param textVariable The name of the variable in 'plateData' to be +#' shown in the wells of the plate layout. If NULL, the valueVariable +#' is shown. +#' @param colours A named character vector containing the colours +#' for the different levels of the valuevariable. The names should +#' correspond to the dose levels. if not specified, a scheme of blues +#' will be provided. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows and +#' colums in a plate, e.g. \code{c(16,24)} +#' @param legend.title A title for the legend +#' @param plot.title A title for the plot, will be contracted +#' with the plate name +#' @param ... additional arguments for \code{plateLayout.default} function +#' @import data.table +#' @importFrom platetools fill_plate +#' @export +plateLayout.annotation <- function( + plateData, + plateName = character(), + valueVariable = "Dose", + textVariable = NULL, + breaks = NULL, colours = NULL, + colourWellText = "black", + layout = c(16, 24), + legend.title = "Dose", + plot.title = "Plate Annotation - ", + textFontSize = 9, ... +) { + WellID <- Label <- NULL + + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + #Check WellID Format + checkWellID <- grepl("^[[:upper:]]{1,2}[[:digit:]]{1,2}$", plateData$WellID) + if(!all(checkWellID)){ + stop("WellID does not have the correct format") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = layout[1]*layout[2]) + + plateData$column <- factor( + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + if (!is.null(valueVariable)){ + plateData[, values := as.character(plateData[, ..valueVariable][[1]])] + valueVar <- "values" + }else{ + plateData[, values := "grey"] + valueVar <- "values" + colours <- setNames("grey", "grey") + } + + + if (is.null(colours)) { + + blues <- colorRampPalette(c("#d6e0ff", "#2171B5")) + greens <- colorRampPalette(c("light green", "dark green")) + + numLevels <- sort(as.numeric(as.character(unique(plateData[, values])[ + grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[, values])) + ) + ]))) + otherLevels <- sort(as.character(unique(plateData[, values])[ + !grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[,values])) + ) + ])) + + colours <- c(blues(length(numLevels)), greens(length(otherLevels)), "red") + names(colours) <- c(numLevels, otherLevels, "failed") + } + + if (!is.null(textVariable)) { + plateData[, + Label := do.call(paste, c(.SD, sep = "\n ")), + .SDcols = textVariable + ] + plateData[, Label := gsub("-", "-\n", Label)] + plateData[, Label := gsub("_", "_\n", Label)] + textVar <- "Label" + } else { + textVar <- NULL + } + + + if (is.null(breaks)){ + breaks <- seq_len(length(colours)) + } + + plateLayout( + plateData = plateData, valueVariable = valueVar, + textVariable = textVar, plateName = plateName, + breaks = breaks, colourWellText = colourWellText, + legend.title = legend.title, layout = layout, + colours = colours, plot.title = plot.title, + textFontSize = textFontSize, ... + ) +} + + + +#' Create a heatmap of values in a plateLayout view. The values can be +#' library sizes, number of genes, qcScore (0/1) or a factor. +#' @param plateData A data.table of the values to be visualized with +#' at least the column of interest (specified in 'varOfInterest') +#' and a 'WellID' column indicating the wells in the plate. The WellID +#' is a combination of a letter (row in the plate) and an integer +#' (column in the plate). +#' @param valueVariable The name of the variable in 'plateData' +#' to be visualized in a plate layout +#' @param textVariable The name of the variable in 'plateData' +#' to be shown in the wells of the plate layout. Defaults to the +#' valueVariable and if NULL, no text will be displayed. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colours Colours to be used for levels specified by +#' the breaks. If NULL, a colour scheme of purples is shown. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows +#' and colums in a plate, e.g. \code{c(16,24)} +#' @param makeContourColours Logical, whether or not the plate +#' layout will contain a contour colours for the wells based on the +#' parameters in 'contourColours' and 'categories' +#' @param contourVariable The variable used for the contour colouring +#' @param contourColours Character vector specifying a colour for +#' each range in 'categories' +#' @param labelsCategories Character vector specifying the names +#' (labels) for each range in 'categories' +#' @param categories if contour Variable is not a factor, a numeric +#' vector specifying the categories to divide the 'varOfInterest', +#' including the lower and upper limits. +#' @param plateName The plate name +#' @param plot.title A title for the plot, will be contracted with +#' the plate name +#' @param legend.title A title for the legend +#' @param displayHeatmap Logical, whether to display the plateLayout heatmap +#' @param saveHeatmap Logical, whether to save the plateLayout heatmap +#' @param outputDir The directory where the plateLayout heatmap should be saved +#' @param prefix The prefix to the file name of the saved plateLayout heatmap +#' @param ... additional arguments for \code{ComplexHeatmap::Heatmap} function +#' @importFrom platetools fill_plate +#' @importFrom RColorBrewer brewer.pal +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom circlize colorRamp2 +#' @importFrom grid grid.text grid.rect gpar legendGrob gpar +#' @importFrom grDevices dev.off png +#' @importFrom graphics title +#' @export +plateLayout <- function( + plateData, valueVariable, textVariable = valueVariable, + breaks = NULL, colours = NULL, colourWellText = "white", textFontSize = 6, + layout = c(16, 24), makeContourColours = FALSE, contourVariable = character(), + contourColours = c("red", "orange", "seagreen3"),lwdContours = c(1, 1, 1), + labelsCategories = c('1', '2', '3'), categories = NULL, plateName = character(), + plot.title = character(), legend.title = NULL, legendFontSize = 15, + row_split = rep("A", 16), col_split = rep("A", 24), legendFontSizeTitle = 15, + displayHeatmap = TRUE, saveHeatmap = FALSE, outputDir = ".", prefix = "" +) { + WellID <- NULL + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = 384) + + plateData$column <- factor( + sprintf("%02d", as.numeric( + sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID) + )), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + + plateValues <- plateLayoutFormat( + plateData, + varOfInterest = valueVariable, + rows = layout[1], + cols = layout[2] + ) + if (!is.null(textVariable)) { + plateText <- plateLayoutFormat( + plateData, varOfInterest = textVariable, + rows = layout[1], + cols = layout[2] + ) + } + plot.title <- gsub( + "^([a-z])", "\\U\\1", + gsub("([A-Z])", " \\1", + plot.title, perl = TRUE), perl = TRUE + ) + mainTitle <- paste0(plot.title, plateName) + plateContourColours <- matrix("", nrow = layout[1], ncol = layout[2]) + + if (makeContourColours) { + contourData <- plateData[WellType %in% c("nonEmpty", "Treated Wells"), ] + + if (is.numeric(contourData[, ..contourVariable][[1]])) { + contourData$contours <- cut( + contourData[, ..contourVariable][[1]], + categories, left = TRUE, + right = TRUE, + labels = labelsCategories) + } + else { + contourData$contours <- contourData[, ..contourVariable][[1]] + } + names(contourColours) <- labelsCategories + names(lwdContours) <- labelsCategories + for (i in seq_len(layout[1])) { + for (j in seq_len(layout[2])) { + tryCatch({ + sampleHit <- which( + as.character(contourData$WellID) == paste0( + LETTERS[i], sprintf("%02d", j) + ) + ) + if (length(sampleHit) == 1) { + plateContourColours[i, j] <- as.character( + contourData[sampleHit,'contours'][[1]] + ) + } + }, + error = function(e) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + ) + } + } + } + + plateValues$contours <- plateContourColours + colnames(plateValues$values) <- seq_len(ncol(plateValues$values)) + + if (is.null(breaks)) { + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + } + + if (is.null(colours)) { + colours <- tryCatch({ + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + }, + error = function(cond){ + + message("Recomputed breaks for proper colour mapping") + + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + + }) + } + + ht <- Heatmap( + plateValues$values, + column_title = mainTitle, column_title_side = "top", + rect_gp = gpar(lwd = 0.4), + cluster_rows = FALSE, cluster_columns = FALSE, + col = colours, row_title = NULL, + row_split = row_split, column_split = col_split, + row_names_side = "left", + cluster_row_slices = FALSE, + cluster_column_slices = FALSE, + show_heatmap_legend = TRUE, + heatmap_legend_param = list( + title = ifelse( + is.null(legend.title), + paste0(valueVariable, "\n"), + paste0(legend.title, "\n") + ), + grid_height = unit(9, "mm"), border = "black", + labels_gp = gpar(fontsize = legendFontSize), + title_gp = gpar(fontsize = legendFontSizeTitle) + ), + cell_fun = function(j, i, x, y, width, height, fill) { + if (is.na(plateValues$values[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar(fill = "white", alpha = 0.7, lwd = 0.7, col = "white") + ) + } + else if (!is.null(textVariable)) { + grid.text( + plateText$values[i, j], x, y, + just = "centre", + gp = gpar(fontsize = textFontSize, col = colourWellText) + ) + } + if (makeContourColours) { + if (!is.na(plateValues$contours[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar( + col = contourColours[as.character(plateValues$contours[i, j])], + fill = NA, + lwd = lwdContours[as.character(plateValues$contours[i, j])] + ) + ) + } + } + } + ) + + if (displayHeatmap) { + print(ht) + } + if (saveHeatmap) { + png( + file.path( + outputDir, + paste0(prefix,gsub(" |-", "",plot.title), "_", plateName, ".png") + ), + width = 30, height = 10, units = "cm", res = 1200 + ) + print(ht) + dev.off() + } + + return(ht) +} + + +#' Return numerical matrix with number of reads that corresponds to the +#' plate layout +#' @param data A data.frame of the values to be visualized with at least +#' the columnof interest (specified in 'varOfInterest') and a 'WellID' column +#' indicating the wells in the plate. The WellID is a combination of a +#' letter (row in the plate) and an integer (column in the plate). +#' @param varOfInterest The name of the variable in 'data' to be visualized +#' in a plate layout +#' @param rows number of rows in a plate layout +#' @param cols number of columns in a plate layout +#' @param verbose if \code{TRUE}, samples missing from the plate +#' will be reported +#' @export +plateLayoutFormat <- function( + data, varOfInterest, + rows = 16, cols = 24, + verbose = FALSE +) { + plateValues <- matrix(NA, nrow = rows, ncol = cols) + for (i in seq_len(rows)) { + for (j in seq_len(cols)) { + tryCatch({ + sampleHit <- which( + as.character(data$WellID) == paste0(LETTERS[i], sprintf("%02d", j)) + ) + if(length(sampleHit) == 1){ + plateValues[i, j] <- data[sampleHit, ..varOfInterest][[1]] + } + }, + error = function(e) { + if (verbose == TRUE) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + } + ) + } + } + + row.names(plateValues) <- LETTERS[1:rows] + return(list("values" = plateValues)) +} + + + +#' Helper function to automate break selection for raw count data +#' +#' This function creates an exponentially increasing vector for given number +#' breaks between zero and some element of choice. It is particularly useful for +#' raw counts or raw counts per million. +#' +#' @param nBreaks Number of breaks to be generated +#' @param maxElement Maximum value of data entries +#' @export +computeBreaks <- function(nBreaks, variable) { + + maxElement <- max(variable, na.rm = TRUE) + if (length(unique(variable)) == 1) { + breaks <- c(0, 0.5, ifelse(maxElement < 1, 1, maxElement)) + } else { + coefSystem <- solve( + rbind(c(1, 1), c(1, (nBreaks - 1)))) %*% c(0, log(maxElement) + ) + coefExp <- c(exp(coefSystem[1]), coefSystem[2]) + breaks <- coefExp[1] * exp((1:(nBreaks - 1)) * coefExp[2]) + breaks <- unique(c(0, breaks)) + } + return(breaks) +} diff --git a/src/report/script.R b/src/report/script.R new file mode 100644 index 00000000..7ab2a467 --- /dev/null +++ b/src/report/script.R @@ -0,0 +1,33 @@ +library(whisker) +library(logger) + +log_info("Setting temporary directory to: {meta$temp_dir}") +Sys.setenv(TMP = meta$temp_dir) +temp_folder <- tempdir(check = TRUE) +log_info("Created temporary directory {temp_folder}") + +template <- file.path(meta$resources_dir, "template.Rmd") + +esets_normalized <- lapply(par$eset, function(eset_path) { + return(file.path(normalizePath(dirname(eset_path)), basename(eset_path))) +}) + +log_info(paste0( + "Rendering markdown {template} to HTML ", + "{par$output_report} with esets {paste(esets_normalized, collapse = ', ')}" +)) + +rmarkdown::render( + normalizePath(template), + output_file = basename(par$output_report), + output_dir = dirname(par$output_report), + runtime = "static", + intermediates_dir = par$report_dir, + clean = TRUE, + params = list( + esets = esets_normalized, + outputDir = par$report_dir + ) +) + +log_info("Done") diff --git a/src/report/template.Rmd b/src/report/template.Rmd new file mode 100644 index 00000000..20f8c190 --- /dev/null +++ b/src/report/template.Rmd @@ -0,0 +1,977 @@ +--- +title: "Exploratory Data Report" +date: "`r format(Sys.time(), '%d %B, %Y')`" +editor_options: + chunk_output_type: console +output: + oaStyle::html_report +# parameters which are overwritten by the script +params: + outputDir: 'output/' + esets: + - sample1.rds + - sample2.rds +--- + + + + + + + + + +```{r params, eval = TRUE, include = FALSE} +outputDir <- params$outputDir +esets <- params$esets +``` + + +```{r outputDir, echo = FALSE} +## Required: ABSOLUTE outputDir +outputDir <- file.path(outputDir) + +# When working on a windows computer it should be +# "/Users/..." instead of "C:/Users/..." +if (.Platform$OS.type == "windows") { + outputDir <- paste0( + "/", + paste( + unlist(strsplit(outputDir, split = "/"))[-1], collapse = "/" + ), + "/" + ) +} +``` + + + + +```{r optionsChunkDoNotModify, echo = FALSE, message = FALSE, warning=FALSE} + +## Chunk with options for knitr. This chunk should not be modified. +knitr::opts_chunk$set( + eval = TRUE, + echo = FALSE, + message = FALSE, + cache = FALSE, + warning = FALSE, + error = FALSE, + comment = NA, #"#", + tidy = FALSE, + collapse = TRUE, + out.width = "100%", + fig.width = 20, + fig.height = 10, + results = "asis") + +knitr::opts_knit$set(root.dir = getwd()) + +options(warn = 1, width = 200) + +``` + +```{r libraries_and_functions} +source("plateLayouts.R") +library(ComplexHeatmap) +library(data.table) +library(ggplot2) +library(knitr) +library(Biobase) +library(gridExtra) +library(RColorBrewer) +``` + + +```{r dataImport} + +# Create esetList +esetList <- sapply( + esets, simplify = FALSE, + USE.NAMES = TRUE, + function(eset_raw) { + if (!file.exists(eset_raw)) { + stop(paste0("Provided path '", eset_raw, "' is not a file.")) + } + eset <- readRDS(eset_raw) + } +) +pools <- sapply(esetList, function(eset) { + unique(eset$PoolName) +}) +names(esetList) <- unlist(pools) + +# Create qcData +pDataList <- lapply(esetList, function(eset) data.table(pData(eset))) +qcData <- rbindlist(pDataList, fill = TRUE) + +textVars <- "SampleName" +annotationVar <- "PoolName" + +if (!"SampleName" %in% names(qcData)) { + qcData[, SampleName := paste0(PoolName, "_", WellBC)] +} +qcData[, log10LibSize := round(log10(NumberOfInputReads))] +qcData[, (annotationVar) := lapply(.SD, as.factor), .SDcols = annotationVar] + + +colourList <- list() +Design_levels <- sort( + as.character(unique(qcData[, ..annotationVar][[1]])), + decreasing = TRUE +) + +if (length(Design_levels) == 1) { + colours <- c("#d6e0ff", "lightgrey") + names(colours) <- c(Design_levels, "Empty") + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +}else if (length(Design_levels) == 2) { + colours <- c("#d6e0ff", "#FF9999") + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else if (length(Design_levels) <= 20) { + + if (length(Design_levels) > 12) { + colours <- c( + brewer.pal(12, "Set3"), + brewer.pal((length(Design_levels) - 12), + "Pastel2") + ) + } else { + colours <- c(brewer.pal(length(Design_levels), "Set3")) + } + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else { + colours <- c("#d6e0ff") + names(colours) <- c("nonEmpty") + colourList[[annotVar]] <- list( + "colours" = colours, + "annotVar" = annotVar, + "text" = annotVar + ) +} +``` + +# Pool Description + +Per pool within this study, there are several pool layout plots shown, based on the + +* number of STAR input reads (= library size) + +* log10 transformed number of STAR input reads + +* number of detected UMIs + +* number of detected genes + +* number of chromosomal reads + +* percentage of ERCC + +* percentage of mitochondria + + +> The values for the different samples within each pool is expected to be comparable if the content of the different pools is equally diverse. + +```{r plateAnnotation, out.width = "100%",fig.width = 20, fig.height= 10} + +plateVars <- c("NumberOfInputReads", "log10LibSize", "NumberOfMappedReads", + "NumberOfChromReads", "NumberOfUMIs", "NumberOfGenes", + "pctMT", "pctERCC") + +breaksVars <- lapply( + plateVars, + function(var) { + computeBreaks(7, qcData[, ..var]) + } +) +names(breaksVars) <- plateVars + +for (pool in pools){ + cat("\n\n") + cat(paste0("## ", pool, " {.tabset} \n\n")) + poolData <- qcData[PoolName == pool] + lapply(plateVars, function(plateVar) { + cat("\n\n") + cat(sprintf("### %s {.unnumbered}", plateVar)) + cat("\n\n") + plateLayout( + poolData, valueVariable = plateVar, + textFontSize = 10, legendFontSize = 12, + plateName = pool, plot.title = "libSize - ", + legend.title = "libSize", breaks = breaksVars[[plateVar]] + ) + cat("\n\n") + }) + cat("\n\n") +} +``` + +
+ + +# Data Distributions + + +## Reads Distributions {.tabset} + +The 4 box plots below represent the distributions per pool of the different samples based on: + +* the number of STAR input reads + +* the number of STAR mapped reads + +* the percentage of STAR mapped reads + +* the number of detected genes + +> The distributions contribute to the QC metrics mentioned in Par 3. The higher these values, the better. +> The data range for the different plates is expected to be comparable if the content of the different plates is equally diverse. + + +### Number of Input Reads {.tabset .unnumbered} + +```{r settings_1} + +nColPlots = 1 +figHeight = 7 + +``` + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_input_plate, fig.height = figHeight} +ggplot( + qcData, + aes( + x = PoolName, + y = NumberOfInputReads, colour = PoolName + ) +) + geom_boxplot() + ylab("Number of Input Reads") + + ggtitle("Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + + +### Number of Mapped Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_mapped_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfMappedReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +#### pct Mapped Reads {.unnumbered} + +```{r boxplots_pctMapped_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = PctMappedReads, colour = PoolName) +) + + geom_boxplot() + + ylab("pct Mapped Reads") + + ggtitle("pct Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of Chromosomal Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfChromReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Chromosomal Reads") + + ggtitle("Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### pct Chromosomal Reads {.unnumbered} + +```{r boxplots_pctChrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctChrom, colour = PoolName) +) + geom_boxplot() + ylab("pct Chromosomal Reads") + + ggtitle("pct Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of UMIs {.tabset .unnumbered} + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_umi_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfUMIs, colour = PoolName) +) + geom_boxplot() + ylab("Number of UMIs") + + ggtitle('Number of UMIs') + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### Density distribution {.unnumbered} + +```{r density_numberOfUMIs} + +## Pre-filtering data exploration +dt_plot <- melt( + qcData, + id.vars = c("SampleName", "PoolName", "WellID"), + measure.vars = c("NumberOfInputReads", "NumberOfMappedReads", "NumberOfUMIs") +) + +readsDensity_plot <- ggplot(dt_plot, aes(value)) +readsDensity_plot <- readsDensity_plot + + geom_density(aes(fill = variable), alpha=0.8) + + facet_grid(~ PoolName, scales = "free_x", space = "fixed", drop = TRUE) + + geom_vline( + xintercept = 5e5, + linetype = "dashed", + color = "steelblue3", size = 2 + ) + + annotate( + "text", + x = 3.5e5, y = 2e-6, label = "500k", + angle = 90, color = "steelblue3", size = 10 + ) + + geom_vline( + xintercept = 1.5e6, linetype = "dashed", + color = "forestgreen", size = 2 + ) + + annotate( + "text", x = 1.35e6, y = 2e-6, label = "1.5M", + angle = 90, color = "forestgreen", size = 10 + ) + + labs( + title = "Density plot", + subtitle = paste0( + "# Samples with NumberOfMappedReads > 1.5M: ", + length(which(qcData$NumberOfMappedReads > 1.5e6)), + "\n# Samples with NumberOfUMIs > 500k: ", + length(which(qcData$NumberOfUMIs > 5e5)) + ), + caption = paste0("# Total samples (after removing empty): ", nrow(qcData)), + x = "Count", + fill = "Variable" + ) + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 5), + axis.text.x = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + plot.subtitle = element_text(size = 17), + plot.caption = element_text(size = 15), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.y = element_blank() + ) +readsDensity_plot + +``` + +### Number of Genes {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_genes_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = NumberOfGenes, colour = PoolName) +) + + geom_boxplot() + ylab("Number of Genes") + + ggtitle("Number of Genes") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +## {.tabset .toc-ignore .unnumbered} + + +In addition, several plots are shown visualizing the efficiency of the reads-to-genes translation: + +* the number of input reads vs the number of mapped reads + +* the number of chromosomal reads vs the number of mapped reads + +* the number of mapped reads per UMI vs the number of mapped reads + +* the number of UNI vs the number of mapped reads + +* the number of mapped reads vs the number of genes + +* the number of chromosomal reads vs the number of genes + +* the number of mapped reads per UMI vs the number of genes + +### Mapping Efficiency {.tabset .unnumbered} + +#### Number of Input Reads {.unnumbered} + +```{r mapping_efficiency_1_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = NumberOfMappedReads, colour = PoolName) +) + + geom_point() + + xlab("Number of Input Reads") + + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads vs Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + +#### Number of Chromosomal Reads {.unnumbered} + +```{r mapping_efficiency_2_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + xlab("Number of Chromosomal Reads") + ylab("Number of Mapped Reads") + + ggtitle("Number of Chromosomal Reads vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) + +``` + + +#### Number of UMI {.unnumbered} + +```{r mapping_efficiency_4_plate, fig.height = 7} + +ggplot( + qcData, + aes(x =NumberOfUMIs, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + ylab("Number of Mapped Reads") + xlab("Number of UMIs ") + + ggtitle("Number of UMIs vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + +### Counting Efficiency {.tabset .unnumbered} + +#### Number of Mapped Reads {.unnumbered} + +```{r gene_efficiency_1_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfMappedReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Mapped Reads") + + ggtitle("Number of Genes vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +#### Number of Chromosomal Reads {.unnumbered} + +```{r gene_efficiency_2_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Chromosomal Reads") + + ggtitle("Number of Genes vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + + + +## Sequencing Saturation {.tabset} + +The barplots below represent the sequencing saturation per sample as determined by STAR, split per pool. +The HT-RNAseq platform aims for shallow sequencing resulting in relatively low sequencing saturations of 10-20%. +In addition, the sequencing saturation vs the number of input reads is shown. + +### Sequencing Saturation {.unnumbered} + + + +```{r sequencingSaturation, fig.height = figHeight} + +ggplot( + qcData, + aes(x = WellID, y = SequencingSaturation, fill = PoolName) +) + geom_bar(stat = "identity", position = "dodge") + + xlab("Samples") + ggtitle("Sequencing Saturation per Sample") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(1, "lines"), + text = element_text(size = 10), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.text.y = element_text(size = 15), + axis.ticks.x = element_blank() + ) +``` + +### Sequencing Saturation - Input Reads {.unnumbered} + + +```{r sequencingSaturation_inputReads, fig.height = figHeight} + + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Input Reads") + + theme(strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +### Sequencing Saturation - Mapped Reads {.unnumbered} + +```{r sequencingSaturation_mappedReads, fig.height = figHeight} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size=10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size=18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) +``` + +
+ +## Genomic Origin {.tabset} + +The 3 boxplots below represent, per pool, the distributions of the percentage of reads mapping to: + +* chromosomal regions + +* mitochondrial regions + +* ERCC spike-ins + +The 4th plot summarises the above results across samples per pool. + +The 5th plot shows the percentage of reads mapped to the transcriptome (as counted by STAR). This measurement serves as a proxy for the percentage of reads mapped to exons. + +> The percentage ERCC contributes to the QC metrics mentioned in Par 3. This value is ideally as low as possible (but non-zero to ensure the they have been spiked in) and comparable for the different pools. + + + + +### pctChrom {.tabset .unnumbered} + + +```{r genomicOrigin_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, aes(x = PoolName, y = pctChrom, colour = PoolName) +) + + geom_boxplot() + + ggtitle("pctChrom") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### pctMT {.tabset .unnumbered} + +```{r genomicOrigin_mt_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctMT, colour = PoolName) +) + + geom_boxplot() + ggtitle("pctMT") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### pctERCC {.tabset .unnumbered} + + +```{r genomicOrigin_ercc_plate, fig.height = figHeight} +ggplot(qcData, aes(x = PoolName, y = pctERCC, colour = PoolName)) + + geom_boxplot() + + ggtitle("pctERCC") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### Genomic Summary {.tabset .unnumbered} + + + +```{r genomicOrigin_summary_plate} +meanPctChromMTData <- qcData[, .( + "pctChrom" = median(pctChrom), + "pctMT" = median(pctMT), + "pctERCC" = median(pctERCC) +), by = PoolName] +meanPctChromMTDataLong <- melt( + meanPctChromMTData, + id.vars = "PoolName", + measure.vars = c("pctChrom", "pctMT", "pctERCC"), + variable.name = "Origin", value.name = "pct" +) +ggplot( + meanPctChromMTDataLong, + aes(fill = Origin, y = pct, x = PoolName)) + + geom_bar(position = "stack", stat = "identity") + + ggtitle("Genomic Origin") + + theme( + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + + +# Depletion {.tabset} + +
+```{r depletion} + + +for (eset_name in pools) { + cat("\n\n") + cat(paste0("## ", eset_name, " {.unnumbered}")) + cat("\n\n") + + eset <- esetList[[eset_name]] + average_reads <- sort(apply(exprs(eset), 1, mean), decreasing = TRUE) + plotData <- data.table( + ENSGID = names(average_reads), + av_count = average_reads + ) + + gen_descript <- data.table( + ENSGID = eset@featureData@data$gene_id, + Description = eset@featureData@data$GENENAME + ) + order_gen_descript <- gen_descript[ + match(plotData$ENSGID, gen_descript$ENSGID), + ] + + g <- ggplot( + plotData[c(1:100)], + aes(x = reorder(ENSGID, -av_count), y = av_count) + ) + geom_bar(stat = "identity") + + theme( + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 15), + legend.title = element_text(size = 15), + axis.title = element_text(size = 18), + plot.title = element_text(size = 20) + ) + ylab("Average Counts") + xlab("Genes") + + print(g) + + cat("\n\n") + cat("
") + cat("
") + + print(htmltools::tagList((DT::datatable(order_gen_descript[1:100, ])))) +} +``` +
+ + +
+
+
+
+ +# Glossary {.unnumbered} + + +## Read {.unlisted .unnumbered} + +A read is a oligonucleotide (a short RNA fragment) that has been sequenced. It consists of a fixed number of base pairs (bp) and therefore has a specific read length. + + + +## Input Read {.unlisted .unnumbered} + +Each read of the fastq file used as input to the STAR aligner is considered an input read. + + + +## Read With Valid Barcode {.unlisted .unnumbered} + +A read with a valid barcode is a read for which the barcode matches the white list of barcodes under the given restriction of the number of allowed mismatches. The number of reads with a valid barcode is lower or equal to the number of input reads. + + + +## Mapped Read {.unlisted .unnumbered} + +A read that has been aligned against the reference genome and for which one or more suitable matching locations have been found is a mapped read. Depending on the number of allowed mismatches this might or might not be be an exact match. The number of mapped reads is lower or equal to the number of reads with a valid barcode. + + + +## Uniquely Mapped Read {.unlisted .unnumbered} + +A read for which one and only one suitable matching location in the reference genome was found is an uniquely mapped read. The number of uniquely mapped reads is lower or equal to the number of mapped reads. + + + +## Counted Read {.unlisted .unnumbered} + +A mapped read will only be counted if it overlaps (1 nucleotide or more) with one and only one gene. The number of counted reads is lower or equal to the number of (uniquely) mapped reads. + + + +## UMIs {.unlisted .unnumbered} + +Unique molecular identifiers (UMI) are short sequences in order to uniquely tag each molecule in a sample library. Sequencing with UMIs allows bioinformatics software to filter out duplicate reads and PCR errors with a high level of accuracy and report unique reads. + +The reported UMIs is the number of UMIs among the set of reads that map to an unique gene, i.e the number of reads is deduplicated. + + + +## pctERCC {.unlisted .unnumbered} + +The percentage of reads mapping to the ERCC genes among the total number of **mapped** reads. + + + +## pctMT {.unlisted .unnumbered} + +The percentage of reads mapping to the MT genes among the total number of **mapped** reads. + + + +## Sequencing Saturation {.unlisted .unnumbered} + +The sequencing saturation is a measure of the fraction of library complexity. The inverse of one minus the sequencing saturation can be interpreted as the number of additional reads it would take to detect a new transcript. Consequently, a low sequencing saturation indicates a shallow sequencing in which a new transcript could be discovered with a few reads. + +
+
+
+
+ +
+![](OutputSTARsolo.png) +
+ +
+
diff --git a/src/report/test.R b/src/report/test.R new file mode 100644 index 00000000..84c15694 --- /dev/null +++ b/src/report/test.R @@ -0,0 +1,41 @@ +library(whisker) +library(testthat) +library(R.utils) + +cat(">> Creating temporary directory \n") +Sys.setenv(TMP = meta$temp_dir) +temp_folder <- tempdir(check = TRUE) + +cat(">> Running component create_report for test case \n") + +input_dir <- file.path(meta$resources_dir, "test_data") +stopifnot(file.exists(input_dir)) + + +out <- processx::run(meta$executable, c( + "--eset", file.path(meta$resources_dir, "test_data", "eset.sample_one.rds"), + "--eset", file.path(meta$resources_dir, "test_data", "eset.sample_two.rds"), + "--output_report", "report.html" +)) + +expect_equal(out$status, 0) +expect_true(file.exists("report.html")) + +cat(">> Test succesful \n") + +cat(">> Running component create_report with symbolic links \n") + +link_sample_1 <- file.path(temp_folder, "eset.sample_one.rds") +link_sample_2 <- file.path(temp_folder, "eset.sample_two.rds") +createLink(link = link_sample_1, + target = file.path(meta$resources_dir, "test_data", "eset.sample_one.rds")) +createLink(link = link_sample_2, + target = file.path(meta$resources_dir, "test_data", "eset.sample_two.rds")) + +out <- processx::run(meta$executable, c( + "--eset", link_sample_1, + "--eset", link_sample_2, + "--output_report", "report2.html" +)) + +expect_true(file.exists("report2.html")) \ No newline at end of file diff --git a/src/report/test_data/eset.sample_one.rds b/src/report/test_data/eset.sample_one.rds new file mode 100644 index 00000000..f440362e Binary files /dev/null and b/src/report/test_data/eset.sample_one.rds differ diff --git a/src/report/test_data/eset.sample_two.rds b/src/report/test_data/eset.sample_two.rds new file mode 100644 index 00000000..c622f216 Binary files /dev/null and b/src/report/test_data/eset.sample_two.rds differ diff --git a/src/stats/combine_star_logs/config.vsh.yaml b/src/stats/combine_star_logs/config.vsh.yaml new file mode 100644 index 00000000..61cf6c6c --- /dev/null +++ b/src/stats/combine_star_logs/config.vsh.yaml @@ -0,0 +1,72 @@ +name: combine_star_logs +namespace: "stats" +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ author, maintainer ] +argument_groups: + - name: "Arguments" + arguments: + - name: "--barcodes" + type: string + multiple: true + required: true + description: | + Barcodes responding to the respective log files. + - name: "--star_logs" + type: file + multiple: true + required: true + description: | + Paths to the STAR log files (most frequently called Log.final.out) + direction: input + example: "Log.final.out" + - name: "--gene_summary_logs" + direction: input + type: file + multiple: true + required: true + description: | + Paths to the Summary.csv files from the STAR Solo output. Can be found in + the 'Solo.out/Gene' folder relative to the root of the STAR output directory. + example: "Summary.txt" + - name: "--reads_per_gene_logs" + direction: input + type: file + multiple: true + required: true + description: | + Paths to the 'ReadsPerGene.out.tab' files as output by STAR. + - name: "--output" + type: file + direction: output + default: "starLogs.txt" + description: | + Tab-delimited file describing for each barcode (as the rows), the metrics (as columns) + gathered from the different input files. + +resources: +- type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + - path: test_data + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pandas + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/stats/combine_star_logs/script.py b/src/stats/combine_star_logs/script.py new file mode 100644 index 00000000..6409c03c --- /dev/null +++ b/src/stats/combine_star_logs/script.py @@ -0,0 +1,228 @@ +import logging +import pandas as pd +from itertools import batched, starmap + +### VIASH START +meta = { + "name": "combine_star_logs", +} +par = { + "star_logs": ["src/stats/combine_star_logs/test_data/barcode_1/Log.final.out", + "src/stats/combine_star_logs/test_data/barcode_2/Log.final.out"], + "gene_summary_logs": ["src/stats/combine_star_logs/test_data/barcode_1/summary.csv", + "src/stats/combine_star_logs/test_data/barcode_2/summary.csv"], + "reads_per_gene_logs": ["src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab", + "src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab"], + "output": "output.txt", + "barcodes": ["ACGG", "TTTT"], +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def handle_percentages(column_value): + # TODO: handle this more gracefully + if column_value: + return column_value.strip('%') + return column_value + +def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame: + logger.info("Reading STAR log %s for barcode '%s'", log_path, barcode) + result = pd.read_table(log_path, sep=r"\|\t+", converters={"Value": handle_percentages}, + engine="python", header=None, skip_blank_lines=True, + skipinitialspace=True, names=["Category", "Value"], index_col=0, + skiprows=[0, 1, 2]) + logger.info("Read %d row(s) and %d column(s) from STAR logs at %s", + *result.shape, log_path) + return result + + +def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame: + logger.info("Reading summary log %s for barcode %s", summary_path, barcode) + result = pd.read_table(summary_path, sep=",", + header=None, names=["Category", "Value"], + index_col=0, dtype=pd.StringDtype()) + logger.info("Read %d row(s) and %d column(s) from summary file at %s", + *result.shape, summary_path) + return result + + +def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame: + logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode) + result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\t", + dtype={"geneID": pd.StringDtype(), + "Unstranded": pd.Int64Dtype(), + "posStrand": pd.Int64Dtype(), + "negStrand": pd.Int64Dtype()}, + index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"]) + result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series + df = pd.DataFrame({"Value": result.sum()}) + df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise") + df.index.name = "Category" + logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s", + *df.shape, read_per_gene_path) + return df + +def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + """ + For a single star log (Log.final.out) in dataframe format, filter out the + entries that are not needed and format the labels for some metrics: + - Replace '%' with 'pect' in the labels. + - Remove labels ending with ':' + (mostly the section separators like 'MULTI-MAPPING READS:' and 'UNMAPPED READS:') + - Remove the metrics we do no need based on the following keywords: + Mapping speed, Average, Number of splices, per base, chimeric reads, average + + The dataframe provided as input must have an index with 1 level with the metric names. + """ + # Remove index values ending with ':' (rows like 'MULTI-MAPPING READS:','UNIQUE READS:') + logger.info("Filtering STAR logs for barcode %s. Starting with %d row(s) and %d column(s)", barcode, *df.shape) + to_keep = ~df.index.to_series().str.endswith(":") + # Remove index values where the values contain any of these substrings + regex_columns_to_remove = "Mapping speed|Average|Number of splices|per base|chimeric reads|average" + to_keep = to_keep & ~df.index.to_series().str.contains(regex_columns_to_remove, regex=True) + logger.info("Removed the following log entries for barcode '%s':\n\t%s", + barcode, + "\n\t".join(to_keep[~to_keep].index.to_list())) + result = df.loc[to_keep] + + # Replace % by pect, remove columns, use camel case and remove spaces + # You might be tempted to use .title() to make everything uppercase, + # but characters which are already uppercase should stay that way. + # (example: NumberOfUMIs and not NumberOfUmis) + result.index = result.index.str.replace("%", "pect")\ + .str.replace(":", "")\ + .str.replace(r"(?:^|\s).", lambda m:m.group(0).upper(), regex=True)\ + .str.replace(" ", "") + result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads", + "UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise") + logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\n\t%s", + barcode, *result.shape, "\n\t".join(result.index.to_list())) + return result + + +def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Filtering and formatting summary logs for barcode %s. " + "Starting with %d row(s) and %d column(s)", barcode, *df.shape) + columns_to_remove = ( + "Number of Reads", + "Q30 Bases in RNA read", + "Reads Mapped to Genome: Unique", + "Reads Mapped to Transcriptome: Unique Genes", + "Reads in Cells Mapped to Unique Genes", + "Median UMI per Cell", + "Median Genes per Cell", + "Reads Mapped to Genome: Unique+Multiple", + "Median Reads per Cell", + "Mean UMI per Cell", + "Mean Genes per Cell", + ) + + to_keep = ~df.index.isin(columns_to_remove) + logger.info("Removed the following summary entries for barcode '%s':\n\t%s", + barcode, + "\n\t".join(df.loc[~to_keep].index.to_list())) + result = df.loc[to_keep] + result.index = result.index.str.replace(r"(?:^|\s).", lambda m:m.group(0).upper(), + regex=True).str.replace(" ", "") + to_rename = {"UMIsInCells": "NumberOfUMIs", + "TotalGenesDetected": "NumberOfGenes"} + try: + result = result.rename(to_rename, errors="raise") + except KeyError as e: + raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary " + f"log for barcode {barcode}, but an entry was not found in the file. " + "Make sure that you are using the correct version of STAR." + f"Available entries: {", ".join(result.index.to_list())}") from e + logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\n\t%s", + barcode, *result.shape, "\n\t".join(result.index.to_list())) + return result + + +def join_dfs(df_list, barcodes) -> pd.DataFrame: + # Combine the dataframes together and add the barcodes as a level to the dataframe + # in order to make a 2-level index (first level the barcodes and second level the metrics). + result = pd.concat(dict(zip(barcodes, df_list)), names=["WellBC"]) + # Pivot the table by moving the metrics to the columns. Its added as an extra level, + # so we can just frop the 'Values' level that was already there + result = result.unstack(level="Category").droplevel(0, axis="columns") + return result + +def main(par): + logger.info("Component started.") + # Provide an overview of the parameters in the logs + parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()] + logger.info("Parameters:\n%s", "".join(parameters_str).rstrip()) + star_logs, gene_summary_logs, reads_per_gene_logs, barcodes = par["star_logs"], \ + par["gene_summary_logs"], par["reads_per_gene_logs"], par["barcodes"] + number_of_inputs = tuple(len(i) for i in (star_logs, gene_summary_logs, + reads_per_gene_logs, barcodes)) + if len(set(number_of_inputs)) != 1: + raise ValueError("Expected the same number of inputs for 'star_logs' (%d), " + "'gene_summary_logs' (%d), 'reads_per_gene_logs' (%d) " + "and 'barcodes' (%d)." % number_of_inputs) + + logs_to_process = [ + (star_log_to_dataframe, star_log_remove_unwanted_entries_and_adjust_format, star_logs), + (summary_to_dataframe, summary_remove_unwanted_entries_and_adjust_format, gene_summary_logs), + (reads_per_gene_to_dataframe, None, reads_per_gene_logs), + ] + logger.info("Formatting the contents of the log files.") + all_logs_data = [] + for df_generator, formatter, data in logs_to_process: + data_as_df = list(starmap(df_generator, zip(barcodes, data))) + data_formatted = data_as_df + if formatter: + data_formatted = list(starmap(formatter, zip(barcodes, data_as_df))) + data_joined = join_dfs(data_formatted, barcodes) + all_logs_data.append(data_joined) + + logger.info("Joining entries across the different logs together.") + all_stats = pd.concat(all_logs_data, axis=1) + logger.info("Log statistics were gathered for the following barcodes: %s", + ", ".join(all_stats.index.to_list())) + dtypes = { + 'NumberOfInputReads': pd.UInt64Dtype(), + 'NumberOfMappedReads': pd.UInt64Dtype(), + 'PctMappedReads': pd.Float64Dtype(), + 'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToMultipleLoci': pd.Float64Dtype(), + 'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToTooManyLoci': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooManyMismatches': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooShort': pd.Float64Dtype(), + 'NumberOfReadsUnmappedOther': pd.UInt64Dtype(), + 'PectOfReadsUnmappedOther': pd.Float64Dtype(), + 'ReadsWithValidBarcodes': pd.Float64Dtype(), + 'SequencingSaturation': pd.Float64Dtype(), + 'Q30BasesInCB+UMI': pd.Float64Dtype(), + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(), + 'EstimatedNumberOfCells': pd.UInt64Dtype(), + 'FractionOfReadsInCells': pd.Float64Dtype(), + 'MeanReadsPerCell': pd.UInt64Dtype(), + 'NumberOfUMIs': pd.UInt64Dtype(), + 'NumberOfGenes': pd.UInt64Dtype(), + 'NumberOfCountedReads': pd.UInt64Dtype(), + } + all_stats = all_stats.astype(dtypes) + # batched() is used here to print a limited amount of columnns at a time + # to make sure that they are all displayed (pandas might limit the view for readability) + logger.info("Summary of final output:\n%s\n", + "\n".join(repr(all_stats.loc[:,columns].describe()) + for columns in batched(all_stats.columns, 3))) + logger.info("Writing output to %s", par["output"]) + all_stats.reset_index("WellBC").to_csv(par["output"], sep="\t", header=True, + index=False, float_format='%g') + logger.info("Finished %s.", meta["name"]) + +if __name__ == "__main__": + main(par) \ No newline at end of file diff --git a/src/stats/combine_star_logs/test.py b/src/stats/combine_star_logs/test.py new file mode 100644 index 00000000..8ae917c7 --- /dev/null +++ b/src/stats/combine_star_logs/test.py @@ -0,0 +1,182 @@ +import pytest +import sys +import re +import pandas as pd +from pathlib import Path +from uuid import uuid4 +from subprocess import CalledProcessError + +### VIASH START +meta = { + "resources_dir": "./src/stats/combine_star_logs/", + "executable": "target/executable/stats/combine_star_logs/combine_star_logs", + "config": "src/stats/combine_star_logs/config.vsh.yaml" +} +### VIASH END + +@pytest.fixture +def test_resources_path(): + return Path(meta["resources_dir"]) / "test_data" + +@pytest.fixture +def barcode_1_star_log(test_resources_path): + return test_resources_path / "barcode_1" / "Log.final.out" + +@pytest.fixture +def barcode_1_reads_per_gene_file(test_resources_path): + return test_resources_path / "barcode_1" / "ReadsPerGene.out.tab" + +@pytest.fixture +def barcode_1_summary(test_resources_path): + return test_resources_path / "barcode_1" / "summary.csv" + +@pytest.fixture +def barcode_2_star_log(test_resources_path): + return test_resources_path / "barcode_2" / "Log.final.out" + +@pytest.fixture +def barcode_2_reads_per_gene_file(test_resources_path): + return test_resources_path / "barcode_2" / "ReadsPerGene.out.tab" + +@pytest.fixture +def barcode_2_summary(test_resources_path): + return test_resources_path / "barcode_2" / "summary.csv" + +@pytest.fixture +def no_reads_mapped_star_log(test_resources_path): + return test_resources_path / "empty" / "Log.final.out" + +@pytest.fixture +def no_reads_mapped_reads_per_gene_file(test_resources_path): + return test_resources_path / "empty" / "ReadsPerGene.out.tab" + +@pytest.fixture +def no_reads_mapped_summary(test_resources_path): + return test_resources_path / "empty" / "summary.csv" + +@pytest.fixture +def random_path(tmp_path): + def wrapper(extension=None): + extension = "" if not extension else f".{extension}" + return tmp_path / f"{uuid4()}{extension}" + return wrapper + +def test_incorrect_number_of_inputs_raises(run_component, + barcode_1_star_log, barcode_2_star_log, + barcode_1_reads_per_gene_file, barcode_2_reads_per_gene_file, + barcode_1_summary, barcode_2_summary, + random_path): + output_path = random_path("txt") + with pytest.raises(CalledProcessError) as err: + run_component([ + "--barcodes", "foo;bar", + "--star_logs", f"{barcode_1_star_log}", + "--reads_per_gene_logs", f"{barcode_1_reads_per_gene_file};{barcode_2_reads_per_gene_file}", + "--gene_summary_logs", f"{barcode_1_summary};{barcode_2_summary}", + "--output", output_path, + ]) + assert re.search(r"ValueError: Expected the same number of inputs for 'star_logs' \(1\), " + r"'gene_summary_logs' \(2\), 'reads_per_gene_logs' \(2\) and 'barcodes' \(2\)\.", + err.value.stdout.decode('utf-8')) + + + +def test_equal_number_of_argument(run_component, + barcode_1_star_log, barcode_2_star_log, + barcode_1_reads_per_gene_file, barcode_2_reads_per_gene_file, + barcode_1_summary, barcode_2_summary, + random_path): + output_path = random_path("txt") + run_component([ + "--barcodes", "foo;bar", + "--star_logs", f"{barcode_1_star_log};{barcode_2_star_log}", + "--reads_per_gene_logs", f"{barcode_1_reads_per_gene_file};{barcode_2_reads_per_gene_file}", + "--gene_summary_logs", f"{barcode_1_summary};{barcode_2_summary}", + "--output", output_path, + ]) + # We use strings here to make a comparison of the file contents without + # doing any inferences of the numerical data type (i.e. exact file contents). + expected_dict = { + 'NumberOfInputReads': ["96398", "10155"], + 'NumberOfMappedReads': ["70824", "7179"], + 'PctMappedReads': ["73.47", "70.69"], + 'NumberOfReadsMappedToMultipleLoci': ["0", "0"], + 'PectOfReadsMappedToMultipleLoci': ["0", "0"], + 'NumberOfReadsMappedToTooManyLoci': ["22281", "2248"], + 'PectOfReadsMappedToTooManyLoci': ["23.11", "22.14"], + 'NumberOfReadsUnmappedTooManyMismatches': ["0", "0"], + 'PectOfReadsUnmappedTooManyMismatches': ["0", "0"], + 'NumberOfReadsUnmappedTooShort': ["2697", "553"], + 'PectOfReadsUnmappedTooShort': ["2.8", "5.45"], + 'NumberOfReadsUnmappedOther': ["596", "175"], + 'PectOfReadsUnmappedOther': ["0.62", "1.72"], + 'ReadsWithValidBarcodes': ["0.999782", "0.999803"], + 'SequencingSaturation': ["0.0602963", "0.0539344"], + 'Q30BasesInCB+UMI': ["0.980096", "0.984461"], + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': ["0.60411", "0.530871"], + 'EstimatedNumberOfCells': ["1", "1"], + 'FractionOfReadsInCells': ["1", "1"], + 'MeanReadsPerCell': ["53602", "4969"], + 'NumberOfUMIs': ["50370", "4701"], + 'NumberOfGenes': ["8767", "2397"], + 'NumberOfCountedReads': ["17", "15"], + } + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + expected.index = pd.Index(["foo", "bar"], name="WellBC", dtype=pd.StringDtype()) + assert output_path.is_file() + + contents = pd.read_csv(output_path, sep="\t", index_col=0, dtype=pd.StringDtype()) + assert set(("NumberOfInputReads", "SequencingSaturation", + "NumberOfGenes", "NumberOfUMIs", "NumberOfCountedReads", + "PctMappedReads")).issubset(set(contents.columns)) + pd.testing.assert_frame_equal(contents, expected) + +def test_empty(run_component, no_reads_mapped_star_log, + no_reads_mapped_reads_per_gene_file, no_reads_mapped_summary, + random_path): + """ + Sometimes the summary.csv contains '-nan' values, make sure they + are properly handled. + """ + output_path = random_path("txt") + run_component([ + "--barcodes", "foo", + "--star_logs", no_reads_mapped_star_log, + "--reads_per_gene_logs", no_reads_mapped_reads_per_gene_file, + "--gene_summary_logs", no_reads_mapped_summary, + "--output", output_path, + ]) + expected_dict = { + 'NumberOfInputReads': ["1327"], + 'NumberOfMappedReads': ["116"], + 'PctMappedReads': ["8.74"], + 'NumberOfReadsMappedToMultipleLoci': ["0"], + 'PectOfReadsMappedToMultipleLoci': ["0"], + 'NumberOfReadsMappedToTooManyLoci': ["43"], + 'PectOfReadsMappedToTooManyLoci': ["3.24"], + 'NumberOfReadsUnmappedTooManyMismatches': ["0"], + 'PectOfReadsUnmappedTooManyMismatches': ["0"], + 'NumberOfReadsUnmappedTooShort': ["1166"], + 'PectOfReadsUnmappedTooShort': ["87.87"], + 'NumberOfReadsUnmappedOther': ["2"], + 'PectOfReadsUnmappedOther': ["0.15"], + 'ReadsWithValidBarcodes': ["0.023361"], + 'SequencingSaturation': [pd.NA], + 'Q30BasesInCB+UMI': ["0.917408"], + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': ["0"], + 'EstimatedNumberOfCells': ["0"], + 'FractionOfReadsInCells': [pd.NA], + 'MeanReadsPerCell': ["0"], + 'NumberOfUMIs': ["0"], + 'NumberOfGenes': ["0"], + 'NumberOfCountedReads': ["0"], + } + expected = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + expected.index = pd.Index(["foo"], name="WellBC", dtype=pd.StringDtype()) + contents = pd.read_csv(output_path, sep="\t", index_col=0, dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(contents, expected) + + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/barcode_1/Log.final.out b/src/stats/combine_star_logs/test_data/barcode_1/Log.final.out new file mode 100644 index 00000000..c57f5b89 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_1/Log.final.out @@ -0,0 +1,37 @@ + Started job on | Jun 26 09:38:11 + Started mapping on | Jun 26 09:38:14 + Finished on | Jun 26 09:38:23 + Mapping speed, Million of reads per hour | 38.56 + + Number of input reads | 96398 + Average input read length | 57 + UNIQUE READS: + Uniquely mapped reads number | 70824 + Uniquely mapped reads % | 73.47% + Average mapped length | 56.93 + Number of splices: Total | 6432 + Number of splices: Annotated (sjdb) | 6285 + Number of splices: GT/AG | 6331 + Number of splices: GC/AG | 33 + Number of splices: AT/AC | 2 + Number of splices: Non-canonical | 66 + Mismatch rate per base, % | 0.61% + Deletion rate per base | 0.01% + Deletion average length | 1.38 + Insertion rate per base | 0.00% + Insertion average length | 1.24 + MULTI-MAPPING READS: + Number of reads mapped to multiple loci | 0 + % of reads mapped to multiple loci | 0.00% + Number of reads mapped to too many loci | 22281 + % of reads mapped to too many loci | 23.11% + UNMAPPED READS: + Number of reads unmapped: too many mismatches | 0 + % of reads unmapped: too many mismatches | 0.00% + Number of reads unmapped: too short | 2697 + % of reads unmapped: too short | 2.80% + Number of reads unmapped: other | 596 + % of reads unmapped: other | 0.62% + CHIMERIC READS: + Number of chimeric reads | 0 + % of chimeric reads | 0.00% \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab b/src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab new file mode 100644 index 00000000..5b4335a1 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_1/ReadsPerGene.out.tab @@ -0,0 +1,8 @@ +N_unmapped 11111 22222 33333 +N_multimapping 0 0 0 +N_noFeature 44444 55555 66666 +N_ambiguous 77777 88888 99999 +gene1 2 0 0 +gene2 0 0 0 +gene3 6 0 6 +gene5 9 6 3 diff --git a/src/stats/combine_star_logs/test_data/barcode_1/summary.csv b/src/stats/combine_star_logs/test_data/barcode_1/summary.csv new file mode 100644 index 00000000..98ada0d1 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_1/summary.csv @@ -0,0 +1,20 @@ +Number of Reads,96398 +Reads With Valid Barcodes,0.999782 +Sequencing Saturation,0.0602963 +Q30 Bases in CB+UMI,0.980096 +Q30 Bases in RNA read,0.799904 +Reads Mapped to Genome: Unique+Multiple,0.734704 +Reads Mapped to Genome: Unique,0.734704 +Reads Mapped to Transcriptome: Unique+Multipe Genes,0.60411 +Reads Mapped to Transcriptome: Unique Genes,0.556049 +Estimated Number of Cells,1 +Reads in Cells Mapped to Unique Genes,53602 +Fraction of Reads in Cells,1 +Mean Reads per Cell,53602 +Median Reads per Cell,53602 +UMIs in Cells,50370 +Mean UMI per Cell,50370 +Median UMI per Cell,50370 +Mean Genes per Cell,8767 +Median Genes per Cell,8767 +Total Genes Detected,8767 \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/barcode_2/Log.final.out b/src/stats/combine_star_logs/test_data/barcode_2/Log.final.out new file mode 100644 index 00000000..a0ded89d --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_2/Log.final.out @@ -0,0 +1,37 @@ + Started job on | Jun 26 09:38:56 + Started mapping on | Jun 26 09:39:00 + Finished on | Jun 26 09:39:02 + Mapping speed, Million of reads per hour | 18.28 + + Number of input reads | 10155 + Average input read length | 57 + UNIQUE READS: + Uniquely mapped reads number | 7179 + Uniquely mapped reads % | 70.69% + Average mapped length | 56.36 + Number of splices: Total | 526 + Number of splices: Annotated (sjdb) | 495 + Number of splices: GT/AG | 502 + Number of splices: GC/AG | 4 + Number of splices: AT/AC | 1 + Number of splices: Non-canonical | 19 + Mismatch rate per base, % | 0.85% + Deletion rate per base | 0.00% + Deletion average length | 1.09 + Insertion rate per base | 0.00% + Insertion average length | 1.07 + MULTI-MAPPING READS: + Number of reads mapped to multiple loci | 0 + % of reads mapped to multiple loci | 0.00% + Number of reads mapped to too many loci | 2248 + % of reads mapped to too many loci | 22.14% + UNMAPPED READS: + Number of reads unmapped: too many mismatches | 0 + % of reads unmapped: too many mismatches | 0.00% + Number of reads unmapped: too short | 553 + % of reads unmapped: too short | 5.45% + Number of reads unmapped: other | 175 + % of reads unmapped: other | 1.72% + CHIMERIC READS: + Number of chimeric reads | 0 + % of chimeric reads | 0.00% \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab b/src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab new file mode 100644 index 00000000..135681e5 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_2/ReadsPerGene.out.tab @@ -0,0 +1,8 @@ +N_unmapped 101010 202020 303030 +N_multimapping 0 0 0 +N_noFeature 404040 505050 606060 +N_ambiguous 707070 808080 909090 +gene1 0 0 0 +gene2 0 0 0 +gene6 5 5 0 +gene4 10 2 8 diff --git a/src/stats/combine_star_logs/test_data/barcode_2/summary.csv b/src/stats/combine_star_logs/test_data/barcode_2/summary.csv new file mode 100644 index 00000000..a306bb20 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/barcode_2/summary.csv @@ -0,0 +1,20 @@ +Number of Reads,10155 +Reads With Valid Barcodes,0.999803 +Sequencing Saturation,0.0539344 +Q30 Bases in CB+UMI,0.984461 +Q30 Bases in RNA read,0.786064 +Reads Mapped to Genome: Unique+Multiple,0.706942 +Reads Mapped to Genome: Unique,0.706942 +Reads Mapped to Transcriptome: Unique+Multipe Genes,0.530871 +Reads Mapped to Transcriptome: Unique Genes,0.489316 +Estimated Number of Cells,1 +Reads in Cells Mapped to Unique Genes,4969 +Fraction of Reads in Cells,1 +Mean Reads per Cell,4969 +Median Reads per Cell,4969 +UMIs in Cells,4701 +Mean UMI per Cell,4701 +Median UMI per Cell,4701 +Mean Genes per Cell,2397 +Median Genes per Cell,2397 +Total Genes Detected,2397 \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/empty/Log.final.out b/src/stats/combine_star_logs/test_data/empty/Log.final.out new file mode 100644 index 00000000..9548e8a5 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/empty/Log.final.out @@ -0,0 +1,37 @@ + Started job on | Jun 26 09:38:56 + Started mapping on | Jun 26 09:39:00 + Finished on | Jun 26 09:39:02 + Mapping speed, Million of reads per hour | 18.28 + + Number of input reads | 1327 + Average input read length | 58 + UNIQUE READS: + Uniquely mapped reads number | 116 + Uniquely mapped reads % | 8.74% + Average mapped length | 54.11 + Number of splices: Total | 6 + Number of splices: Annotated (sjdb) | 4 + Number of splices: GT/AG | 4 + Number of splices: GC/AG | 0 + Number of splices: AT/AC | 0 + Number of splices: Non-canonical | 2 + Mismatch rate per base, % | 6.63% + Deletion rate per base | 0.13% + Deletion average length | 2.00 + Insertion rate per base | 0.00% + Insertion average length | 0.00 + MULTI-MAPPING READS: + Number of reads mapped to multiple loci | 0 + % of reads mapped to multiple loci | 0.00% + Number of reads mapped to too many loci | 43 + % of reads mapped to too many loci | 3.24% + UNMAPPED READS: + Number of reads unmapped: too many mismatches | 0 + % of reads unmapped: too many mismatches | 0.00% + Number of reads unmapped: too short | 1166 + % of reads unmapped: too short | 87.87% + Number of reads unmapped: other | 2 + % of reads unmapped: other | 0.15% + CHIMERIC READS: + Number of chimeric reads | 0 + % of chimeric reads | 0.00% \ No newline at end of file diff --git a/src/stats/combine_star_logs/test_data/empty/ReadsPerGene.out.tab b/src/stats/combine_star_logs/test_data/empty/ReadsPerGene.out.tab new file mode 100644 index 00000000..083cd470 --- /dev/null +++ b/src/stats/combine_star_logs/test_data/empty/ReadsPerGene.out.tab @@ -0,0 +1,8 @@ +N_unmapped 1211 1211 1211 +N_multimapping 0 0 0 +N_noFeature 23 26 109 +N_ambiguous 6 2 0 +gene1 0 0 0 +gene2 0 0 0 +gene6 0 0 0 +gene4 0 0 0 diff --git a/src/stats/combine_star_logs/test_data/empty/summary.csv b/src/stats/combine_star_logs/test_data/empty/summary.csv new file mode 100644 index 00000000..214fc70d --- /dev/null +++ b/src/stats/combine_star_logs/test_data/empty/summary.csv @@ -0,0 +1,20 @@ +Number of Reads,1327 +Reads With Valid Barcodes,0.023361 +Sequencing Saturation,-nan +Q30 Bases in CB+UMI,0.917408 +Q30 Bases in RNA read,0.711711 +Reads Mapped to Genome: Unique+Multiple,0.0874152 +Reads Mapped to Genome: Unique,0.0874152 +Reads Mapped to Transcriptome: Unique+Multipe Genes,0 +Reads Mapped to Transcriptome: Unique Genes,0 +Estimated Number of Cells,0 +Reads in Cells Mapped to Unique Genes,0 +Fraction of Reads in Cells,-nan +Mean Reads per Cell,0 +Median Reads per Cell,0 +UMIs in Cells,0 +Mean UMI per Cell,0 +Median UMI per Cell,0 +Mean Genes per Cell,0 +Median Genes per Cell,0 +Total Genes Detected,0 \ No newline at end of file diff --git a/src/stats/generate_pool_statistics/config.vsh.yaml b/src/stats/generate_pool_statistics/config.vsh.yaml new file mode 100644 index 00000000..3c9263bc --- /dev/null +++ b/src/stats/generate_pool_statistics/config.vsh.yaml @@ -0,0 +1,56 @@ +name: generate_pool_statistics +namespace: "stats" +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ author, maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ contributor ] +argument_groups: + - name: "Arguments" + arguments: + - name: "--nrReadsNrGenesPerChrom" + type: file + multiple: true + description: | + Path to an output file that contains a .tsv formatted table describing + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads + column) and the number of genes on that chromosome that had at least one + read mapped to it (NumberOfGenes). + direction: input + default: [processedBamFile_well1.tsv, processedBamfile_well2.tsv] + - name: "--nrReadsNrGenesPerChromPool" + direction: output + type: file + multiple: false + description: | + Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom files. Describes + per chromosome (as columns) the number of reads, as well as the total number + of reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial + reads. + example: "nrReadsNrGenesPerChrom.txt" + +resources: +- type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pandas + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/stats/generate_pool_statistics/script.py b/src/stats/generate_pool_statistics/script.py new file mode 100644 index 00000000..e6b57ca4 --- /dev/null +++ b/src/stats/generate_pool_statistics/script.py @@ -0,0 +1,94 @@ +import pandas as pd +from pathlib import Path +import re + +### VIASH START +par = { + "nrReadsNrGenesPerChrom": ["src/stats/generate_pool_statistics/test1.tsv", "src/stats/generate_pool_statistics/test2.tsv"], + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChrom_pool.txt" +} + +### VIASH END + +INDEX_COL = ["WellBC", "WellID"] + +if __name__ == "__main__": + ######### + # nrReadsNrGenesPerChrom file + ######### + nr_reads_nr_genes_wells = [] + par["nrReadsNrGenesPerChrom"] = list(map(Path, par["nrReadsNrGenesPerChrom"])) + for nr_reads_nr_genes_file in par["nrReadsNrGenesPerChrom"]: + nr_reads_nr_gene_well = pd.read_csv(nr_reads_nr_genes_file, + header=0, delimiter="\t", + dtype={"WellBC": pd.StringDtype(), + "WellID": pd.StringDtype(), + "Chr": pd.StringDtype(), + "NumberOfReads": pd.UInt64Dtype(), + "NumberOfGenes": pd.UInt64Dtype()}) + if nr_reads_nr_gene_well.empty: + raise ValueError(f"{nr_reads_nr_genes_file.name} does not seem to contain any information!") + nr_reads_nr_genes_wells.append(nr_reads_nr_gene_well) + nr_reads_nr_genes_pool = pd.concat(nr_reads_nr_genes_wells, ignore_index=True,) + total_nr_reads_per_chromosome = nr_reads_nr_genes_pool.pivot_table(index=INDEX_COL, columns="Chr", + values=["NumberOfReads"], fill_value=0, + aggfunc="sum").droplevel(0, axis=1) + total_nr_reads_per_chromosome.columns.name = None + # Remove scaffolds/chromosomes with no counts + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.loc[:, (total_nr_reads_per_chromosome != 0).any(axis=0)] + ##### Total number of genes from all chromosomes + total_nr_genes = nr_reads_nr_genes_pool.loc[:, INDEX_COL + ['NumberOfGenes']].groupby(["WellBC", "WellID"]).sum() + + ##### Total counts across (irrespective of chromosome) + total_sum_of_reads = total_nr_reads_per_chromosome.sum(numeric_only=True, axis=1) + + ##### Logic to split up chromosome per type + chromosome_names = total_nr_reads_per_chromosome.columns.to_list() + chr_regex = re.compile(r"^(chr)?\d+") + matching_chromosomes = [chr_name for chr_name + in chromosome_names + if chr_regex.match(chr_name)] + sex_chromosome_names = ["X", "Y"] + mitochondrial_chr_name = "MT" + # This is logic from the original HT pipeline, + # only when all of the matched chromosomes start with "chr", the mitochonrial, X and Y + # chromosomes should also start with 'chr' + if all(chr_name.startswith("chr") for chr_name in matching_chromosomes): + sex_chromosome_names += ["chrX", "chrY"] + mitochondrial_chr_name = "chrM" + + ###### Counts for mitochondrial reads + try: + mitochondrial_reads = total_nr_reads_per_chromosome.loc[:,mitochondrial_chr_name] + except KeyError: + mitochondrial_reads = 0 + percentage_mitochondrial_reads = round(mitochondrial_reads / total_sum_of_reads * 100, 2) + + ###### Counts for ERCC reads + total_ercc_reads = total_nr_reads_per_chromosome.filter(regex=r"^ERCC").sum(axis=1) + percentage_ercc_reads = round(total_ercc_reads / total_sum_of_reads * 100, 2) + + ###### Counts for nuclear chromosomes + total_chromosomal_reads = total_nr_reads_per_chromosome.loc[:,matching_chromosomes].sum(axis=1) + percentage_chromosomal_reads = round(total_chromosomal_reads / total_sum_of_reads * 100, 2) + + cols_to_add = { + "pctChrom": percentage_chromosomal_reads, + "pctMT": percentage_mitochondrial_reads, + "pctERCC": percentage_ercc_reads, + "SumReads": total_sum_of_reads, + "NumberOfGenes": total_nr_genes, + "NumberOfERCCReads": total_ercc_reads, + "NumberOfChromReads": total_chromosomal_reads, + "NumberOfMTReads": mitochondrial_reads, + } + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.assign( + **cols_to_add + ) + + total_nr_reads_per_chromosome.reset_index(names=INDEX_COL)\ + .to_csv(par["nrReadsNrGenesPerChromPool"], sep="\t", + header=True, index=False, float_format="%g", + columns=tuple(INDEX_COL) + tuple(chromosome_names) + tuple(cols_to_add.keys()) + ) + diff --git a/src/stats/generate_pool_statistics/test.py b/src/stats/generate_pool_statistics/test.py new file mode 100644 index 00000000..7fb80b33 --- /dev/null +++ b/src/stats/generate_pool_statistics/test.py @@ -0,0 +1,269 @@ +from uuid import uuid4 +from textwrap import dedent +from subprocess import CalledProcessError +import pandas as pd +import re +import pytest +import sys +from pathlib import Path + +### VIASH START +meta = { + "resources_dir": "./src/stats/generate_pool_statistics/", + "executable": "target/executable/stats/generate_pool_statistics/generate_pool_statistics", + "config": "src/stats/generate_pool_statistics/config.vsh.yaml" +} +### VIASH END + +@pytest.fixture +def random_path(tmp_path): + def wrapper(extension=None): + extension = "" if not extension else f".{extension}" + return Path(tmp_path / f"{uuid4()}{extension}") + return wrapper + + +@pytest.fixture +def random_tsv_path(random_path): + def wrapper(): + return random_path(".tsv") + return wrapper + + +@pytest.fixture +def simple_input_file_one(random_tsv_path, request): + prefix = request.param + mito_name = f"{prefix}M{'T' if not prefix else ''}" + + contents = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + AGG A1 {prefix}1 2 1 + AGG A1 {prefix}2 3 2 + AGG A1 {prefix}3 4 2 + AGG A1 {mito_name} 4 2 + AGG A1 {prefix}X 2 3 + AGG A1 ERCC-1 1 1 + AGG A1 ERCC-2 1 1 + """) + output_file = random_tsv_path() + with output_file.open("w") as open_file: + open_file.write(contents) + return output_file + + +@pytest.fixture +def simple_input_file_two(random_tsv_path, request): + prefix = request.param + contents = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + CCC B2 {prefix}2 2 1 + CCC B2 {prefix}3 3 2 + CCC B2 {prefix}5 4 2 + CCC B2 {prefix}1 4 2 + CCC B2 {prefix}Y 2 3 + CCC B2 {prefix}X 2 3 + CCC B2 ERCC-3 1 1 + CCC B2 ERCC-2 1 1 + """) + output_file = random_tsv_path() + with output_file.open("w") as open_file: + open_file.write(contents) + return output_file + +@pytest.fixture +def empty_input_file(random_tsv_path): + contents = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + """) + output_file = random_tsv_path() + with output_file.open("w") as open_file: + open_file.write(contents) + return output_file + + +@pytest.mark.parametrize("simple_input_file_one,simple_input_file_two,expected", [("chr", "chr", "chr"), ("", "", "")], + indirect=["simple_input_file_one", "simple_input_file_two"]) +def test_generate_pool_statistics_simple(run_component, simple_input_file_one, + simple_input_file_two, random_tsv_path, expected): + + output_path = random_tsv_path() + run_component([ + "--nrReadsNrGenesPerChrom", simple_input_file_one, + "--nrReadsNrGenesPerChrom", simple_input_file_two, + "--nrReadsNrGenesPerChromPool", output_path + ]) + mito_name = f"{expected}M{'T' if not expected else ''}" + expected_dict = { + "WellBC": ["AGG", "CCC"], + "WellID": ["A1", "B2"], + "ERCC-1": ["1", "0"], + "ERCC-2": ["1", "1"], + "ERCC-3": ["0", "1"], + f"{expected}1": ["2", "4"], + f"{expected}2": ["3", "2"], + f"{expected}3": ["4", "3"], + f"{expected}5": ["0", "4"], + f"{mito_name}": ["4", "0"], + f"{expected}X": ["2", "2"], + f"{expected}Y": ["0", "2"], + "SumReads": ["17", "19"], + "pctMT": ["23.53", "0"], + "pctERCC": ["11.76", "10.53"], + "pctChrom": ["52.94", "68.42"], + "NumberOfGenes": ["12", "15"], + "NumberOfMTReads": ["4", "0"], + "NumberOfChromReads": ["9", "13"], + "NumberOfERCCReads": ["2", "2"], + } + expected_frame = pd.DataFrame.from_dict(expected_dict, dtype=pd.StringDtype()) + assert output_path.is_file() + contents = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(contents, expected_frame, check_like=True) + + +def test_only_numerical_chromosomes(run_component, random_tsv_path): + """ + The chromosome column might be read as an integer instead of a string, + make sure that a numerical column only works. + """ + output_path = random_tsv_path() + contents1 = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + CCC B2 2 2 1 + CCC B2 3 3 2 + CCC B2 5 4 2 + CCC B2 1 4 2 + """) + input_file_1 = random_tsv_path() + with input_file_1.open("w") as open_file: + open_file.write(contents1) + + contents2 = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + AGG A1 2 2 1 + AGG A1 3 3 2 + AGG A1 5 4 2 + AGG A1 1 4 2 + """) + input_file_2 = random_tsv_path() + with input_file_2.open("w") as open_file: + open_file.write(contents2) + output_path = random_tsv_path() + run_component([ + "--nrReadsNrGenesPerChrom", input_file_1, + "--nrReadsNrGenesPerChrom", input_file_2, + "--nrReadsNrGenesPerChromPool", output_path + ]) + + expected_dict = { + "WellBC": ["AGG", "CCC"], + "WellID": ["A1", "B2"], + "1": ["4", "4"], + "2": ["2", "2"], + "3": ["3", "3"], + "5": ["4", "4"], + "pctChrom": ["100", "100"], + "pctMT": ["0", "0"], + "pctERCC": ["0", "0"], + "SumReads": ["13", "13"], + "NumberOfGenes": ["7", "7"], + "NumberOfERCCReads": ["0", "0"], + "NumberOfChromReads": ["13", "13"], + "NumberOfMTReads": ["0", "0"], + } + expected_frame = pd.DataFrame.from_dict(expected_dict, + dtype=pd.StringDtype()) + + assert output_path.is_file() + contents = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(contents, expected_frame, check_like=True) + + +@pytest.mark.parametrize("simple_input_file_one", [("")], + indirect=["simple_input_file_one"]) +def test_empty_input_raises(run_component, simple_input_file_one, empty_input_file, random_tsv_path): + """ + When an input file contains no data, raise an error. + """ + output_path = random_tsv_path() + with pytest.raises(CalledProcessError) as err: + run_component([ + "--nrReadsNrGenesPerChrom", simple_input_file_one, + "--nrReadsNrGenesPerChrom", empty_input_file, + "--nrReadsNrGenesPerChromPool", output_path + ]) + assert re.search( + rf"{empty_input_file.name} does not seem to contain any information", + err.value.stdout.decode("utf-8"), + ) + +def test_remove_chromosomes_with_no_counts(run_component, random_tsv_path): + """ + If a chromosome has no counts across all of the wells, it should + not be included in the output + """ + output_path = random_tsv_path() + contents1 = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + CCC B2 2 2 1 + CCC B2 3 3 2 + CCC B2 5 4 2 + CCC B2 1 4 2 + CCC B2 empty 0 0 + """) + input_file_1 = random_tsv_path() + with input_file_1.open("w") as open_file: + open_file.write(contents1) + + contents2 = dedent( + f"""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + AGG A1 2 2 1 + AGG A1 3 3 2 + AGG A1 5 4 2 + AGG A1 1 4 2 + AGG A1 empty 0 0 + """) + input_file_2 = random_tsv_path() + with input_file_2.open("w") as open_file: + open_file.write(contents2) + output_path = random_tsv_path() + run_component([ + "--nrReadsNrGenesPerChrom", input_file_1, + "--nrReadsNrGenesPerChrom", input_file_2, + "--nrReadsNrGenesPerChromPool", output_path + ]) + # Here, the chromosome called "empty" should not be included + expected_dict = { + "WellBC": ["AGG", "CCC"], + "WellID": ["A1", "B2"], + "1": ["4", "4"], + "2": ["2", "2"], + "3": ["3", "3"], + "5": ["4", "4"], + "pctChrom": ["100", "100"], + "pctMT": ["0", "0"], + "pctERCC": ["0", "0"], + "SumReads": ["13", "13"], + "NumberOfGenes": ["7", "7"], + "NumberOfERCCReads": ["0", "0"], + "NumberOfChromReads": ["13", "13"], + "NumberOfMTReads": ["0", "0"], + } + expected_frame = pd.DataFrame.from_dict(expected_dict, + dtype=pd.StringDtype()) + + assert output_path.is_file() + contents = pd.read_csv(output_path, sep="\t", dtype=pd.StringDtype()) + pd.testing.assert_frame_equal(contents, expected_frame, check_like=True) + + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/stats/generate_well_statistics/config.vsh.yaml b/src/stats/generate_well_statistics/config.vsh.yaml new file mode 100644 index 00000000..6a2118dd --- /dev/null +++ b/src/stats/generate_well_statistics/config.vsh.yaml @@ -0,0 +1,93 @@ +name: generate_well_statistics +namespace: "stats" +description: Generate summary statistics from BAM files generated by STAR solo. +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ author, maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ contributor ] +argument_groups: + - name: "Arguments" + arguments: + - name: "--input" + type: file + description: "The .bam file as returned by the mapping tool STAR." + direction: input + example: "input.bam" + - name: "--barcode" + type: string + description: | + The barcode for the well that is being processed. Is only used to add a metadata + column to all output files. + required: true + - name: "--well_id" + type: string + description: | + ID of this well. Only used to add a metadata column to the output files. + required: true + - name: "--processedBAMFile" + type: file + description: | + Path to a .tsv file listing, per read in the BAM file, + the value for the "CB", "UX", "GX" and "GN" tag, together with the + chromsome to which the read was mapped to. + direction: output + default: processedBamFile.txt + - name: "--nrReadsNrGenesPerChrom" + type: file + description: | + Path to an output file that contains a .tsv formatted table describing + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads + column) and the number of genes on that chromosome that had at least one + read mapped to it (NumberOfGenes). + default: nrReadsNrGenesPerChrom.txt + direction: output + - name: "--nrReadsNrUMIsPerCB" + type: file + description: | + Path to an output file that contains a .tsv formatted table describing + per barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads). + direction: output + default: nrReadsNrUMIsPerCB.txt + - name: "--umiFreqTop" + type: file + description: | + Path to an output file that contains a .tsv formatted table describing + per UMI (column UB) the frequency at which they occur in the reads (column + N). Only the top 100 UMIs are included. + default: umiFreqTop100.txt + direction: output + - name: "--threads" + type: integer + description: | + Number of threads to use for decompressing BAM files. + min: 1 + default: 1 +resources: +- type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + - path: test.sam + - path: empty.sam + +engines: + - type: docker + image: python:3.13-trixie + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pysam + - pandas + test_setup: + - type: python + packages: + - viashpy +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/stats/generate_well_statistics/empty.sam b/src/stats/generate_well_statistics/empty.sam new file mode 100644 index 00000000..199446e6 --- /dev/null +++ b/src/stats/generate_well_statistics/empty.sam @@ -0,0 +1,3 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:200 +@SQ SN:2 LN:50 \ No newline at end of file diff --git a/src/stats/generate_well_statistics/script.py b/src/stats/generate_well_statistics/script.py new file mode 100644 index 00000000..90a8397d --- /dev/null +++ b/src/stats/generate_well_statistics/script.py @@ -0,0 +1,83 @@ +import pysam +import pandas as pd +import logging + +### VIASH START +par = { + "input": "src/stats/generate_well_statistics/test.sam", + "processedBAMFile": "processedBamFile.txt", + "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChrom.txt", + "nrReadsNrUMIsPerCB": "nrReadsNrUMIsPerCB.txt", + "umiFreqTop": "umiFreqTop.txt", + "threads": 1, + "barcode": "ACGT", + "well_id": "A1", +} +### VIASH END +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +if __name__ == "__main__": + logger.info("Component started.") + parameters_str = [f'\t{param}: {param_val}\n' for param, param_val in par.items()] + logger.info("Parameters:\n%s", "".join(parameters_str).rstrip()) + logger.info("Opening '%s'", par["input"]) + samfile = pysam.AlignmentFile(par["input"], "rb", threads=par["threads"]) + all_tags = [] + index = [] + tags_selection = ("CB", "UB", "GX", "GN") + for aligned_segment in samfile: + tags = dict(aligned_segment.get_tags()) + all_tags.append(tags) + reference_name = aligned_segment.reference_name + index.append("*" if not reference_name else reference_name) + if not index: + # Workaround for https://github.com/pandas-dev/pandas/issues/58594 + tag_dataframe = pd.DataFrame([], index=[], columns=tags_selection) + else: + tag_dataframe = pd.DataFrame.from_records(all_tags, index=index, + columns=tags_selection) + tag_dataframe_to_write = tag_dataframe.copy() + logger.info("Done reading BAM file. Found %i entries", tag_dataframe.shape[0]) + tag_dataframe.assign(WellBC=par["barcode"], WellID=par["well_id"])\ + .reset_index(names="Chr")\ + .to_csv(par["processedBAMFile"], sep="\t", na_rep="", + header=True, index=False, + columns=("WellBC", "WellID", "Chr") + tags_selection) + logger.info("Constructing of dataframe done.") + # Number of genes that had a read mapped to them per chromosome, + # and the number of reads mapped to those genes per chromosome. + nr_reads_nr_genes = tag_dataframe.dropna(subset=["GX"]).groupby(level=0).agg( + NumberOfReads=pd.NamedAgg("GX", aggfunc="size"), + NumberOfGenes=pd.NamedAgg(column="GX", aggfunc="nunique") + ) + nr_reads_nr_genes = nr_reads_nr_genes.reindex(samfile.header.references, fill_value=0) + logger.info("Done calculating number of reads per gene and per chromesome. Writing to %s", + par['nrReadsNrGenesPerChrom']) + nr_reads_nr_genes.reset_index(names="Chr").assign(WellBC=par["barcode"], WellID=par["well_id"])\ + .to_csv(par["nrReadsNrGenesPerChrom"], sep="\t", + header=True, index=False, + columns=("WellBC", "WellID", "Chr", "NumberOfReads", "NumberOfGenes")) + + # Number of reads mapped to the reference, grouped by UMI + nr_read_per_umi = tag_dataframe.groupby('UB').size()\ + .drop("", errors="ignore").sort_values(ascending=False).head(100) + nr_read_per_umi_df = nr_read_per_umi.to_frame(name="N") + logger.info("Done calculating number of mapped reads per UMI, writing to %s", par["umiFreqTop"]) + nr_read_per_umi_df.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="UB")\ + .to_csv(par["umiFreqTop"], header=True, sep="\t", + index=False, columns=("WellBC", "WellID", "UB", "N")) + + # Total number of mapped reads and total number of UMIs (not grouped per chromosome) + nr_reads_and_umi_per_barcode = tag_dataframe.groupby(by="CB").agg( + NumberOfReads=pd.NamedAgg("CB", "size"), + nrUMIs=pd.NamedAgg("UB", "nunique") + ) + logger.info("Done calculating number of mapped reads and number of UMIs per Cell Barcode, writing to %s", + par["nrReadsNrUMIsPerCB"]) + nr_reads_and_umi_per_barcode.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="CB")\ + .to_csv(par["nrReadsNrUMIsPerCB"], sep="\t", header=True, + index=False, columns=("WellBC", "WellID", "CB", "NumberOfReads", "nrUMIs")) + logger.info("Finished!") \ No newline at end of file diff --git a/src/stats/generate_well_statistics/test.py b/src/stats/generate_well_statistics/test.py new file mode 100644 index 00000000..70b770c5 --- /dev/null +++ b/src/stats/generate_well_statistics/test.py @@ -0,0 +1,166 @@ +import sys +import pytest +import pysam +from uuid import uuid4 +from pathlib import Path +from textwrap import dedent + +### VIASH START +meta = { + "resources_dir": "./src/stats/generate_well_statistics/", + "executable": "target/executable/stats/generate_well_statistics/generate_well_statistics", + "config": "src/stats/generate_well_statistics/config.vsh.yaml" +} +### VIASH END + +def assert_file_content_equals(file_to_check, expected): + with file_to_check.open('r') as open_file: + contents = open_file.read() + assert contents == expected + + +@pytest.fixture +def input_sam_path(): + return Path(meta["resources_dir"]) / "test.sam" + + +@pytest.fixture +def random_path(tmp_path): + def wrapper(extension=None): + extension = "" if not extension else f".{extension}" + return tmp_path / f"{uuid4()}{extension}" + return wrapper + +@pytest.fixture +def random_bam_path(random_path): + def wrapper(): + return random_path(".bam") + return wrapper + + +@pytest.fixture +def sam_to_bam(random_bam_path): + def wrapper(sam_file): + out_path = random_bam_path() + with pysam.AlignmentFile(sam_file, "r") as infile, \ + pysam.AlignmentFile(out_path, "wb", template=infile) as outfile: + for s in infile: + outfile.write(s) + infile.close() + return out_path + return wrapper + + +@pytest.fixture +def empty_sam_path(): + return Path(meta["resources_dir"]) / "empty.sam" + + +def test_generate_well_statistics_simple_bam(run_component, input_sam_path, sam_to_bam, random_path): + bam_file = sam_to_bam(input_sam_path) + processed_bam = random_path("tsv") + reads_per_chromosome = random_path("tsv") + nr_reads_nr_umis_per_cb = random_path("tsv") + top_onehundred_umis = random_path("tsv") + run_component([ + "--input", bam_file, + "--processedBAMFile", processed_bam, + "--nrReadsNrGenesPerChrom", reads_per_chromosome, + "--nrReadsNrUMIsPerCB", nr_reads_nr_umis_per_cb, + "--umiFreqTop", top_onehundred_umis, + "--barcode", "ACGT", + "--well_id", "A1", + ]) + for file_path in (processed_bam, reads_per_chromosome, + nr_reads_nr_umis_per_cb, top_onehundred_umis): + assert file_path.is_file() + + expected_processed_bam = \ + dedent("""\ + WellBC WellID Chr CB UB GX GN + ACGT A1 1 ACA CGG gene1 gene1 + ACGT A1 1 ACA CGG gene1 gene1 + ACGT A1 2 GGG GTT gene2 gene2 + ACGT A1 2 GGG GTC gene3 gene3 + """) + + expected_reads_per_chromosome = \ + dedent("""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + ACGT A1 1 2 1 + ACGT A1 2 2 2 + """) + + expected_nr_reads_nr_umis_per_cb = \ + dedent("""\ + WellBC WellID CB NumberOfReads nrUMIs + ACGT A1 ACA 2 1 + ACGT A1 GGG 2 2 + """) + + expected_top_onehundred_umis = \ + dedent("""\ + WellBC WellID UB N + ACGT A1 CGG 2 + ACGT A1 GTC 1 + ACGT A1 GTT 1 + """) + + assert_file_content_equals(processed_bam, expected_processed_bam) + assert_file_content_equals(reads_per_chromosome, expected_reads_per_chromosome) + assert_file_content_equals(nr_reads_nr_umis_per_cb, expected_nr_reads_nr_umis_per_cb) + assert_file_content_equals(top_onehundred_umis, expected_top_onehundred_umis) + + +def test_empty_sam(run_component, empty_sam_path, sam_to_bam, random_path): + """ + Test an empty bam file. Make sure that chromosomes with mapped reads + are still represented. Ran into issue https://github.com/pandas-dev/pandas/pull/59258 + """ + bam_file = sam_to_bam(empty_sam_path) + processed_bam = random_path("tsv") + reads_per_chromosome = random_path("tsv") + nr_reads_nr_umis_per_cb = random_path("tsv") + top_onehundred_umis = random_path("tsv") + run_component([ + "--input", bam_file, + "--processedBAMFile", processed_bam, + "--nrReadsNrGenesPerChrom", reads_per_chromosome, + "--nrReadsNrUMIsPerCB", nr_reads_nr_umis_per_cb, + "--umiFreqTop", top_onehundred_umis, + "--barcode", "ACGT", + "--well_id", "A1", + ]) + for file_path in (processed_bam, reads_per_chromosome, + nr_reads_nr_umis_per_cb, top_onehundred_umis): + assert file_path.is_file() + + expected_processed_bam = \ + dedent("""\ + WellBC WellID Chr CB UB GX GN + """) + + expected_reads_per_chromosome = \ + dedent("""\ + WellBC WellID Chr NumberOfReads NumberOfGenes + ACGT A1 1 0 0 + ACGT A1 2 0 0 + """) + + expected_nr_reads_nr_umis_per_cb = \ + dedent("""\ + WellBC WellID CB NumberOfReads nrUMIs + """) + + expected_top_onehundred_umis = \ + dedent("""\ + WellBC WellID UB N + """) + + assert_file_content_equals(processed_bam, expected_processed_bam) + assert_file_content_equals(reads_per_chromosome, expected_reads_per_chromosome) + assert_file_content_equals(nr_reads_nr_umis_per_cb, expected_nr_reads_nr_umis_per_cb) + assert_file_content_equals(top_onehundred_umis, expected_top_onehundred_umis) + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) \ No newline at end of file diff --git a/src/stats/generate_well_statistics/test.sam b/src/stats/generate_well_statistics/test.sam new file mode 100644 index 00000000..9fe9556f --- /dev/null +++ b/src/stats/generate_well_statistics/test.sam @@ -0,0 +1,7 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:200 +@SQ SN:2 LN:50 +test_1 16 1 22 255 1M * 0 0 C I NH:i:1 HI:i:1 nM:i:0 AS:i:47 CR:Z:ACA UR:Z:CGG GX:Z:gene1 GN:Z:gene1 CB:Z:ACA UB:Z:CGG +test_2 16 1 22 255 1M * 0 0 G ! NH:i:1 HI:i:1 nM:i:0 AS:i:47 CR:Z:ACA UR:Z:CGG GX:Z:gene1 GN:Z:gene1 CB:Z:ACA UB:Z:CGG +test_3 0 2 40 255 1M * 0 0 T ! NH:i:1 HI:i:1 nM:i:0 AS:i:47 CR:Z:GGG UR:Z:GTT GX:Z:gene2 GN:Z:gene2 CB:Z:GGG UB:Z:GTT +test_4 0 2 60 255 1M * 0 0 C ! NH:i:1 HI:i:1 nM:i:0 AS:i:47 CR:Z:GGG UR:Z:GTC GX:Z:gene3 GN:Z:gene3 CB:Z:GGG UB:Z:GTC \ No newline at end of file diff --git a/src/utils/concatRuns/config.vsh.yaml b/src/utils/concatRuns/config.vsh.yaml new file mode 100644 index 00000000..272026bc --- /dev/null +++ b/src/utils/concatRuns/config.vsh.yaml @@ -0,0 +1,43 @@ +name: concatRuns +namespace: utils +description: | + Concatenate well FASTQ files from different runs in order to increase sequencing depth. +arguments: + - name: "--input_r1" + type: file + required: true + multiple: true + - name: "--input_r2" + type: file + required: true + multiple: true + - name: "--sample_id" + type: string + required: true + - name: "--output_r1" + type: file + multiple: true + description: Path to read 1 fastq/fasta file + direction: output + - name: "--output_r2" + type: file + multiple: true + description: Path to read 2 fastq/fasta file + direction: output +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +dependencies: + - name: concat_text + repository: cb +repositories: + - name: cb + type: vsh + repo: craftbox + tag: v0.3.0 +runners: + - type: nextflow + +engines: + - type: native diff --git a/src/utils/concatRuns/main.nf b/src/utils/concatRuns/main.nf new file mode 100644 index 00000000..7550ebee --- /dev/null +++ b/src/utils/concatRuns/main.nf @@ -0,0 +1,128 @@ +workflow run_wf { + + take: + input_ch + + main: + // Count the number of input events per sample + // Results from events with the same sample ID need to be concatenated. + event_counts_ch = input_ch + | map {id, state -> + def new_state = state + ["event_id": id] + def new_event = [state.sample_id, new_state] + return new_event + } + | groupTuple(by: 0) + | flatMap { id, states -> + def orig_event_ids = states.collect{it.event_id} + def new_events = orig_event_ids.collect{ orig_event_id -> + [orig_event_id, ["n_events": states.size()]] + } + return new_events + } + + + // The number of events per sample needs is passed number to `groupTuple()` + // so that it can emit the sample as soon as it is ready. This makes sure + // that the samples are processed asynchronously. + output_ch = input_ch.join(event_counts_ch) + | flatMap {id, state_demultiplex, state_event_counts -> + assert state_demultiplex.input_r1.size() == state_demultiplex.input_r2.size(), + "Expected output from well demultiplexing to contain equal amount or forward and reverse FASTQ files." + def new_states = [state_demultiplex.input_r1, state_demultiplex.input_r2].transpose().collect{ fastq_files -> + def (r1_file, r2_file) = fastq_files + def regex = ~/^(\w+)_R[12]{1}_001\.fastq(\.gz)?$/ + def parsed_file_name = r1_file.name =~ regex + def parsed_file_name_r2 = r2_file.name =~ regex + def well_id = parsed_file_name[0][1] + def well_id_r2 = parsed_file_name_r2[0][1] + + assert (well_id.length() != 0) && (well_id == well_id_r2) + def new_state = state_demultiplex + [ + "input_r1": r1_file, + "input_r2": r2_file, + "event_id": id, + ] + def group_settings = groupKey("${state_demultiplex.sample_id}_${well_id}", state_event_counts.n_events) + return [group_settings, new_state] + + } + return new_states + } + | groupTuple(by: 0, sort: "hash", remainder: true) + | map {group_settings, sample_states -> + def input_r1 = sample_states.collect{it.input_r1}.flatten() + def input_r2 = sample_states.collect{it.input_r2}.flatten() + def event_ids = sample_states.collect{it.event_id} + def sample_id_list = sample_states.collect{it.sample_id}.unique() + assert sample_id_list.size() == 1 + def sample_id = sample_id_list[0] + assert input_r1.size() == input_r2.size() + + def new_state = [ + "input_r1": input_r1, + "input_r2": input_r2, + "event_id": event_ids, + "sample_id": sample_id, + ] + return [group_settings.target, new_state] + } + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_samples_r1", + runIf: {id, state -> state.input_r1.size() > 1}, + fromState: { id, state -> + def output_file_name = state.input_r1[0].name + [ + input: state.input_r1, + gzip_output: false, + output: output_file_name + ] + }, + toState: { id, result, state -> + def newState = state + [ input_r1: [ result.output ] ] + return newState + } + ) + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_samples_r2", + runIf: {id, state -> state.input_r2.size() > 1}, + fromState: { id, state -> + def output_file_name = state.input_r2[0].name + [ + input: state.input_r2, + gzip_output: false, + output: output_file_name + ] + }, + toState: { id, result, state -> + def newState = state + [ input_r2: [ result.output ] ] + return newState + } + ) + | map {id, state -> + def new_state = [state.sample_id, state] + return new_state + } + | groupTuple(by: 0, sort: 'hash') + | map {id, states -> + def new_state = [ + "input_r1": states.collect{it.input_r1}.flatten(), + "input_r2": states.collect{it.input_r2}.flatten(), + "_meta": ["join_id": states[0].event_id[0]] + ] + return [id, new_state] + } + | setState( + [ + "output_r1": "input_r1", + "output_r2": "input_r2", + "_meta": "_meta" + ] + ) + + emit: + output_ch + +} diff --git a/src/utils/listInputDir/config.vsh.yaml b/src/utils/listInputDir/config.vsh.yaml new file mode 100644 index 00000000..1c450ed2 --- /dev/null +++ b/src/utils/listInputDir/config.vsh.yaml @@ -0,0 +1,45 @@ +name: listInputDir +namespace: utils +description: List the contents of a directory and parse contained fastq files +arguments: + - name: "--input" + alternatives: [-i] + type: file + description: Path to the directory containing fastq files + required: true + example: fastq_dir + - name: --pools + description: "Pool names to include. By default all pools are selected for analysis." + type: string + multiple: true + - name: "--r1_output" + type: file + description: Path to read 1 fastq/fasta file + direction: output + - name: "--r2_output" + type: file + description: Path to read 2 fastq/fasta file + direction: output + - name: "--lane" + type: string + description: Lane nr + direction: output + - name: "--sample" + type: string + description: Sample nr + direction: output + - name: "--sample_id" + type: string + description: Sample name + direction: output + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +runners: + - type: nextflow + +engines: + - type: native diff --git a/src/utils/listInputDir/main.nf b/src/utils/listInputDir/main.nf new file mode 100644 index 00000000..4ed93cdc --- /dev/null +++ b/src/utils/listInputDir/main.nf @@ -0,0 +1,72 @@ +workflow run_wf { + + take: in_ + + main: + + out_ = in_ + | flatMap{ id, state -> + println "Looking for fastq files in ${state.input}" + def allFastqs = state.input + .listFiles() + .findAll{ + it.isFile() && + it.name ==~ /^.+\.fastq.gz$|^.+\.fastq$|^.+\.fasta$/ + } + println "Found ${allFastqs.size()} fastq/fasta files in ${state.input}" + assert allFastqs.size() > 0: "No fastq/fasta files found" + + println("Extracting information from fastq/fasta filenames") + def processed_fastqs = allFastqs.collect { f -> + def regex = ~/^(\S+)_S(\d+)_(L(\d+)_)?R(\d)_(\d+)\.fast[qa](\.gz)?$/ + def validFastq = f.name ==~ regex + + assert validFastq: "${f} does not match the regex ${regex}" + + def parsedFastq = f.name =~ regex + def lane = parsedFastq[0][3] + // Remove the trailing '_' + def lane_remove_trailing = lane == null ? "" : lane.replaceAll('_$', "") + def sample_id = parsedFastq[0][1] + if (sample_id in ["Undetermined"] || (state.pools && !state.pools.isEmpty() && !state.pools.contains(sample_id))) { + return null + } + return [ + "fastq": f, + "sample_id": sample_id, + "sample": parsedFastq[0][2], + "lane": lane_remove_trailing, + "read": parsedFastq[0][5], + ] + } + + println("Group paired fastq/fasta files") + def grouped = processed_fastqs + .findAll{it != null} + .groupBy({it.sample_id}, {it.lane}) + .collectMany{ sample_id, states_per_lane -> + def result = states_per_lane.collect{lane, lane_states -> + assert lane_states.size() == 2, "Expected to find two fastq files per lane! " + + "Found ${lane_states.size()}. State: ${states_per_lane}" + def r1_state = lane_states.find({it.read == "1"}) + def r2_state = lane_states.find({it.read == "2"}) + def fastq_state = [ + "r1_output": r1_state.fastq, + "r2_output": r2_state.fastq + ] + def new_state = fastq_state + + r1_state.findAll{it.key in ["sample_id", "sample", "lane"]} + + ["_meta": ["join_id": id]] + def new_id = lane?.trim() ? "${sample_id}_${lane}".toString() : sample_id + return [new_id, new_state] + } + return result + + } + return grouped + + } + + emit: out_ + +} diff --git a/src/utils/save_params/config.vsh.yaml b/src/utils/save_params/config.vsh.yaml new file mode 100644 index 00000000..d21924da --- /dev/null +++ b/src/utils/save_params/config.vsh.yaml @@ -0,0 +1,47 @@ +name: save_params +namespace: utils +description: | + Save parameters to a YAML file + +argument_groups: + - name: Inputs + arguments: + - name: "--id" + description: | + The id of the job + type: string + required: true + - name: "--params_yaml" + description: | + base64 encoded yaml containing the state + type: string + required: true + + - name: Outputs + arguments: + - name: "--output" + description: | + The output YAML file + type: file + direction: output + required: true + example: "output.yaml" + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: python:3.12-slim + setup: + - type: apt + packages: + - procps + - type: python + packages: + - pyyaml + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/utils/save_params/script.py b/src/utils/save_params/script.py new file mode 100644 index 00000000..0eed41d9 --- /dev/null +++ b/src/utils/save_params/script.py @@ -0,0 +1,28 @@ +import re +import yaml +import base64 + +## VIASH START +par = { + "id": "sample_one", + "params_yaml": "cGFyYW1zX3lhbWw6IHt9Cg==", + "output": "output.yaml" +} +## VIASH END + +class Dumper(yaml.Dumper): + def increase_indent(self, flow=False, indentless=False): + return super(Dumper, self).increase_indent(flow, False) + +def decode_params_yaml(encoded_yaml): + yaml_bytes = base64.b64decode(encoded_yaml) + yaml_string = yaml_bytes.decode('utf-8') + yaml_data = yaml.safe_load(yaml_string) + + return yaml_data + +params = decode_params_yaml(par['params_yaml']) + +with open(par["output"], 'w') as f: + yaml.dump(params, f, default_flow_style=False, Dumper=Dumper) + diff --git a/src/utils/save_params/test.py b/src/utils/save_params/test.py new file mode 100644 index 00000000..e69de29b diff --git a/src/workflows/htrnaseq/config.vsh.yaml b/src/workflows/htrnaseq/config.vsh.yaml new file mode 100644 index 00000000..cfba9f55 --- /dev/null +++ b/src/workflows/htrnaseq/config.vsh.yaml @@ -0,0 +1,144 @@ +name: htrnaseq +namespace: workflows +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] +argument_groups: + - name: Input arguments + arguments: + - name: --input_r1 + description: | + Forward reads in FASTQ format. Multiple files corresponding to different lanes can be provided which will + be demultiplexed separately before joining the results for each individual well. + type: file + required: true + multiple: true + - name: --input_r2 + description: | + Reverse reads in FASTQ format. Multiple files corresponding to different lanes can be provided which will + be demultiplexed separately before joining the results for each individual well. + type: file + required: true + multiple: true + - name: --barcodesFasta + type: file + required: true + - name: "--umi_length" + description: | + Length of the UMI sequences + type: integer + min: 1 + default: 10 + - name: --genomeDir + type: file + required: true + - name: --annotation + type: file + required: true + - name: --sample_id + type: string + required: false + description: | + Sample ID for the provided input files. If not provided, the value of --id + will be used. Input files will allways be demultiplexed separately, + but the FASTQs for wells with matching sample IDs will be concatenated before mapping. + - name: Output arguments + arguments: + - name: "--fastq_output" + description: "Directory containing output fastq files" + type: file + multiple: true + required: true + default: "fastq/*" + direction: output + - name: --star_output + description: Output from mapping with STAR + type: file + direction: output + multiple: true + required: true + default: star.$id/* + - name: "--nrReadsNrGenesPerChrom" + type: file + direction: output + required: true + default: "nrReadsNrGenesPerChrom.$id.txt" + - name: "--star_qc_metrics" + type: file + direction: output + required: true + default: "starLogs.$id.txt" + - name: "--eset" + type: file + direction: output + required: true + default: eset.$id.rds + - name: "--f_data" + type: file + direction: output + required: true + default: fData.$id.tsv + - name: "--p_data" + type: file + direction: output + required: true + default: pData.$id.tsv + - name: "--html_report" + type: file + direction: output + required: true + default: report.$id.html + - name: "--run_params" + type: file + direction: output + required: false + default: params.$id.yaml +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 +dependencies: + - name: stats/combine_star_logs + repository: local + - name: stats/generate_pool_statistics + repository: local + - name: stats/generate_well_statistics + repository: local + - name: workflows/well_demultiplex + repository: local + - name: workflows/well_metadata + repository: local + - name: parallel_map + repository: local + - name: eset/create_eset + repository: local + - name: eset/create_fdata + repository: local + - name: eset/create_pdata + repository: local + - name: report/create_report + repository: local + - name: utils/concatRuns + repository: local + - name: utils/save_params + repository: local +repositories: + - name: local + type: local + - name: bb + type: vsh + repo: biobox + tag: v0.3.1 + +runners: + - type: nextflow + +engines: + - type: native diff --git a/src/workflows/htrnaseq/integration_test.sh b/src/workflows/htrnaseq/integration_test.sh new file mode 100755 index 00000000..28490902 --- /dev/null +++ b/src/workflows/htrnaseq/integration_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# Make sure the workflow is built +viash ns build --setup cb --parallel + +export NXF_VER=24.04.4 + +set -eo pipefail + +nextflow \ + run . \ + -main-script src/workflows/htrnaseq/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf \ + -resume \ + -profile docker,local \ + --publish_dir output + +nextflow \ + run . \ + -main-script src/workflows/htrnaseq/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf2 \ + -resume \ + -profile docker,local \ + --publish_dir output2 + diff --git a/src/workflows/htrnaseq/main.nf b/src/workflows/htrnaseq/main.nf new file mode 100644 index 00000000..6b7fb1a4 --- /dev/null +++ b/src/workflows/htrnaseq/main.nf @@ -0,0 +1,359 @@ +workflow run_wf { + take: + raw_ch + + main: + input_ch = raw_ch + // Use the event ID as the default for the sample ID + | map {id, state -> + def sample_id = state.sample_id ?: id + def newState = state + ["sample_id": sample_id, "run_id": id] + return [id, newState] + } + + | save_params.run( + runIf: { id, state -> + state.run_params != null + }, + fromState: {id, state -> + // Define the function before using it + def convertPaths + convertPaths = { value -> + if (value instanceof java.nio.file.Path) + return value.toUriString() + else if (value instanceof List) + return value.collect { convertPaths(it) } + else if (value instanceof Collection) + throw new UnsupportedOperationException("Collections other than Lists are not supported") + else + return value + } + + // Apply conversion to all state values + def convertedState = state.collectEntries { k, v -> [(k): convertPaths(v)] } + + def yaml = new org.yaml.snakeyaml.Yaml() + def yamlString = yaml.dump(convertedState) + def encodedYaml = yamlString.bytes.encodeBase64().toString() + + return [ + "id": id, + "params_yaml": encodedYaml, + "output": state.run_params + ] + }, + toState: ["run_params": "output"] + ) + + // The featureData only has one requirement: the genome annotation. + // It can be generated straight away. Most of the time, there is one shared + // annotation for all of the inputs and the fData should only be calculated once. + // The state is manpulated in such a way that there is one event created per unique + // input annotation file. In turn, the featureData file can joined into the original input + // channel which allows it to be shared across events if required. + f_data_ch = input_ch + | toSortedList() + | flatMap {ids_and_states -> + def annotation_files = ids_and_states.inject([:]){ old_state, id_and_state -> + def (id, state) = id_and_state + def annotation_file = state.annotation + def new_state = old_state + [(annotation_file): (old_state.getOrDefault(annotation_file, []) + [id])] + return new_state + } + def file_names = annotation_files.keySet().collect{it.name} + assert (file_names.toSet().size() == file_names.size()), + "Please make sure that the annotation files have unique file names." + def new_states = annotation_files.collect{annotation_file, value -> + def new_state = [annotation_file.name , ["annotation": annotation_file, "event_ids": value]] + return new_state + } + return new_states + } + | create_fdata.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "gtf": "annotation", + "output": "f_data" + ], + toState: ["f_data": "output"] + ) + | flatMap {_, state -> + def new_states = state.event_ids.collect{event_id -> + [event_id, ["f_data": state.f_data]] + } + return new_states + } + + // Perform mapping of each well. + demultiplex_ch = input_ch + | well_demultiplex.run( + fromState: [ + "input_r1": "input_r1", + "input_r2": "input_r2", + "barcodesFasta": "barcodesFasta", + ], + toState: {id, result, state -> + def all_fastq = result.output_r1 + result.output_r2 + def output_dir = all_fastq.collect{it.parent}.unique() + assert output_dir.size() == 1, "Expected output from well demultiplexing (id $id) to reside into one directory. Found: $output_dir" + def new_state = state + [ + "input_r1": result.output_r1, + "input_r2": result.output_r2, + "fastq_output_directory": output_dir[0], + ] + return new_state + } + ) + + fastq_output_directory_ch = demultiplex_ch + | map {id, state -> + def new_event = [state.sample_id, state] + return new_event + } + | groupTuple(by: 0, sort: "hash") + | map {id, states -> + def fastq_output_dirs = states.collect{it.fastq_output_directory} + def new_state = ["fastq_output_directory": fastq_output_dirs] + def new_event = [id, new_state] + return [id, new_state] + } + + + concat_samples_ch = demultiplex_ch.join(f_data_ch) + | map {id, demultiplex_state, f_data_state -> + def newState = demultiplex_state + ["f_data": f_data_state["f_data"]] + [id, newState] + } + | concatRuns.run( + fromState: [ + "input_r1": "input_r1", + "input_r2": "input_r2", + "sample_id": "sample_id", + ], + toState: {id, result, state -> + def state_overwite = [ + "input_r1": result.output_r1, + "input_r2": result.output_r2, + "_meta": ["join_id": state.run_id] + ] + return state + state_overwite + } + ) + + pool_ch = concat_samples_ch.join(fastq_output_directory_ch) + | map {id, concat_state, fastq_output_directory_state -> + def new_state = concat_state + fastq_output_directory_state + return [id, new_state] + } + | parallel_map.run( + directives: ["label": ["highmem", "lowcpu"]], + fromState: {id, state -> + [ + "input_r1": state.input_r1, + "input_r2": state.input_r2, + "barcodesFasta": state.barcodesFasta, + "umiLength": state.umi_length, + "output": state.star_output[0], + "genomeDir": state.genomeDir, + ] + }, + toState: [ + "star_output": "output", + ] + ) + // Split the events from 1 event per pool into events per well + // and add extra metadata about the wells to the state. + | well_metadata.run( + fromState: [ + "barcodesFasta": "barcodesFasta", + "input_r1": "input_r1", + "input_r2": "input_r2", + "star_mapping": "star_output" + ], + toState: [ + "input_r1": "output_r1", + "input_r2": "output_r2", + "pool": "pool", + "well_id": "well_id", + "barcode": "barcode", + "lane": "lane", + "n_wells": "n_wells", + "star_mapping": "well_star_mapping", + ] + ) + // Use the bam file to generate statistics + | generate_well_statistics.run( + directives: [label: ["verylowmem", "verylowcpu"]], + fromState: { id, state -> + [ + "input": state.star_mapping.resolve('Aligned.sortedByCoord.out.bam'), + "barcode": state.barcode, + "well_id": state.well_id, + ] + }, + toState: [ + "nrReadsNrGenesPerChromWell": "nrReadsNrGenesPerChrom", + ] + ) + // Join the events back to pool-level + | map {id, state -> + // Create a special groupKey, such that groupTuple + // knows when all the barcodes have been grouped into 1 event. + // This way the processing is as distributed as possible. + def key = groupKey(state.pool, state.n_wells) + def newEvent = [key, state] + return newEvent + } + // Use a custom sorting function because sort: 'hash' + // requires a hash to be calculated on every entry of the state + // This is inefficient when the number of events is large + // (i.e large number or barcodes). + // Sorting on lexographical order of the barcode is sufficient here. + | groupTuple(sort: {a, b -> a.barcode <=> b.barcode}) + | map {id, states -> + // Gather the keys from all states. for some state items, + // we need gather all the different items from across the states + def barcodes = states.collect{it.barcode} + assert barcodes.clone().unique().size() == barcodes.size(), \ + "Error when gathering information for pool ${id}, barcodes are not unique!" + def well_ids = states.collect{it.well_id} + assert well_ids.clone().unique().size() == well_ids.size(), \ + "Error when gathering information for pool ${id}, well IDs are not unique!" + def custom_state = [ + "input_r1": states.collect{it.input_r1}, + "input_r2": states.collect{it.input_r2}, + "barcode": barcodes, + "well_id": well_ids, + "star_mapping": states.collect{it.star_mapping}, + // Well and pool stats should be carefully kept separate. + // The workflow argument points to the name for the pool statistics: + "nrReadsNrGenesPerChromWell": states.collect{it.nrReadsNrGenesPerChromWell}, + "nrReadsNrGenesPerChromPool": states[0].nrReadsNrGenesPerChrom + ] + //For many state items, the value is the same across states. + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(custom_state.keySet()) + // All other state should have a unique value + def old_state_items = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + + def new_state = custom_state + old_state_items + [id.getGroupTarget(), new_state] + } + + pool_statistics_ch = pool_ch + | generate_pool_statistics.run( + directives: ["label": ["lowmem", "verylowcpu"]], + fromState: [ + "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChromWell", + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool" + ], + toState: [ + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool" + ] + ) + + // The statistics from the STAR logs of different wells are joined + // on pool level + star_logs_ch = pool_ch + | combine_star_logs.run( + directives: ["label": ["lowmem", "verylowcpu"]], + fromState: {id, state -> [ + "star_logs": state.star_output.collect{it.resolve("Log.final.out")}, + "gene_summary_logs": state.star_output.collect{it.resolve("Solo.out/Gene/Summary.csv")}, + "reads_per_gene_logs": state.star_output.collect{it.resolve("ReadsPerGene.out.tab")}, + "barcodes": state.barcode, + "output": state.star_qc_metrics + ] + }, + toState: [ + "star_qc_metrics": "output", + ] + ) + + eset_ch = star_logs_ch.join(pool_statistics_ch, remainder: true) + | map {id, star_logs_state, pool_statistics_state -> + def newState = star_logs_state + ["nrReadsNrGenesPerChromPool": pool_statistics_state.nrReadsNrGenesPerChromPool] + return [id, newState] + } + | create_pdata.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "star_stats_file": "star_qc_metrics", + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool", + "output": "p_data" + ], + toState: ["p_data": "output"], + ) + | create_eset.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "pDataFile": "p_data", + "fDataFile": "f_data", + "mappingDir": "star_output", + "output": "eset", + "barcodes": "barcode", + "poolName": "pool", + ], + toState: [ + "eset": "output", + ] + ) + + report_channel = eset_ch + | toSortedList() + | map {ids_and_states -> + def states = ids_and_states.collect{it[1]} + def html_report = states[0].html_report + def ids = ids_and_states.collect{it[0]} + def esets = states.collect{it.eset} + ["report", ["esets": esets, "html_report": html_report, "original_ids": ids]] + } + | create_report.run( + fromState: [ + "eset": "esets", + "output_report": "html_report", + ], + toState: [ + "html_report": "output_report" + ] + ) + | flatMap {id, state -> + state.original_ids.collect{original_id -> + [original_id, ["html_report": state.html_report]] + } + } + + output_ch = eset_ch.join(report_channel) + | map {id, state_eset, state_report -> + def new_state = state_eset + [ + "html_report": state_report.html_report, + ] + [id, new_state] + } + | setState([ + "star_output": "star_output", + "fastq_output": "fastq_output_directory", + "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChromPool", + "star_qc_metrics": "star_qc_metrics", + "eset": "eset", + "f_data": "f_data", + "p_data": "p_data", + "html_report": "html_report", + "run_params": "run_params", + "_meta": "_meta", + ]) + + emit: + output_ch +} diff --git a/src/workflows/htrnaseq/nextflow.config b/src/workflows/htrnaseq/nextflow.config new file mode 100644 index 00000000..5a3b9c84 --- /dev/null +++ b/src/workflows/htrnaseq/nextflow.config @@ -0,0 +1,8 @@ + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString() +} + + +// include common settings +includeConfig("${params.rootDir}/src/config/labels.config") \ No newline at end of file diff --git a/src/workflows/htrnaseq/test.nf b/src/workflows/htrnaseq/test.nf new file mode 100644 index 00000000..24808ed0 --- /dev/null +++ b/src/workflows/htrnaseq/test.nf @@ -0,0 +1,70 @@ +nextflow.enable.dsl=2 +targetDir = params.rootDir + "/target/nextflow" + +include { htrnaseq } from targetDir + "/workflows/htrnaseq/main.nf" +include { check_eset } from targetDir + "/integration_test_components/htrnaseq/check_eset/main.nf" + + +params.resources_test = "gs://viash-hub-test-data/htrnaseq/v1/" + +workflow test_wf { + resources_test_file = file(params.resources_test) + input_ch = Channel.fromList([ + [ + id: "sample_one", + input_r1: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"), + input_r2: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"), + genomeDir: resources_test_file.resolve("genomeDir/gencode.v41.star.sparse"), + barcodesFasta: resources_test_file.resolve("360-wells-with-ids.fasta"), + annotation: resources_test_file.resolve("genomeDir/gencode.v41.annotation.gtf.gz") + ], + [ + id: "sample_two", + input_r1: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"), + input_r2: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"), + genomeDir: resources_test_file.resolve("genomeDir/gencode.v41.star.sparse"), + barcodesFasta: resources_test_file.resolve("360-wells-with-ids.fasta"), + annotation: resources_test_file.resolve("genomeDir/gencode.v41.annotation.gtf.gz") + ] + ]) + | map{ state -> [state.id, state] } + | view { "Input: $it" } + | htrnaseq.run( + toState: [ + "eset": "eset", + "star_output": "star_output", + ] + ) + | check_eset.run( + runIf: {id, state -> id == "sample_one"}, + toState: [ + "eset": "eset", + "star_output": "star_output" + ] + ) +} + + +workflow test_wf2 { + // Test the edge case where one of the barcodes has no reads + resources_test_file = file(params.resources_test) + input_ch = Channel.fromList([ + [ + id: "sample_one", + input_r1: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"), + input_r2: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"), + genomeDir: resources_test_file.resolve("genomeDir/gencode.v41.star.sparse"), + barcodesFasta: resources_test_file.resolve("2-wells-1-no-reads.fasta"), + annotation: resources_test_file.resolve("genomeDir/gencode.v41.annotation.gtf.gz") + ], + ]) + | map{ state -> [state.id, state] } + | view { "Input: $it" } + | htrnaseq.run( + toState: [ + "eset": "eset", + "star_output": "star_output", + ] + ) +} + diff --git a/src/workflows/runner/config.vsh.yaml b/src/workflows/runner/config.vsh.yaml new file mode 100644 index 00000000..b6101e8a --- /dev/null +++ b/src/workflows/runner/config.vsh.yaml @@ -0,0 +1,124 @@ +name: runner +namespace: workflows +description: Runner for HT RNA-seq pipeline +argument_groups: + - name: Input arguments + arguments: + - name: --input + description: | + Base directory of the form `s3://Sequencing///`. + Must contains FASTQ files in the format `PoolName_S*_L*_R1_001.fastq.gz` where + * PoolName is a unique ID for the microwell plates or combination thereof. + * S followed by a running number: the sample number based on the order + that samples are listed in the sample sheet (that was used to demultiplex the pools) + starting with 1 (e.g. S1) + * (Optional) the lane number (e.g. L001) + * _001 fixed suffix. + type: file + required: true + - name: --barcodesFasta + type: file + required: true + - name: --genomeDir + type: file + required: true + - name: --annotation + type: file + required: true + - name: --pools + description: | + Filter the FASTQ files in the input directory to only include pools from the provided list. + Pool names are inferred from the FASTQ file names (see input argument for more information). + By default all pools are included. + type: string + multiple: true + - name: "--umi_length" + description: | + Length of the UMI sequences + type: integer + min: 1 + default: 10 + - name: "--run_params" + type: file + required: false + direction: output + default: params.yaml + - name: Metadata arguments + arguments: + - name: --id + description: Unique identifier for the run + type: string + - name: --project_id + description: Project ID + type: string + required: true + - name: --experiment_id + description: Experiment ID + type: string + required: true + - name: Publish arguments + arguments: + - name: --fastq_publish_dir + type: string + required: true + - name: --results_publish_dir + type: string + required: true + - name: Output arguments + arguments: + - name: "--star_output_dir" + type: file + direction: output + default: "star_output" + - name: "--nrReadsNrGenesPerChrom_dir" + type: file + direction: output + default: "nrReadsNrGenesPerChrom" + - name: "--star_qc_metrics_dir" + type: file + direction: output + default: "starLogs" + - name: "--eset_dir" + type: file + direction: output + default: "esets" + - name: "--f_data_dir" + type: file + direction: output + default: "fData" + - name: "--p_data_dir" + type: file + direction: output + default: "pData" + +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: disable_publishfiles_process.config + +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + +dependencies: + - name: utils/listInputDir + repository: local + - name: workflows/htrnaseq + repository: local + - name: io/publish_fastqs + repository: local + - name: io/publish_results + repository: local + - name: utils/save_params + repository: local + +runners: + - type: nextflow + config: + script: + - includeConfig("disable_publishfiles_process.config") + +engines: + - type: native diff --git a/src/workflows/runner/disable_publishfiles_process.config b/src/workflows/runner/disable_publishfiles_process.config new file mode 100644 index 00000000..6cff5e26 --- /dev/null +++ b/src/workflows/runner/disable_publishfiles_process.config @@ -0,0 +1,5 @@ +process { + withName: publishFilesProc { + publishDir = [ enabled: false ] + } +} \ No newline at end of file diff --git a/src/workflows/runner/integration_test.sh b/src/workflows/runner/integration_test.sh new file mode 100755 index 00000000..eb67d014 --- /dev/null +++ b/src/workflows/runner/integration_test.sh @@ -0,0 +1,24 @@ +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +set -eo pipefail + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +nextflow \ + run . \ + -main-script src/workflows/runner/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf \ + -resume \ + -profile docker,local + +nextflow \ + run . \ + -main-script src/workflows/runner/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf_with_lanes \ + -resume \ + -profile docker,local + diff --git a/src/workflows/runner/main.nf b/src/workflows/runner/main.nf new file mode 100644 index 00000000..d2111352 --- /dev/null +++ b/src/workflows/runner/main.nf @@ -0,0 +1,295 @@ +def date = new Date().format('yyyyMMdd_hhmmss') + +def viash_config = java.nio.file.Paths.get("${moduleDir}/_viash.yaml") +def version = get_version(viash_config) + +workflow run_wf { + take: + raw_ch + + main: + input_ch = raw_ch + // List the FASTQ files per input directory + // Be careful: an event per lane is created! + | map {id, state -> + def new_state = state + ["run_id": id] + return [id, new_state] + } + + save_params_ch = input_ch + | toSortedList() + | map { states -> + def new_id = "save_params" + def all_states = states.collect{it[1]} + def run_params_output_templates = all_states.collect{it.run_params} + assert run_params_output_templates.unique().size() == 1: "The value for the 'run_params' parameter is not the same across runs." + def new_state = ["run_params": run_params_output_templates[0], "all_states": all_states] + return [new_id, new_state] + } + | save_params.run( + key: "save_params_runner", + fromState: {id, state -> + + def convertPaths + convertPaths = { value -> + if (value instanceof java.nio.file.Path) + return value.toUriString() + else if (value instanceof List) + return value.collect { convertPaths(it) } + else if (value instanceof Collection) + throw new UnsupportedOperationException("Collections other than Lists are not supported") + else + return value + } + + // Apply conversion to all state values + def convertedState = state.all_states.collect{it.collectEntries { k, v -> [(k): convertPaths(v)] }} + + def yaml = new org.yaml.snakeyaml.Yaml() + def yamlString = yaml.dump(convertedState) + def encodedYaml = yamlString.bytes.encodeBase64().toString() + + return [ + "id": id, + "params_yaml": encodedYaml, + "output": state.run_params + ] + }, + toState: ["run_params": "output"] + ) + + htrnaseq_ch = input_ch + | map { id, state -> + // The argument names for this workflow and the htrnaseq workflow may overlap + // here, we store a copy in order to make sure to not accidentally overwrite the state. + def new_state = state + [ + "star_output_dir_workflow": state.star_output_dir, + "nrReadsNrGenesPerChrom_dir_workflow": state.nrReadsNrGenesPerChrom_dir, + "star_qc_metrics_dir_workflow": state.star_qc_metrics_dir, + "eset_dir_workflow": state.eset_dir, + "f_data_dir_workflow": state.f_data_dir, + "p_data_dir_workflow": state.p_data_dir + ] + return [id, new_state] + } + | listInputDir.run( + fromState: [ + "input": "input", + "pools": "pools", + ], + toState: { id, state, result -> + def clean_state = state.findAll{ it.key != "input" } + clean_state + result + } + ) + // ListInputDir puts the sample_id as the event ID (slot 0 from the tuple). + // The sample_id was inferred from the start of the file name, + // and it can be used to group the FASTQ files, because an input folder + // can contain input files from multiple samples (pools). Additionally, + // there might be multiple FASTQs for a single sample that correspond to the + // lanes. So the fastq files must be gathered across lanes and input folders + // in order to create an input lists for R1 and R2. + // The ID of the event here is important! It determines the name of the output + // folders for the FASTQ files and these folders are published as-is later. + // The folder where the FASTQ files are stored in should be named after the run ID. + | map {id, state -> ["${state.sample_id}/${state.run_id}".toString(), state]} + | groupTuple(by: 0, sort: "hash") + | map {id, states -> + def new_r1 = states.collect{it.r1_output} + def new_r2 = states.collect{it.r2_output} + // This assumes that, except for r1 and r2, + // the keys across the grouped states are the same. + // TODO: this can be asserted. + def new_state = states[0] + [ + "r1": new_r1, + "r2": new_r2, + ] + return [id, new_state] + } + | view {"Pool inputs after listing directory: $it"} + | htrnaseq.run( + args: [ + f_data: 'fData/$id.txt', + p_data: 'pData/$id.txt', + star_output: 'star_output/$id/*', + fastq_output: 'fastq/*', + eset: 'esets/$id.rds', + nrReadsNrGenesPerChrom: 'nrReadsNrGenesPerChrom/$id.txt', + star_qc_metrics: 'starLogs/$id.txt', + html_report: "report.html", + run_params: null + ], + fromState: [ + input_r1: "r1", + input_r2: "r2", + barcodesFasta: "barcodesFasta", + genomeDir: "genomeDir", + annotation: "annotation", + umi_length: "umi_length", + sample_id: "sample_id", + ], + toState: { id, result, state -> state + result } + ) + + // The HT-RNAseq workflow outputs multiple events, one per 'pool' (usually a plate) + // but for publishing the results, this is not handy because we want to use the $id + // variable as a pointer to the target data. + // So, we should combine everything together + results_publish_ch = htrnaseq_ch + | combine(save_params_ch) + | map {new_id, grouped_ch_state, save_params_id, save_params_state -> + def new_state = grouped_ch_state + ["run_params": save_params_state.run_params] + return [new_id, new_state] + } + | toSortedList + | map{ vs -> + def states = vs.collect{it[1]} + + // The STAR output is a directory for each well in a plate (or pool of plates). + // The wells are grouped into a directory per sample. The name of this directory should + // match the sample_id. + def star_output_samples = states.collectMany{state -> + state.star_output.collect{ + def star_sample_dir = it.parent + assert star_sample_dir.name == state.sample_id: "Unexpected state: the parent directory of STAR output \ + path '${it}' should match with the sample ID ${sample_id}" + star_sample_dir + } + } + def new_state = [ + "star_output": star_output_samples, + ] + + // Keys for which the values should be the same across samples + def state_keys_unique = [ + "html_report", + "project_id", + "experiment_id", + "star_output_dir_workflow", + "nrReadsNrGenesPerChrom_dir_workflow", + "star_qc_metrics_dir_workflow", + "eset_dir_workflow", + "f_data_dir_workflow", + "p_data_dir_workflow", + "f_data", + "run_params" + ] + def state_unique_keys = state_keys_unique.inject([:]) { state_to_update, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "State error: values for argument $argument_name should be the same across states. \ + Argument values: $argument_values" + // take the unique value from the set (there is only one) + def argument_value + argument_values.each { argument_value = it } + state_to_update + [(argument_name): argument_value] + } + + // Keys that just require gathering of values across samples + def state_keys_collect = [ + "nrReadsNrGenesPerChrom", + "star_qc_metrics", + "eset", + "p_data", + ] + def state_collect = state_keys_collect.collectEntries{ key_ -> + [key_, states.collect{it.get(key_)}] + } + + new_state = new_state + state_unique_keys + state_collect + [states[0].run_id, new_state] + } + | publish_results.run( + fromState: { id, state -> + def prefix = "${state.project_id}/${state.experiment_id}/data_processed/${date}_htrnaseq_${version}" + + println("Publising results to ${params.results_publish_dir}/${prefix}") + + [ + // Inputs + star_output: state.star_output, + nrReadsNrGenesPerChrom: state.nrReadsNrGenesPerChrom, + star_qc_metrics: state.star_qc_metrics, + eset: state.eset, + f_data: state.f_data, + p_data: state.p_data, + html_report: state.html_report, + run_params: state.run_params, + // Output locations + run_params_output: "${prefix}/${state.run_params.name}", + html_report_output: "${prefix}/${state.html_report.name}", + star_output_dir: "${prefix}/${state.star_output_dir_workflow}", + nrReadsNrGenesPerChrom_dir: "${prefix}/${state.nrReadsNrGenesPerChrom_dir_workflow}", + star_qc_metrics_dir: "${prefix}/${state.star_qc_metrics_dir_workflow}", + eset_dir: "${prefix}/${state.eset_dir_workflow}", + f_data_dir: "${prefix}/${state.f_data_dir_workflow}", + p_data_dir: "${prefix}/${state.p_data_dir_workflow}" + ] + }, + toState: { id, result, state -> result }, + directives: [ + publishDir: [ + path: "${params.results_publish_dir}", + overwrite: false, + mode: "copy" + ] + ] + ) + | setState([ + "star_output_dir", + "nrReadsNrGenesPerChrom_dir", + "star_qc_metrics_dir", + "eset_dir", + "f_data_dir", + "p_data_dir", + ] + ) + + fastq_publish_ch = htrnaseq_ch + // The output from the htrnaseq workflow is on sample (i.e. pool) level + // Multiple sequencing runs may have contributes to the FASTQ files from this pool. + // So the fastq_output is a list of directories, one for each run. + // We assume that the names of the folders containing the FASTQ files are equal to the pool names. + | flatMap {id, state -> + state.fastq_output.collect{fastq_dir -> + def run_id = fastq_dir.name + def new_id = "${run_id}/${state.sample_id}" + def new_state = [ + "fastq_output": fastq_dir.listFiles(), + "sample_id": state.sample_id, + "run_id": run_id, + "output": "${run_id}/${date}_htrnaseq_${version}/${state.sample_id}".toString() + ] + [new_id, new_state] + } + } + // A folder containing the FASTQ files from a certain pool may be present in the state from + // multiple samples; if that pool contributed to the data from those samples. + // Those FASTQ files will only be published once by filtering out the duplicate events here. + | unique{it[0]} + | publish_fastqs.run( + fromState: [ + "input": "fastq_output", + "output": "output", + ], + toState: { id, result, state -> state }, + directives: [ + publishDir: [ + path: "${params.fastq_publish_dir}", + overwrite: false, + mode: "copy" + ] + ] + ) + + emit: + results_publish_ch + +} + +def get_version(inputFile) { + def yamlSlurper = new groovy.yaml.YamlSlurper() + def loaded_viash_config = yamlSlurper.parse(file(inputFile)) + def version = (loaded_viash_config.version) ? loaded_viash_config.version : "unknown_version" + println("HT-RNAseq version to be used: ${version}") + return version +} \ No newline at end of file diff --git a/src/workflows/runner/nextflow.config b/src/workflows/runner/nextflow.config new file mode 100644 index 00000000..1c43d0d6 --- /dev/null +++ b/src/workflows/runner/nextflow.config @@ -0,0 +1,16 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString() +} + +process { + withName: publishStatesProc { + publishDir = [ enabled: false ] + } +} + +// include common settings +includeConfig("${params.rootDir}/src/config/labels.config") diff --git a/src/workflows/runner/test.nf b/src/workflows/runner/test.nf new file mode 100644 index 00000000..c9269907 --- /dev/null +++ b/src/workflows/runner/test.nf @@ -0,0 +1,330 @@ +import java.nio.file.Files +import nextflow.exception.WorkflowScriptErrorException + +def viash_config = java.nio.file.Paths.get("${params.rootDir}/target/nextflow/workflows/runner/_viash.yaml") + +def get_version(inputFile) { + def yamlSlurper = new groovy.yaml.YamlSlurper() + def loaded_viash_config = yamlSlurper.parse(file(inputFile)) + def version = (loaded_viash_config.version) ? loaded_viash_config.version : "unknown_version" + println("HT-RNAseq version to be used: ${version}") + return version +} + +// Create temporary directory for the publish_dir if it is not defined +if (!params.containsKey("publish_dir") && params.containsKey("publishDir")) { + params.publish_dir = params.publishDir +} + +if (!params.containsKey("publish_dir")) { + def tempDir = Files.createTempDirectory("demultiplex_runner_integration_test") + println "Created temp directory: $tempDir" + // Register shutdown hook to delete it on JVM exit + Runtime.runtime.addShutdownHook(new Thread({ + try { + // Delete directory recursively + Files.walk(tempDir) + .sorted(Comparator.reverseOrder()) + .forEach { Files.delete(it) } + println "Deleted temp directory: $tempDir" + } catch (Exception e) { + println "Failed to delete temp directory: $e" + } + })) + params.publish_dir = tempDir +} + +params.fastq_publish_dir = (file(params.publish_dir) / "fastq").toUriString() +params.results_publish_dir = (file(params.publish_dir) / "results").toUriString() +assert file(params.fastq_publish_dir).isEmpty() +assert file(params.results_publish_dir).isEmpty() + +// The module inherits the parameters defined before the include statement, +// therefore any parameters set afterwards will not be used by the module. + +include { runner } from params.rootDir + "/target/nextflow/workflows/runner/main.nf" +params.resources_test = params.rootDir + "/resources_test" + +workflow test_wf { + pipeline_version = get_version(viash_config) + resources_test = file(params.resources_test) + + // results_publish_dir and results_publish_dir are inherited using params + // but they must be defined in the state as well because viash will check + // if all arguments are present in the hashmap + output_ch = Channel.fromList([ + [ + id: "run_1", + input: resources_test.resolve("10k/SRR14730301"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ], + [ + id: "run_2", + input: resources_test.resolve("10k/SRR14730301"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ], + [ + id: "run_3", + input:resources_test.resolve("10k/SRR14730302"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ] + ]) + | map { state -> [state.id, state]} + | runner.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + "Output: $output" + } + + tosortedlistch = output_ch + | toSortedList() + | map {events -> + assert events.size() == 1, "Expected one events to be output, found ${events.size()}" + events + } + | map {states -> + def output_state = states[0][1] + assert output_state.eset_dir.listFiles().collect{it.name}.toSet() == ["VH02001612.rds", "VH02001614.rds"].toSet() + assert output_state.star_output_dir.listFiles().collect{it.name}.toSet() == ["VH02001612", "VH02001614"].toSet() + ["VH02001612", "VH02001614"].each{it -> + assert output_state.star_output_dir.resolve(it).listFiles().collect{it.name}.toSet() == ["ACACCGAATT", "GGCTATTGAT"].toSet() + } + assert output_state.star_qc_metrics_dir.listFiles().collect{it.name}.toSet() == ["VH02001612.txt", "VH02001614.txt"].toSet() + assert output_state.nrReadsNrGenesPerChrom_dir.listFiles().collect{it.name}.toSet() == ["VH02001612.txt", "VH02001614.txt"].toSet() + } + + + workflow.onComplete = { + try { + // Nexflow only allows exceptions generated using the 'error' function (which throws WorkflowScriptErrorException). + // So in order for the assert statement to work (or allow other errors to let the tests to fail) + // We need to wrap these in WorkflowScriptErrorException. See https://github.com/nextflow-io/nextflow/pull/4458/files + // The error message will show up in .nextflow.log + def fastq_subdir = file("${params.fastq_publish_dir}") + assert fastq_subdir.isDirectory() + def found_fastq_folders = fastq_subdir.listFiles().findAll{it.isDirectory()}.collect{it.name}.toSet() + def expected_run_folders = ["run_1", "run_2", "run_3"].toSet() + assert found_fastq_folders == expected_run_folders, "Expected correct run folders to be present. Found: ${found_fastq_folders}" + unique_dirs = [ + "run1": files("${fastq_subdir.toUriString()}/run_1/*_htrnaseq_${pipeline_version}", type: 'any'), + "run2": files("${fastq_subdir.toUriString()}/run_2/*_htrnaseq_${pipeline_version}", type: 'any'), + "run3": files("${fastq_subdir.toUriString()}/run_3/*_htrnaseq_${pipeline_version}", type: 'any'), + ] + assert unique_dirs.every{it.value.size() == 1} + unique_dirs = unique_dirs.collectEntries{k, v -> [k, v[0]]} + + assert unique_dirs.every{it.value.isDirectory()} + assert unique_dirs.collect{_key, _value -> _value.name}.toSet().size() == 1 + def expected_samples = [ + "run1": "VH02001612", + "run2": "VH02001612", + "run3": "VH02001614" + ] + + unique_dirs.each{_key, _value -> + def expected_sample = expected_samples[_key] + def expected_sample_dir = _value.resolve(expected_sample) + assert expected_sample_dir.isDirectory(), "Expected ${expected_sample} to be present in ${_value}" + def expected_fastq_files = [ + "A1_R1_001.fastq", "A1_R2_001.fastq", + "B1_R1_001.fastq", "B1_R2_001.fastq", + "unknown_R1_001.fastq", "unknown_R2_001.fastq"] + def found_files = files("${expected_sample_dir}/*.fastq", type: 'any') + assert found_files.every{it.isFile()} + assert found_files.collect{it.name}.toSet() == expected_fastq_files.toSet() + } + + def results_subdir = file("${params.results_publish_dir}") + assert fastq_subdir.isDirectory() + def expected_subdir = file("${results_subdir}/foo/bar/data_processed", type: 'any') + assert expected_subdir.isDirectory() + def expected_result_dir = files("${expected_subdir}/*_htrnaseq_${pipeline_version}", type: 'any') + assert expected_result_dir.size() == 1 + expected_result_dir = expected_result_dir[0] + assert expected_result_dir.isDirectory() + def expected_esets = ["VH02001612.rds", "VH02001614.rds"] + def found_esets = files("${expected_result_dir}/esets/*.rds", type: 'any') + assert found_esets.size() == 2 + assert found_esets.collect{it.name}.toSet() == expected_esets.toSet() + expected_table_filenames = ["VH02001612.txt", "VH02001614.txt"] + def found_pdata = files("${expected_result_dir}/pData/*.txt", type: 'any') + assert found_pdata.size() == 2 + assert found_pdata.collect{it.name}.toSet() == expected_table_filenames.toSet() + def found_nr_genes_nr_reads = files("${expected_result_dir}/nrReadsNrGenesPerChrom/*.txt", type: 'any') + assert found_nr_genes_nr_reads.size() == 2 + assert found_nr_genes_nr_reads.collect{it.name}.toSet() == expected_table_filenames.toSet() + def found_star_logs = files("${expected_result_dir}/starLogs/*.txt", type: 'any') + assert found_star_logs.size() == 2 + assert found_star_logs.collect{it.name}.toSet() == expected_table_filenames.toSet() + def star_output = file("${expected_result_dir}/star_output", type: 'any') + assert star_output.isDirectory() + + assert files("${star_output}/*", type: 'any').collect{it.name}.toSet() == ["VH02001612", "VH02001614"].toSet() + assert files("${star_output}/VH02001612/*", type: 'any').collect{it.name}.toSet() == ["ACACCGAATT", "GGCTATTGAT"].toSet() + assert files("${star_output}/VH02001614/*", type: 'any').collect{it.name}.toSet() == ["ACACCGAATT", "GGCTATTGAT"].toSet() + assert file("${expected_result_dir}/report.html").isFile() + assert file("${expected_result_dir}/params.yaml").isFile() + assert file("${expected_result_dir}/fData/fData.gencode.v41.annotation.gtf.gz.txt").isFile() + + } catch (Exception e) { + throw new WorkflowScriptErrorException("Integration test failed!", e) + } + } +} + + +workflow test_wf_with_lanes { + pipeline_version = get_version(viash_config) + resources_test = file(params.resources_test) + + // results_publish_dir and results_publish_dir are inherited using params + // but they must be defined in the state as well because viash will check + // if all arguments are present in the hashmap + output_ch = Channel.fromList([ + [ + id: "run_1", + input: resources_test.resolve("10k_with_lanes/SRR14730301"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ], + [ + id: "run_2", + input: resources_test.resolve("10k_with_lanes/SRR14730301"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ], + [ + id: "run_3", + input:resources_test.resolve("10k_with_lanes/SRR14730302"), + genomeDir: resources_test.resolve("genomeDir/subset/Homo_sapiens/v0.0.3"), + barcodesFasta: resources_test.resolve("2-wells-with-ids.fasta"), + annotation: resources_test.resolve("genomeDir/gencode.v41.annotation.gtf.gz"), + project_id: "foo", + experiment_id: "bar", + fastq_publish_dir: params.fastq_publish_dir, + results_publish_dir: params.results_publish_dir, + ] + ]) + | map { state -> [state.id, state]} + | runner.run( + toState: { id, output, state -> output + [orig_input: state.input] } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + "Output: $output" + } + + tosortedlistch = output_ch + | toSortedList() + | map {events -> + assert events.size() == 1, "Expected one events to be output, found ${events.size()}" + } + + + workflow.onComplete = { + try { + // Nexflow only allows exceptions generated using the 'error' function (which throws WorkflowScriptErrorException). + // So in order for the assert statement to work (or allow other errors to let the tests to fail) + // We need to wrap these in WorkflowScriptErrorException. See https://github.com/nextflow-io/nextflow/pull/4458/files + // The error message will show up in .nextflow.log + def fastq_subdir = file("${params.fastq_publish_dir}") + assert fastq_subdir.isDirectory() + def found_fastq_folders = fastq_subdir.listFiles().findAll{it.isDirectory()}.collect{it.name}.toSet() + def expected_run_folders = ["run_1", "run_2", "run_3"].toSet() + assert found_fastq_folders == expected_run_folders, "Expected correct run folders to be present. Found: ${found_fastq_folders}" + unique_dirs = [ + "run1": files("${fastq_subdir.toUriString()}/run_1/*_htrnaseq_${pipeline_version}", type: 'any'), + "run2": files("${fastq_subdir.toUriString()}/run_2/*_htrnaseq_${pipeline_version}", type: 'any'), + "run3": files("${fastq_subdir.toUriString()}/run_3/*_htrnaseq_${pipeline_version}", type: 'any'), + ] + assert unique_dirs.every{it.value.size() == 1} + unique_dirs = unique_dirs.collectEntries{k, v -> [k, v[0]]} + + assert unique_dirs.every{it.value.isDirectory()} + assert unique_dirs.collect{_key, _value -> _value.name}.toSet().size() == 1 + def expected_samples = [ + "run1": "VH02001612", + "run2": "VH02001612", + "run3": "VH02001614" + ] + + unique_dirs.each{_key, _value -> + def expected_sample = expected_samples[_key] + def expected_sample_dir = _value.resolve(expected_sample) + assert expected_sample_dir.isDirectory(), "Expected ${expected_sample} to be present in ${_value}" + def expected_fastq_files = [ + "A1_R1_001.fastq", "A1_R2_001.fastq", + "B1_R1_001.fastq", "B1_R2_001.fastq", + "unknown_R1_001.fastq", "unknown_R2_001.fastq"] + def found_files = files("${expected_sample_dir}/*.fastq", type: 'any') + assert found_files.every{it.isFile()} + assert found_files.collect{it.name}.toSet() == expected_fastq_files.toSet() + } + + def results_subdir = file("${params.results_publish_dir}") + assert fastq_subdir.isDirectory() + def expected_subdir = file("${results_subdir}/foo/bar/data_processed", type: 'any') + assert expected_subdir.isDirectory() + def expected_result_dir = files("${expected_subdir}/*_htrnaseq_${pipeline_version}", type: 'any') + assert expected_result_dir.size() == 1 + expected_result_dir = expected_result_dir[0] + assert expected_result_dir.isDirectory() + def expected_esets = ["VH02001612.rds", "VH02001614.rds"] + def found_esets = files("${expected_result_dir}/esets/*.rds", type: 'any') + assert found_esets.size() == 2 + assert found_esets.collect{it.name}.toSet() == expected_esets.toSet() + expected_table_filenames = ["VH02001612.txt", "VH02001614.txt"] + def found_pdata = files("${expected_result_dir}/pData/*.txt", type: 'any') + assert found_pdata.size() == 2 + assert found_pdata.collect{it.name}.toSet() == expected_table_filenames.toSet() + def found_nr_genes_nr_reads = files("${expected_result_dir}/nrReadsNrGenesPerChrom/*.txt", type: 'any') + assert found_nr_genes_nr_reads.size() == 2 + assert found_nr_genes_nr_reads.collect{it.name}.toSet() == expected_table_filenames.toSet() + def found_star_logs = files("${expected_result_dir}/starLogs/*.txt", type: 'any') + assert found_star_logs.size() == 2 + assert found_star_logs.collect{it.name}.toSet() == expected_table_filenames.toSet() + def star_output = file("${expected_result_dir}/star_output", type: 'any') + assert star_output.isDirectory() + + assert files("${star_output}/*", type: 'any').collect{it.name}.toSet() == ["VH02001612", "VH02001614"].toSet() + assert files("${star_output}/VH02001612/*", type: 'any').collect{it.name}.toSet() == ["ACACCGAATT", "GGCTATTGAT"].toSet() + assert files("${star_output}/VH02001614/*", type: 'any').collect{it.name}.toSet() == ["ACACCGAATT", "GGCTATTGAT"].toSet() + assert file("${expected_result_dir}/report.html").isFile() + assert file("${expected_result_dir}/params.yaml").isFile() + assert file("${expected_result_dir}/fData/fData.gencode.v41.annotation.gtf.gz.txt").isFile() + + } catch (Exception e) { + throw new WorkflowScriptErrorException("Integration test failed!", e) + } + } +} \ No newline at end of file diff --git a/src/workflows/well_demultiplex/config.vsh.yaml b/src/workflows/well_demultiplex/config.vsh.yaml new file mode 100644 index 00000000..28ec3216 --- /dev/null +++ b/src/workflows/well_demultiplex/config.vsh.yaml @@ -0,0 +1,81 @@ +name: well_demultiplex +namespace: workflows +description: Demultiplexing on well level +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] + - __merge__: /src/base/authors/marijke_van_moerbeke.yaml + roles: [ contributor ] +argument_groups: + - name: Input arguments + arguments: + - name: --input_r1 + description: | + Forward reads in FASTQ format. Multiple files can be provided which will + be demultiplexed separately before joining the results for each individual well. + type: file + required: true + multiple: true + - name: --input_r2 + description: | + Reverse reads in FASTQ format. Multiple files can be provided which will + be demultiplexed separately before joining the results for each individual well. + type: file + required: true + multiple: true + - name: --barcodesFasta + type: file + required: true + - name: Output arguments + arguments: + - name: --output_r1 + description: List of demultiplexed fastq files + type: file + direction: output + multiple: true + required: true + default: "fastq/*_R1_001.fastq" + - name: "--output_r2" + description: List of demultiplexed fastq files + type: file + direction: output + multiple: true + required: true + default: "fastq/*_R2_001.fastq" +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +# Test dataset: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM5357044 +test_resources: + - type: nextflow_script + path: test.nf + entrypoint: test_wf + - type: nextflow_script + path: test.nf + entrypoint: test_wf2 + +dependencies: + - name: cutadapt + repository: bb + - name: concat_text + repository: cb + - name: move_files_to_directory + repository: cb +repositories: + - name: bb + type: vsh + repo: biobox + tag: v0.3.1 + - name: cb + type: vsh + repo: craftbox + tag: v0.3.0 + +runners: + - type: nextflow + +engines: + - type: native + diff --git a/src/workflows/well_demultiplex/integration_test.sh b/src/workflows/well_demultiplex/integration_test.sh new file mode 100755 index 00000000..005a3daf --- /dev/null +++ b/src/workflows/well_demultiplex/integration_test.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# Make sure the workflow is built +viash ns build --setup cb --parallel + +export NXF_VER=24.04.4 + +nextflow \ + run . \ + -main-script src/workflows/well_demultiplex/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf \ + -resume \ + -profile docker,local \ + --publish_dir output + + +nextflow \ + run . \ + -main-script src/workflows/well_demultiplex/test.nf \ + -config ./src/config/labels.config \ + -entry test_wf2 \ + -resume \ + -profile docker,local \ + --publish_dir output_2 \ + diff --git a/src/workflows/well_demultiplex/main.nf b/src/workflows/well_demultiplex/main.nf new file mode 100644 index 00000000..c8286457 --- /dev/null +++ b/src/workflows/well_demultiplex/main.nf @@ -0,0 +1,275 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + /* + For each pool (i.e. event) in the channel, a list of R1 and R2 input + reads is provided which correspond to the lanes. If there are multiple lanes, + we can demultiplex into the wells for each lane in parallel. Therefore, cutadapt + must be started multiple times and we need an event per lane. The events are + created by taking the R1 and R2 pairs from the input lists. The index of the elements + in these lists are added to the ID in order to make them unique. + + The same pools may be present in multiple sequencing runs. Here, the events must be unique + across boths runs and samples. When called from the htrnaseq workflow; the events have the + format '{pool_id}/{run_id}' + */ + | flatMap {id, state -> + assert state.input_r1.size() == state.input_r2.size(), \ + "Expected equal number of inputs for R1 and R2" + if (state.input_r1.size() == 1) { + // special case where we do not want to adjust the ID to add an index. + // If we do add an index, the file paths will contain "_0", which + // will not be removed. For the scenarios where we do have multiple lanes, + // the files will be concatenated later and a new file path without the index + // is created at that point. + def newState = state + [ + "input_r1": state.input_r1[0], + "input_r2": state.input_r2[0], + "pool_and_run_id": id, + "n_lanes": 1, + "lane_sorting": 1, + ] + return [[id, newState]] + } + // Store the number of lanes that were encountered here in order to + // group them together in an asynchronous manner later by providing + // the expected number of events to be grouped to groupTuple. + // see https://www.nextflow.io/docs/latest/reference/operator.html#grouptuple + [state.input_r1, state.input_r2].transpose().withIndex().collect{ input_pair, index -> + def single_input_r1 = input_pair[0] + def single_input_r2 = input_pair[1] + def newState = state + ["input_r1": single_input_r1, + "input_r2": single_input_r2, + "pool_and_run_id": id, + "n_lanes": state.input_r1.size(), + "lane_sorting": index] + def newId = id + "_" + index + [newId, newState] + } + } + | cutadapt.run( + directives: [label: ["highmem", "midcpu"]], + fromState: { id, state -> + [ + input: state.input_r1, + input_r2: state.input_r2, + no_indels: true, + action: "none", + front_fasta: state.barcodesFasta, + output: "*_001.fastq", + error_rate: 0.10, + demultiplex_mode: "single", + output_r1: state.output_r1, + output_r2: state.output_r2, + ] + }, + toState: { id, result, state -> + def newState = [ + "pool_and_run_id": state.pool_and_run_id, + "n_lanes": state.n_lanes, + "output": result.output, + "lane_sorting": state.lane_sorting, + ] + return newState + } + ) + | flatMap{ id, state -> + // The output from cutadapt should be in the format {name}_R(1|2)_001.fastq + // See https://github.com/viash-hub/biobox/blob/952ff0843093b538cbfd6fefdecf2e7a0bc9e70b/src/cutadapt/script.sh#L226 + // Here, {name} is the name of the sequence in the barcode fasta: https://cutadapt.readthedocs.io/en/v5.0/guide.html#named-adapters + state.output.collect{ p -> + def path_as_string = p.name + // Check for correct output file name format + assert (path_as_string.endsWith("_R1_001.fastq") || path_as_string.endsWith("_R2_001.fastq")), \ + "Expected cutadapt output to contain files ending in '_R1_001.fastq' or _R1_001.fastq' only. Found: ${p}." + // Detect read orientation from file name + def pair_end = path_as_string.endsWith("_R1_001.fastq") ? "R1" : "R2" + // Use the start of the file + def barcode_id = p.name - ~/_R(1|2)_001\.fastq$/ + def new_id = state.pool_and_run_id + "__" + barcode_id + [ + new_id, + [ + "pool_and_run_id": state.pool_and_run_id, + "barcode_id": barcode_id, + "output": p, + "pair_end": pair_end, + "n_lanes": state.n_lanes, + "lane_sorting": state.lane_sorting, + ] + ] + } + } + /* + At this point, the events are provided on the smallest possible level, + as each event represents the reads for a certain orientation from a + particular lane and a single well. Here, we join these events back together + on well level, gathering FASTQS across the lanes and read orientations. + In order to make this joining as efficient as possible, the number of + lanes which are expected to be gathered were stored in the state earlier. + This way, the processing of a well can continue as as soon as all of + the lanes have been gathered. The number of lanes times 2 (forward + and reverse orientation) represents the total number of FASTQS (events) + to be included for a certain well. + */ + | map {id, state -> + def group_key = groupKey(id, state.n_lanes * 2) + return [group_key, state] + } + | groupTuple(by: 0, remainder: true, sort: {a, b -> + // Make sure that the grouped states are in order, + // meaning forward and reverse FASTQs are paired and the FASTQ + // for the forward reads comes before the reverse reads FASTQ. + if (a.lane_sorting == b.lane_sorting) { + return a.pair_end <=> b.pair_end + } + return a.lane_sorting <=> b.lane_sorting + }) + | map {group_key, states -> + // The states are in one long flat list, group them into pairs + // This assumes that the FASTQ files are already in order! + // (See the 'sort' argument of groupTuple above) + def output_pairs = states.collate(2) + + // Sanity check the state + output_pairs.each{ pair -> + assert pair.size() == 2, \ + "State error: expected FASTQ pairs as output from cutadapt, " + + "found output state: $pair" + def (first, second) = pair + def should_be_the_same = [ + "barcode_id", + "pool_and_run_id", + "lane_sorting", + ] + should_be_the_same.each { attr_to_check -> + first_attr = first.get(attr_to_check) + second_attr = second.get(attr_to_check) + assert first_attr == second_attr, \ + "State error: expected FASTQ pairs from cutadapt to have " + + "the same detected ${attr_to_check}. Found: " + + "$first_attr and $second_attr" + } + } + // Forward and reverse reads should be designated + // by 'R1' and 'R2', and sorted lexographically. + def r1_output = output_pairs.collect{ + def forward_output = it[0].output + assert forward_output.name.endsWith("R1_001.fastq"), \ + "State error: expected first item from FASTQ pair to have " + + "orientation 'R1', found ${forward_output.name}." + return it[0].output + } + def r2_output = output_pairs.collect{ + def forward_output = it[1].output + assert forward_output.name.endsWith("R2_001.fastq"), \ + "State error: expected first item from FASTQ pair to have " + + "orientation 'R2', found ${forward_output.name}." + return it[1].output + } + assert r1_output.size() == r2_output.size() + + /* The lane sorting represents the order of the FASTQ files + as provided by the input. The order of the FASTQ files should + remain the same in the well output. This is because the result of STAR + can differ based on the order of the reads in the FASTQ file. + Even when the same reads are provided, the order of them matters. + */ + def lane_sorting = output_pairs.it[0].lane_sorting + def sorting_is_monotonically_increasing = lane_sorting.withIndex().every { i, idx -> + idx == 0 || lane_sorting[idx - 1] <= i + } + assert sorting_is_monotonically_increasing, \ + "State error: expected the order of the FASTQ files after grouping " + + "the cutadapt output to be the same as the order in the input. " + + "Found sorting ${lane_sorting}, R1 output: ${r1_output}, R2 output: ${r2_output}." + + // Here we pick the state from the first item in the list of states + // and overwrite the keys which are different across states + def first_state = states[0] + // The id is the sequence name for the barcode (from the FASTA file). + def new_state = first_state + ["output_r1": r1_output, "output_r2": r2_output] + // group_key.target is an attribute from an object created with nextflow's groupKey() + // It is the Id by which the events were joined using groupTuple + return [group_key.target, new_state] + } + | view {"State after running cutadapt: $it"} + // TODO: Expand this into matching a whitelist/blacklist of barcodes + // ... and turn into separate component + + // This is contatenation of the FASTQ files from different lanes + // Concatenation of FASTQ files from the different runs is done later. + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_txt_r1", + runIf: {id, state -> state.output_r1.size() > 1}, + fromState: { id, state -> + [ + input: state.output_r1, + gzip_output: false, + output: "${state.barcode_id}_R1_001.fastq" + ] + }, + toState: { id, result, state -> + def newState = state + [ output_r1: [ result.output ] ] + return newState + } + ) + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_text_r2", + runIf: {id, state -> state.output_r2.size() > 1}, + fromState: { id, state -> + [ + input: state.output_r2, + gzip_output: false, + output: "${state.barcode_id}_R2_001.fastq", + ] + }, + toState: { id, result, state -> + def newState = state + [ output_r2: [ result.output ] ] + return newState + } + ) + // Group the concatenated files back on pool level + | map {id, state -> + def new_event = [state.pool_and_run_id, state] + return new_event + } + | groupTuple(by: 0, sort: {a, b -> a.barcode_id <=> b.barcode_id}) + | map {id, states -> + def output_r1 = states.collect{it.output_r1}.flatten() + def output_r2 = states.collect{it.output_r2}.flatten() + def pools = states.collect{it.pool_and_run_id} + assert pools.toSet().size() == 1, "Unexpected state: pool ID to be unique. Found: ${pools}." + def output_state = ["output_r1": output_r1, "output_r2": output_r2, "pool_and_run_id": pools[0]] + return [id, output_state] + } + // The concatenation of lanes happens in different work directories (each well is processed a different + // concat_text process). Here we make sure that the FASTQ files are gathered in a single directory. + // This could be skipped when no concatenation was done since cutadapt will output in a directory already. + // But since we are copying symlinks most of the time there is almost no performance penalty here. + | move_files_to_directory.run( + fromState: { id, state -> + [ + "input": state.output_r1 + state.output_r2, + // Remark: the fastq path part may seem superfluous but is necessary for publising later + "output": "fastq/${state.pool_and_run_id}/", + "keep_symbolic_links": true + ] + }, + toState: {id, result, state -> + def new_state = [ + "output_r1": state.output_r1.collect{result.output.resolve(it.name)}, + "output_r2": state.output_r2.collect{result.output.resolve(it.name)}, + ] + new_state + } + ) + + emit: + output_ch +} diff --git a/src/workflows/well_demultiplex/nextflow.config b/src/workflows/well_demultiplex/nextflow.config new file mode 100644 index 00000000..3643e2fe --- /dev/null +++ b/src/workflows/well_demultiplex/nextflow.config @@ -0,0 +1,11 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../").toAbsolutePath().normalize().toString() +} + + +// include common settings +includeConfig("${params.rootDir}/src/config/labels.config") \ No newline at end of file diff --git a/src/workflows/well_demultiplex/test.nf b/src/workflows/well_demultiplex/test.nf new file mode 100644 index 00000000..ff88f0a2 --- /dev/null +++ b/src/workflows/well_demultiplex/test.nf @@ -0,0 +1,105 @@ +include { well_demultiplex } from params.rootDir + "/target/nextflow/workflows/well_demultiplex/main.nf" +include { check_cutadapt_output } from params.rootDir + "/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/main.nf" + +params.resources_test = "gs://viash-hub-test-data/htrnaseq/v1/" + +workflow test_wf { + resources_test_file = file(params.resources_test) + output_ch = Channel.fromList([ + [ + id: "SRR14730301", + input_r1: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"), + input_r2: resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"), + barcodesFasta: resources_test_file.resolve("2-wells-with-ids.fasta"), + ], + [ + id: "SRR14730302", + input_r1: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"), + input_r2: resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"), + barcodesFasta: resources_test_file.resolve("2-wells-with-ids.fasta"), + ], + ]) + | map { state -> [ state.id, state ] } + | well_demultiplex.run( + fromState: { id, state -> + [ + input_r1: state.input_r1, + input_r2: state.input_r2, + barcodesFasta: state.barcodesFasta, + ] + }, + toState: { id, output, state -> + output } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output_r1.size() == 3: "Expected 3 forward fastq files: 2 wells and 1 unknown" + assert output[1].output_r2.size() == 3: "Expected 3 reverse fastq files: 2 wells and 1 unknown" + "Output: $output" + } + | toSortedList() + | view { output -> + assert output.size() == 2 : "Should have found two pools!" + } + | map {output -> + def ids = output.collect{it[0]} + def states = output.collect{it[1]} + def output_r1 = states.collect{it.output_r1}.flatten() + def output_r2 = states.collect{it.output_r2}.flatten() + def ids_pool_1 = states[0].output_r1.collect{ids[0] + "__" + (it.name - ~/_R(1|2)_001.fastq$/) } + def ids_pool_2 = states[1].output_r2.collect{ids[1] + "__" + (it.name - ~/_R(1|2)_001.fastq$/) } + + def new_state = [ + "ids": ids_pool_1 + ids_pool_2, + "fastq_r1": output_r1, + "fastq_r2": output_r2 + ] + ["integration_test_check", new_state] + } + | check_cutadapt_output.run( + fromState: {id, state -> state} + ) +} + + +workflow test_wf2 { + resources_test_file = file(params.resources_test) + output_ch = Channel.fromList([ + [ + id: "SRR14730301", + input_r1: + [ + resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R1_001.fastq"), + resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R1_001.fastq"), + ], + input_r2: + [ + resources_test_file.resolve("100k/SRR14730301/VH02001612_S9_R2_001.fastq"), + resources_test_file.resolve("100k/SRR14730302/VH02001614_S8_R2_001.fastq"), + ], + barcodesFasta: resources_test_file.resolve("2-wells-with-ids.fasta"), + ], + ]) + | map { state -> [ state.id, state ] } + | well_demultiplex.run( + fromState: { id, state -> + [ + input_r1: state.input_r1, + input_r2: state.input_r2, + barcodesFasta: state.barcodesFasta, + ] + }, + toState: { id, output, state -> + output } + ) + | view { output -> + assert output.size() == 2 : "outputs should contain two elements; [id, file]" + assert output[1].output_r1.size() == 3: "Expected 3 forward fastq files: 2 wells and 1 unknown" + assert output[1].output_r2.size() == 3: "Expected 3 reverse fastq files: 2 wells and 1 unknown" + "Output: $output" + } + | toSortedList() + | view { output -> + assert output.size() == 1 : "Should have found 1 pool" + } +} diff --git a/src/workflows/well_metadata/config.vsh.yaml b/src/workflows/well_metadata/config.vsh.yaml new file mode 100644 index 00000000..3be18d1b --- /dev/null +++ b/src/workflows/well_metadata/config.vsh.yaml @@ -0,0 +1,66 @@ +name: well_metadata +namespace: workflows +authors: + - __merge__: /src/base/authors/dries_schaumont.yaml + roles: [ maintainer ] +argument_groups: + - name: Input arguments + arguments: + - name: "--input_r1" + type: file + description: | + List of FASTQ files for the forward reads, 1 per well. + required: true + multiple: true + - name: "--input_r2" + type: file + description: | + List of FASTQ files for the forward reads, 1 per well. + multiple: true + required: true + - name: "--star_mapping" + type: file + multiple: true + required: true + - name: --barcodesFasta + type: file + required: true + - name: Output arguments + arguments: + - name: --output_r1 + description: Output fastq file. + type: file + direction: output + required: true + - name: "--output_r2" + description: Output fastq file. + type: file + direction: output + required: true + - name: "--pool" + type: string + description: The original pool / sample name + direction: output + - name: "--well_id" + type: string + direction: output + - name: "--barcode" + type: string + direction: output + - name: "--n_wells" + type: integer + direction: output + description: The number of wells in the pool is well is a part of. + - name: "--well_star_mapping" + type: file + direction: output +resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +runners: + - type: nextflow + +engines: + - type: native diff --git a/src/workflows/well_metadata/main.nf b/src/workflows/well_metadata/main.nf new file mode 100644 index 00000000..e2f1e16f --- /dev/null +++ b/src/workflows/well_metadata/main.nf @@ -0,0 +1,123 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + /* + Parse the fasta file containing the barcodes and do the following: + - The sequence headers must not contain any whitespaces + - The headers (Well IDs) must be unique + - The barcodes must be unique + - Store the number of barcodes in the state + - Add a barcode to well ID (header) mapping to the state, + in order to be able to retreive the well ID based on the FASTQ name after well demultiplexing + */ + | map {id, state -> + def n_wells = state.barcodesFasta.countFasta() as int + // The header is the full header, the id is the part header up to the first whitespace character + // We do not allow whitespace in the header of the fasta file, so assert this. + def fasta_entries = state.barcodesFasta.splitFasta( + record: ["id": true, "header": true, "seqString": true] + ) + assert fasta_entries.every{it.id == it.header}, \ + "The barcodes FASTA headers must not contain any whitespace!" + // Check if the fasta headers are unique + def fasta_ids = fasta_entries.collect{it.id} + assert fasta_ids.clone().unique() == fasta_ids, \ + "The barcodes FASTA entries must have a unique name!" + // Check if the sequences are unique + def fasta_sequences = fasta_entries.collect{it.seqString} + assert fasta_sequences.clone().unique() == fasta_sequences, \ + "The barcodes FASTA sequences must be unique!" + def well_id_matcher = /^([A-Za-z]+)0*([1-9]?[0-9]+)$/ + def entries_corrected_id = fasta_entries.collectEntries { it -> + def unformatted_id = it.header + def id_matched_to_format = unformatted_id =~ well_id_matcher + assert (id_matched_to_format && id_matched_to_format.getCount() == 1), \ + "The FASTA headers must match the coordinate system of a well plate (e.g. A01, B01, ... or AA1, AB1, ...). Found: ${unformatted_id}" + def id_letters = id_matched_to_format[0][1].toUpperCase() + def id_numbers = id_matched_to_format[0][2] + [it.seqString.replaceAll("[^ACGTacgt]", ""), "${id_letters}${id_numbers}".toString()] + } + def newState = state + [ + "n_wells": n_wells, + "barcode_well_id_mapping": entries_corrected_id, + ] + [id, newState] + } + | flatMap{ id, state -> + def new_events = state.star_mapping.collect{ star_output_dir -> + def pool = id + // Get the barcode from the STAR file. + // One STAR output contains the results for one + // well barcode. We can look for the barcode in + // the 'Solo.out/Gene/raw/barcode.tsv' file. + def barcodes_files = files("${star_output_dir}/Solo.out/Gene/raw/barcodes.tsv") + assert barcodes_files.size() == 1, \ + "Exactly one file should have matched the barcodes files (found: $barcodes_files)." + def barcode + barcodes_files.each{ it -> + assert it.countLines() == 1, + "Expected only one barcode in a single STAR output." + barcode = it.text.trim() + } + def well_id = state.barcode_well_id_mapping[barcode] + assert well_id, "Could not find Well ID in FASTA file for barcode ${barcode}." + def return_state = [ + "${pool}__${well_id}".toString(), + [ + "barcode": barcode, + "well_id": well_id, + "pool": pool, + "n_wells": state.n_wells, + "output_r1": state.input_r1, + "output_r2": state.input_r2, + "well_star_mapping": star_output_dir, + "_meta": ["join_id": pool] + ] + ] + } + return new_events + } + // Parse the file names to obtain metadata about the output + | map{ id, state -> + // Populate the new state + def fastq_files = [state.output_r1, state.output_r2].transpose().findResult{ fastq_pair -> + def (forward_fastq, reverse_fastq) = fastq_pair + def fastq_r1_name = forward_fastq.name + def fastq_r2_name = reverse_fastq.name + // Get the well ID, and also check if it matches between the forward and reverse FASTQ + def well_id = null + [fastq_r1_name, fastq_r2_name].each { file_name -> + def well_id_matcher = file_name =~ /^([A-Za-z0-9]*|unknown)_R?.*/ + assert well_id_matcher, \ + "Could not find Well ID in the name of FASTQ file ($file_name) output from cutadapt." + def current_well_id = well_id_matcher[0][1] + if (!well_id) { + well_id = current_well_id + } else { + assert well_id == current_well_id, + "Well ID for forward and reverse fastq file did not match! File names: ${fastq_r1_name} and ${fastq_r2_name}" + } + } + assert (well_id != null), \ + "No Well ID could be deduced from files ${fastq_r1_name} and ${fastq_r2_name}." + + if (well_id == "unknown" || well_id != state.well_id) { + return null + } + return fastq_pair + } + + def new_state = state + [ + "output_r1": fastq_files[0], + "output_r2": fastq_files[1] + ] + return [id, new_state] + } + | setState(["output_r1", "output_r2", "pool", "well_id", "n_wells", "barcode", "well_star_mapping", "_meta"]) + + emit: + output_ch +} \ No newline at end of file diff --git a/target/.build.yaml b/target/.build.yaml new file mode 100644 index 00000000..e69de29b diff --git a/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/.config.vsh.yaml b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/.config.vsh.yaml new file mode 100644 index 00000000..91567579 --- /dev/null +++ b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/.config.vsh.yaml @@ -0,0 +1,781 @@ +name: "cutadapt" +version: "v0.3.1" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +argument_groups: +- name: "Specify Adapters for R1" + arguments: + - type: "string" + name: "--adapter" + alternatives: + - "-a" + description: "Sequence of an adapter ligated to the 3' end (paired data:\nof the\ + \ first read). The adapter and subsequent bases are\ntrimmed. If a '$' character\ + \ is appended ('anchoring'), the\nadapter is only found if it is a suffix of\ + \ the read.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--front" + alternatives: + - "-g" + description: "Sequence of an adapter ligated to the 5' end (paired data:\nof the\ + \ first read). The adapter and any preceding bases\nare trimmed. Partial matches\ + \ at the 5' end are allowed. If\na '^' character is prepended ('anchoring'),\ + \ the adapter is\nonly found if it is a prefix of the read.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--anywhere" + alternatives: + - "-b" + description: "Sequence of an adapter that may be ligated to the 5' or 3'\nend\ + \ (paired data: of the first read). Both types of\nmatches as described under\ + \ -a and -g are allowed. If the\nfirst base of the read is part of the match,\ + \ the behavior\nis as with -g, otherwise as with -a. This option is mostly\n\ + for rescuing failed library preparations - do not use if\nyou know which end\ + \ your adapter was ligated to!\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Specify Adapters using Fasta files for R1" + arguments: + - type: "file" + name: "--adapter_fasta" + description: "Fasta file containing sequences of an adapter ligated to the 3'\ + \ end (paired data:\nof the first read). The adapter and subsequent bases are\n\ + trimmed. If a '$' character is appended ('anchoring'), the\nadapter is only\ + \ found if it is a suffix of the read.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--front_fasta" + description: "Fasta file containing sequences of an adapter ligated to the 5'\ + \ end (paired data:\nof the first read). The adapter and any preceding bases\n\ + are trimmed. Partial matches at the 5' end are allowed. If\na '^' character\ + \ is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of\ + \ the read.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--anywhere_fasta" + description: "Fasta file containing sequences of an adapter that may be ligated\ + \ to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches\ + \ as described under -a and -g are allowed. If the\nfirst base of the read is\ + \ part of the match, the behavior\nis as with -g, otherwise as with -a. This\ + \ option is mostly\nfor rescuing failed library preparations - do not use if\n\ + you know which end your adapter was ligated to!\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Specify Adapters for R2" + arguments: + - type: "string" + name: "--adapter_r2" + alternatives: + - "-A" + description: "Sequence of an adapter ligated to the 3' end (paired data:\nof the\ + \ first read). The adapter and subsequent bases are\ntrimmed. If a '$' character\ + \ is appended ('anchoring'), the\nadapter is only found if it is a suffix of\ + \ the read.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--front_r2" + alternatives: + - "-G" + description: "Sequence of an adapter ligated to the 5' end (paired data:\nof the\ + \ first read). The adapter and any preceding bases\nare trimmed. Partial matches\ + \ at the 5' end are allowed. If\na '^' character is prepended ('anchoring'),\ + \ the adapter is\nonly found if it is a prefix of the read.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--anywhere_r2" + alternatives: + - "-B" + description: "Sequence of an adapter that may be ligated to the 5' or 3'\nend\ + \ (paired data: of the first read). Both types of\nmatches as described under\ + \ -a and -g are allowed. If the\nfirst base of the read is part of the match,\ + \ the behavior\nis as with -g, otherwise as with -a. This option is mostly\n\ + for rescuing failed library preparations - do not use if\nyou know which end\ + \ your adapter was ligated to!\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Specify Adapters using Fasta files for R2" + arguments: + - type: "file" + name: "--adapter_r2_fasta" + description: "Fasta file containing sequences of an adapter ligated to the 3'\ + \ end (paired data:\nof the first read). The adapter and subsequent bases are\n\ + trimmed. If a '$' character is appended ('anchoring'), the\nadapter is only\ + \ found if it is a suffix of the read.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--front_r2_fasta" + description: "Fasta file containing sequences of an adapter ligated to the 5'\ + \ end (paired data:\nof the first read). The adapter and any preceding bases\n\ + are trimmed. Partial matches at the 5' end are allowed. If\na '^' character\ + \ is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of\ + \ the read.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--anywhere_r2_fasta" + description: "Fasta file containing sequences of an adapter that may be ligated\ + \ to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches\ + \ as described under -a and -g are allowed. If the\nfirst base of the read is\ + \ part of the match, the behavior\nis as with -g, otherwise as with -a. This\ + \ option is mostly\nfor rescuing failed library preparations - do not use if\n\ + you know which end your adapter was ligated to!\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Paired-end options" + arguments: + - type: "boolean_true" + name: "--pair_adapters" + description: "Treat adapters given with -a/-A etc. as pairs. Either both\nor none\ + \ are removed from each read pair.\n" + info: null + direction: "input" + - type: "string" + name: "--pair_filter" + description: "Which of the reads in a paired-end read have to match the\nfiltering\ + \ criterion in order for the pair to be filtered.\n" + info: null + required: false + choices: + - "any" + - "both" + - "first" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--interleaved" + description: "Read and/or write interleaved paired-end reads.\n" + info: null + direction: "input" +- name: "Input parameters" + arguments: + - type: "file" + name: "--input" + description: "Input fastq file for single-end reads or R1 for paired-end reads.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Input fastq file for R2 in the case of paired-end reads.\n" + info: null + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "double" + name: "--error_rate" + alternatives: + - "-E" + - "--errors" + description: "Maximum allowed error rate (if 0 <= E < 1), or absolute\nnumber\ + \ of errors for full-length adapter match (if E is an\ninteger >= 1). Error\ + \ rate = no. of errors divided by\nlength of matching region. Default: 0.1 (10%).\n" + info: null + example: + - 0.1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--no_indels" + description: "Allow only mismatches in alignments.\n" + info: null + direction: "input" + - type: "integer" + name: "--times" + alternatives: + - "-n" + description: "Remove up to COUNT adapters from each read. Default: 1.\n" + info: null + example: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--overlap" + alternatives: + - "-O" + description: "Require MINLENGTH overlap between read and adapter for an\nadapter\ + \ to be found. The default is 3.\n" + info: null + example: + - 3 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--match_read_wildcards" + description: "Interpret IUPAC wildcards in reads.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--no_match_adapter_wildcards" + description: "Do not interpret IUPAC wildcards in adapters.\n" + info: null + direction: "input" + - type: "string" + name: "--action" + description: "What to do if a match was found. trim: trim adapter and\nup- or\ + \ downstream sequence; retain: trim, but retain\nadapter; mask: replace with\ + \ 'N' characters; lowercase:\nconvert to lowercase; none: leave unchanged.\n\ + The default is trim.\n" + info: null + example: + - "trim" + required: false + choices: + - "trim" + - "retain" + - "mask" + - "lowercase" + - "none" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--revcomp" + alternatives: + - "--rc" + description: "Check both the read and its reverse complement for adapter\nmatches.\ + \ If match is on reverse-complemented version,\noutput that one.\n" + info: null + direction: "input" +- name: "Demultiplexing options" + arguments: + - type: "string" + name: "--demultiplex_mode" + description: "Enable demultiplexing and set the mode for it.\nWith mode 'unique_dual',\ + \ adapters from the first and second read are used,\nand the indexes from the\ + \ reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode\ + \ 'combinatorial_dual' allows all combinations of the sets of indexes\non R1\ + \ and R2. It is necessary to write each read pair to an output\nfile depending\ + \ on the adapters found on both R1 and R2.\nMode 'single', uses indexes or barcodes\ + \ located at the 5'\nend of the R1 read (single). \n" + info: null + required: false + choices: + - "single" + - "unique_dual" + - "combinatorial_dual" + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Read modifications" + arguments: + - type: "integer" + name: "--cut" + alternatives: + - "-u" + description: "Remove LEN bases from each read (or R1 if paired; use --cut_r2\n\ + option for R2). If LEN is positive, remove bases from the\nbeginning. If LEN\ + \ is negative, remove bases from the end.\nCan be used twice if LENs have different\ + \ signs. Applied\n*before* adapter trimming.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--cut_r2" + description: "Remove LEN bases from each read (for R2). If LEN is positive, remove\ + \ bases from the\nbeginning. If LEN is negative, remove bases from the end.\n\ + Can be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--nextseq_trim" + description: "NextSeq-specific quality trimming (each read). Trims also\ndark\ + \ cycles appearing as high-quality G bases.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quality_cutoff" + alternatives: + - "-q" + description: "Trim low-quality bases from 5' and/or 3' ends of each read\nbefore\ + \ adapter removal. Applied to both reads if data is\npaired. If one value is\ + \ given, only the 3' end is trimmed.\nIf two comma-separated cutoffs are given,\ + \ the 5' end is\ntrimmed with the first cutoff, the 3' end with the second.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--quality_cutoff_r2" + alternatives: + - "-Q" + description: "Quality-trimming cutoff for R2. Default: same as for R1\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--quality_base" + description: "Assume that quality values in FASTQ are encoded as\nascii(quality\ + \ + N). This needs to be set to 64 for some\nold Illumina FASTQ files. The default\ + \ is 33.\n" + info: null + example: + - 33 + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--poly_a" + description: "Trim poly-A tails" + info: null + direction: "input" + - type: "integer" + name: "--length" + alternatives: + - "-l" + description: "Shorten reads to LENGTH. Positive values remove bases at\nthe end\ + \ while negative ones remove bases at the beginning.\nThis and the following\ + \ modifications are applied after\nadapter trimming.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--trim_n" + description: "Trim N's on ends of reads." + info: null + direction: "input" + - type: "string" + name: "--length_tag" + description: "Search for TAG followed by a decimal number in the\ndescription\ + \ field of the read. Replace the decimal number\nwith the correct length of\ + \ the trimmed read. For example,\nuse --length-tag 'length=' to correct fields\ + \ like\n'length=123'.\n" + info: null + example: + - "length=" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--strip_suffix" + description: "Remove this suffix from read names if present. Can be\ngiven multiple\ + \ times.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--prefix" + alternatives: + - "-x" + description: "Add this prefix to read names. Use {name} to insert the\nname of\ + \ the matching adapter.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--suffix" + alternatives: + - "-y" + description: "Add this suffix to read names; can also include {name}\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--rename" + description: "Rename reads using TEMPLATE containing variables such as\n{id},\ + \ {adapter_name} etc. (see documentation)\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--zero_cap" + alternatives: + - "-z" + description: "Change negative quality values to zero." + info: null + direction: "input" +- name: "Filtering of processed reads" + description: "Filters are applied after above read modifications. Paired-end reads\ + \ are\nalways discarded pairwise (see also --pair_filter).\n" + arguments: + - type: "string" + name: "--minimum_length" + alternatives: + - "-m" + description: "Discard reads shorter than LEN. Default is 0.\nWhen trimming paired-end\ + \ reads, the minimum lengths for R1 and R2 can be specified separately by separating\ + \ them with a colon (:).\nIf the colon syntax is not used, the same minimum\ + \ length applies to both reads, as discussed above.\nAlso, one of the values\ + \ can be omitted to impose no restrictions.\nFor example, with -m 17:, the length\ + \ of R1 must be at least 17, but the length of R2 is ignored.\n" + info: null + example: + - "0" + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--maximum_length" + alternatives: + - "-M" + description: "Discard reads longer than LEN. Default: no limit.\nFor paired reads,\ + \ see the remark for --minimum_length\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--max_n" + description: "Discard reads with more than COUNT 'N' bases. If COUNT is\na number\ + \ between 0 and 1, it is interpreted as a fraction\nof the read length.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--max_expected_errors" + alternatives: + - "--max_ee" + description: "Discard reads whose expected number of errors (computed\nfrom quality\ + \ values) exceeds ERRORS.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "long" + name: "--max_average_error_rate" + alternatives: + - "--max_aer" + description: "as --max_expected_errors (see above), but divided by\nlength to\ + \ account for reads of varying length.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--discard_trimmed" + alternatives: + - "--discard" + description: "Discard reads that contain an adapter. Use also -O to\navoid discarding\ + \ too many randomly matching reads.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--discard_untrimmed" + alternatives: + - "--trimmed_only" + description: "Discard reads that do not contain an adapter.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--discard_casava" + description: "Discard reads that did not pass CASAVA filtering (header\nhas :Y:).\n" + info: null + direction: "input" +- name: "Output parameters" + arguments: + - type: "string" + name: "--report" + description: "Which type of report to print: 'full' (default) or 'minimal'.\n" + info: null + example: + - "full" + required: false + choices: + - "full" + - "minimal" + direction: "input" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--json" + description: "Write report in JSON format to this file.\n" + info: null + direction: "input" + - type: "file" + name: "--output" + description: "Glob pattern for matching the expected output files.\nShould include\ + \ `$output_dir`.\n" + info: null + example: + - "fastq/*_001.fast[a,q]" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "boolean_true" + name: "--fasta" + description: "Output FASTA to standard output even on FASTQ input.\n" + info: null + direction: "input" + - type: "boolean_true" + name: "--info_file" + description: "Write information about each read and its adapter matches\ninto\ + \ info.txt in the output directory.\nSee the documentation for the file format.\n" + info: null + direction: "input" +- name: "Debug" + arguments: + - type: "boolean_true" + name: "--debug" + description: "Print debug information" + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +description: "Cutadapt removes adapter sequences from high-throughput sequencing reads.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +keywords: +- "RNA-seq" +- "scRNA-seq" +- "high-throughput" +license: "MIT" +references: + doi: + - "10.14806/ej.17.1.200" +links: + repository: "https://github.com/marcelm/cutadapt" + homepage: "https://cutadapt.readthedocs.io" + documentation: "https://cutadapt.readthedocs.io" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12" + target_registry: "images.viash-hub.com" + target_tag: "v0.3.1" + namespace_separator: "/" + setup: + - type: "python" + user: false + pip: + - "cutadapt" + upgrade: true + - type: "docker" + run: + - "cutadapt --version | sed 's/\\(.*\\)/cutadapt: \"\\1\"/' > /var/software_versions.txt\n" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/cutadapt/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/cutadapt" + executable: "target/nextflow/cutadapt/main.nf" + viash_version: "0.9.4" + git_commit: "98a5f3cc745525a65c10263d25cf414eb1093223" + git_remote: "https://github.com/viash-hub/biobox" + git_tag: "v0.3.0-8-g98a5f3c" +package_config: + name: "biobox" + version: "v0.3.1" + summary: "A curated collection of high-quality, standalone bioinformatics components\ + \ built with [Viash](https://viash.io).\n" + description: "`biobox` offers a suite of reliable bioinformatics components, similar\ + \ to [nf-core/modules](https://github.com/nf-core/modules) and [snakemake-wrappers/bio](https://github.com/snakemake/snakemake-wrappers/tree/master/bio),\ + \ but built using the [Viash](https://viash.io) framework.\n\nThis approach emphasizes\ + \ **reusability**, **reproducibility**, and adherence to **best practices**. Key\ + \ features of `biobox` components include:\n\n* **Standalone & Nextflow Ready:**\ + \ Run components directly via the command line or seamlessly integrate them into\ + \ Nextflow workflows.\n* **High Quality Standards:**\n * Comprehensive documentation\ + \ for components and parameters.\n * Full exposure of underlying tool arguments.\n\ + \ * Containerized (Docker) for dependency management and reproducibility.\n\ + \ * Unit tested for verified functionality.\n" + info: null + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'v0.3.1'" + keywords: + - "bioinformatics" + - "modules" + - "sequencing" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/biobox" + issue_tracker: "https://github.com/viash-hub/biobox/issues" diff --git a/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/main.nf b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/main.nf new file mode 100644 index 00000000..949e4972 --- /dev/null +++ b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/main.nf @@ -0,0 +1,4721 @@ +// cutadapt v0.3.1 +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Toni Verbeiren (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "cutadapt", + "version" : "v0.3.1", + "authors" : [ + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Specify Adapters for R1", + "arguments" : [ + { + "type" : "string", + "name" : "--adapter", + "alternatives" : [ + "-a" + ], + "description" : "Sequence of an adapter ligated to the 3' end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a '$' character is appended ('anchoring'), the\nadapter is only found if it is a suffix of the read.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--front", + "alternatives" : [ + "-g" + ], + "description" : "Sequence of an adapter ligated to the 5' end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5' end are allowed. If\na '^' character is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of the read.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--anywhere", + "alternatives" : [ + "-b" + ], + "description" : "Sequence of an adapter that may be ligated to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Specify Adapters using Fasta files for R1", + "arguments" : [ + { + "type" : "file", + "name" : "--adapter_fasta", + "description" : "Fasta file containing sequences of an adapter ligated to the 3' end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a '$' character is appended ('anchoring'), the\nadapter is only found if it is a suffix of the read.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--front_fasta", + "description" : "Fasta file containing sequences of an adapter ligated to the 5' end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5' end are allowed. If\na '^' character is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of the read.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--anywhere_fasta", + "description" : "Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Specify Adapters for R2", + "arguments" : [ + { + "type" : "string", + "name" : "--adapter_r2", + "alternatives" : [ + "-A" + ], + "description" : "Sequence of an adapter ligated to the 3' end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a '$' character is appended ('anchoring'), the\nadapter is only found if it is a suffix of the read.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--front_r2", + "alternatives" : [ + "-G" + ], + "description" : "Sequence of an adapter ligated to the 5' end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5' end are allowed. If\na '^' character is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of the read.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--anywhere_r2", + "alternatives" : [ + "-B" + ], + "description" : "Sequence of an adapter that may be ligated to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Specify Adapters using Fasta files for R2", + "arguments" : [ + { + "type" : "file", + "name" : "--adapter_r2_fasta", + "description" : "Fasta file containing sequences of an adapter ligated to the 3' end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a '$' character is appended ('anchoring'), the\nadapter is only found if it is a suffix of the read.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--front_r2_fasta", + "description" : "Fasta file containing sequences of an adapter ligated to the 5' end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5' end are allowed. If\na '^' character is prepended ('anchoring'), the adapter is\nonly found if it is a prefix of the read.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--anywhere_r2_fasta", + "description" : "Fasta file containing sequences of an adapter that may be ligated to the 5' or 3'\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Paired-end options", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--pair_adapters", + "description" : "Treat adapters given with -a/-A etc. as pairs. Either both\nor none are removed from each read pair.\n", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--pair_filter", + "description" : "Which of the reads in a paired-end read have to match the\nfiltering criterion in order for the pair to be filtered.\n", + "required" : false, + "choices" : [ + "any", + "both", + "first" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--interleaved", + "description" : "Read and/or write interleaved paired-end reads.\n", + "direction" : "input" + } + ] + }, + { + "name" : "Input parameters", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Input fastq file for single-end reads or R1 for paired-end reads.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "Input fastq file for R2 in the case of paired-end reads.\n", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "double", + "name" : "--error_rate", + "alternatives" : [ + "-E", + "--errors" + ], + "description" : "Maximum allowed error rate (if 0 <= E < 1), or absolute\nnumber of errors for full-length adapter match (if E is an\ninteger >= 1). Error rate = no. of errors divided by\nlength of matching region. Default: 0.1 (10%).\n", + "example" : [ + 0.1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--no_indels", + "description" : "Allow only mismatches in alignments.\n", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--times", + "alternatives" : [ + "-n" + ], + "description" : "Remove up to COUNT adapters from each read. Default: 1.\n", + "example" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--overlap", + "alternatives" : [ + "-O" + ], + "description" : "Require MINLENGTH overlap between read and adapter for an\nadapter to be found. The default is 3.\n", + "example" : [ + 3 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--match_read_wildcards", + "description" : "Interpret IUPAC wildcards in reads.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--no_match_adapter_wildcards", + "description" : "Do not interpret IUPAC wildcards in adapters.\n", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--action", + "description" : "What to do if a match was found. trim: trim adapter and\nup- or downstream sequence; retain: trim, but retain\nadapter; mask: replace with 'N' characters; lowercase:\nconvert to lowercase; none: leave unchanged.\nThe default is trim.\n", + "example" : [ + "trim" + ], + "required" : false, + "choices" : [ + "trim", + "retain", + "mask", + "lowercase", + "none" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--revcomp", + "alternatives" : [ + "--rc" + ], + "description" : "Check both the read and its reverse complement for adapter\nmatches. If match is on reverse-complemented version,\noutput that one.\n", + "direction" : "input" + } + ] + }, + { + "name" : "Demultiplexing options", + "arguments" : [ + { + "type" : "string", + "name" : "--demultiplex_mode", + "description" : "Enable demultiplexing and set the mode for it.\nWith mode 'unique_dual', adapters from the first and second read are used,\nand the indexes from the reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode 'combinatorial_dual' allows all combinations of the sets of indexes\non R1 and R2. It is necessary to write each read pair to an output\nfile depending on the adapters found on both R1 and R2.\nMode 'single', uses indexes or barcodes located at the 5'\nend of the R1 read (single). \n", + "required" : false, + "choices" : [ + "single", + "unique_dual", + "combinatorial_dual" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Read modifications", + "arguments" : [ + { + "type" : "integer", + "name" : "--cut", + "alternatives" : [ + "-u" + ], + "description" : "Remove LEN bases from each read (or R1 if paired; use --cut_r2\noption for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--cut_r2", + "description" : "Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--nextseq_trim", + "description" : "NextSeq-specific quality trimming (each read). Trims also\ndark cycles appearing as high-quality G bases.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--quality_cutoff", + "alternatives" : [ + "-q" + ], + "description" : "Trim low-quality bases from 5' and/or 3' ends of each read\nbefore adapter removal. Applied to both reads if data is\npaired. If one value is given, only the 3' end is trimmed.\nIf two comma-separated cutoffs are given, the 5' end is\ntrimmed with the first cutoff, the 3' end with the second.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--quality_cutoff_r2", + "alternatives" : [ + "-Q" + ], + "description" : "Quality-trimming cutoff for R2. Default: same as for R1\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--quality_base", + "description" : "Assume that quality values in FASTQ are encoded as\nascii(quality + N). This needs to be set to 64 for some\nold Illumina FASTQ files. The default is 33.\n", + "example" : [ + 33 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--poly_a", + "description" : "Trim poly-A tails", + "direction" : "input" + }, + { + "type" : "integer", + "name" : "--length", + "alternatives" : [ + "-l" + ], + "description" : "Shorten reads to LENGTH. Positive values remove bases at\nthe end while negative ones remove bases at the beginning.\nThis and the following modifications are applied after\nadapter trimming.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--trim_n", + "description" : "Trim N's on ends of reads.", + "direction" : "input" + }, + { + "type" : "string", + "name" : "--length_tag", + "description" : "Search for TAG followed by a decimal number in the\ndescription field of the read. Replace the decimal number\nwith the correct length of the trimmed read. For example,\nuse --length-tag 'length=' to correct fields like\n'length=123'.\n", + "example" : [ + "length=" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--strip_suffix", + "description" : "Remove this suffix from read names if present. Can be\ngiven multiple times.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--prefix", + "alternatives" : [ + "-x" + ], + "description" : "Add this prefix to read names. Use {name} to insert the\nname of the matching adapter.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--suffix", + "alternatives" : [ + "-y" + ], + "description" : "Add this suffix to read names; can also include {name}\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--rename", + "description" : "Rename reads using TEMPLATE containing variables such as\n{id}, {adapter_name} etc. (see documentation)\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--zero_cap", + "alternatives" : [ + "-z" + ], + "description" : "Change negative quality values to zero.", + "direction" : "input" + } + ] + }, + { + "name" : "Filtering of processed reads", + "description" : "Filters are applied after above read modifications. Paired-end reads are\nalways discarded pairwise (see also --pair_filter).\n", + "arguments" : [ + { + "type" : "string", + "name" : "--minimum_length", + "alternatives" : [ + "-m" + ], + "description" : "Discard reads shorter than LEN. Default is 0.\nWhen trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).\nIf the colon syntax is not used, the same minimum length applies to both reads, as discussed above.\nAlso, one of the values can be omitted to impose no restrictions.\nFor example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.\n", + "example" : [ + "0" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--maximum_length", + "alternatives" : [ + "-M" + ], + "description" : "Discard reads longer than LEN. Default: no limit.\nFor paired reads, see the remark for --minimum_length\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--max_n", + "description" : "Discard reads with more than COUNT 'N' bases. If COUNT is\na number between 0 and 1, it is interpreted as a fraction\nof the read length.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--max_expected_errors", + "alternatives" : [ + "--max_ee" + ], + "description" : "Discard reads whose expected number of errors (computed\nfrom quality values) exceeds ERRORS.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "long", + "name" : "--max_average_error_rate", + "alternatives" : [ + "--max_aer" + ], + "description" : "as --max_expected_errors (see above), but divided by\nlength to account for reads of varying length.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--discard_trimmed", + "alternatives" : [ + "--discard" + ], + "description" : "Discard reads that contain an adapter. Use also -O to\navoid discarding too many randomly matching reads.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--discard_untrimmed", + "alternatives" : [ + "--trimmed_only" + ], + "description" : "Discard reads that do not contain an adapter.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--discard_casava", + "description" : "Discard reads that did not pass CASAVA filtering (header\nhas :Y:).\n", + "direction" : "input" + } + ] + }, + { + "name" : "Output parameters", + "arguments" : [ + { + "type" : "string", + "name" : "--report", + "description" : "Which type of report to print: 'full' (default) or 'minimal'.\n", + "example" : [ + "full" + ], + "required" : false, + "choices" : [ + "full", + "minimal" + ], + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--json", + "description" : "Write report in JSON format to this file.\n", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Glob pattern for matching the expected output files.\nShould include `$output_dir`.\n", + "example" : [ + "fastq/*_001.fast[a,q]" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--fasta", + "description" : "Output FASTA to standard output even on FASTQ input.\n", + "direction" : "input" + }, + { + "type" : "boolean_true", + "name" : "--info_file", + "description" : "Write information about each read and its adapter matches\ninto info.txt in the output directory.\nSee the documentation for the file format.\n", + "direction" : "input" + } + ] + }, + { + "name" : "Debug", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--debug", + "description" : "Print debug information", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + } + ], + "description" : "Cutadapt removes adapter sequences from high-throughput sequencing reads.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "keywords" : [ + "RNA-seq", + "scRNA-seq", + "high-throughput" + ], + "license" : "MIT", + "references" : { + "doi" : [ + "10.14806/ej.17.1.200" + ] + }, + "links" : { + "repository" : "https://github.com/marcelm/cutadapt", + "homepage" : "https://cutadapt.readthedocs.io", + "documentation" : "https://cutadapt.readthedocs.io" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12", + "target_registry" : "images.viash-hub.com", + "target_tag" : "v0.3.1", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "python", + "user" : false, + "pip" : [ + "cutadapt" + ], + "upgrade" : true + }, + { + "type" : "docker", + "run" : [ + "cutadapt --version | sed 's/\\\\(.*\\\\)/cutadapt: \\"\\\\1\\"/' > /var/software_versions.txt\n" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/cutadapt/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/cutadapt", + "viash_version" : "0.9.4", + "git_commit" : "98a5f3cc745525a65c10263d25cf414eb1093223", + "git_remote" : "https://github.com/viash-hub/biobox", + "git_tag" : "v0.3.0-8-g98a5f3c" + }, + "package_config" : { + "name" : "biobox", + "version" : "v0.3.1", + "summary" : "A curated collection of high-quality, standalone bioinformatics components built with [Viash](https://viash.io).\n", + "description" : "`biobox` offers a suite of reliable bioinformatics components, similar to [nf-core/modules](https://github.com/nf-core/modules) and [snakemake-wrappers/bio](https://github.com/snakemake/snakemake-wrappers/tree/master/bio), but built using the [Viash](https://viash.io) framework.\n\nThis approach emphasizes **reusability**, **reproducibility**, and adherence to **best practices**. Key features of `biobox` components include:\n\n* **Standalone & Nextflow Ready:** Run components directly via the command line or seamlessly integrate them into Nextflow workflows.\n* **High Quality Standards:**\n * Comprehensive documentation for components and parameters.\n * Full exposure of underlying tool arguments.\n * Containerized (Docker) for dependency management and reproducibility.\n * Unit tested for verified functionality.\n", + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'v0.3.1'" + ], + "keywords" : [ + "bioinformatics", + "modules", + "sequencing" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/biobox", + "issue_tracker" : "https://github.com/viash-hub/biobox/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_ADAPTER+x} ]; then echo "${VIASH_PAR_ADAPTER}" | sed "s#'#'\\"'\\"'#g;s#.*#par_adapter='&'#" ; else echo "# par_adapter="; fi ) +$( if [ ! -z ${VIASH_PAR_FRONT+x} ]; then echo "${VIASH_PAR_FRONT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_front='&'#" ; else echo "# par_front="; fi ) +$( if [ ! -z ${VIASH_PAR_ANYWHERE+x} ]; then echo "${VIASH_PAR_ANYWHERE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_anywhere='&'#" ; else echo "# par_anywhere="; fi ) +$( if [ ! -z ${VIASH_PAR_ADAPTER_FASTA+x} ]; then echo "${VIASH_PAR_ADAPTER_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_adapter_fasta='&'#" ; else echo "# par_adapter_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_FRONT_FASTA+x} ]; then echo "${VIASH_PAR_FRONT_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_front_fasta='&'#" ; else echo "# par_front_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_ANYWHERE_FASTA+x} ]; then echo "${VIASH_PAR_ANYWHERE_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_anywhere_fasta='&'#" ; else echo "# par_anywhere_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_ADAPTER_R2+x} ]; then echo "${VIASH_PAR_ADAPTER_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_adapter_r2='&'#" ; else echo "# par_adapter_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_FRONT_R2+x} ]; then echo "${VIASH_PAR_FRONT_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_front_r2='&'#" ; else echo "# par_front_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_ANYWHERE_R2+x} ]; then echo "${VIASH_PAR_ANYWHERE_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_anywhere_r2='&'#" ; else echo "# par_anywhere_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_ADAPTER_R2_FASTA+x} ]; then echo "${VIASH_PAR_ADAPTER_R2_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_adapter_r2_fasta='&'#" ; else echo "# par_adapter_r2_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_FRONT_R2_FASTA+x} ]; then echo "${VIASH_PAR_FRONT_R2_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_front_r2_fasta='&'#" ; else echo "# par_front_r2_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_ANYWHERE_R2_FASTA+x} ]; then echo "${VIASH_PAR_ANYWHERE_R2_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_anywhere_r2_fasta='&'#" ; else echo "# par_anywhere_r2_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_PAIR_ADAPTERS+x} ]; then echo "${VIASH_PAR_PAIR_ADAPTERS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_pair_adapters='&'#" ; else echo "# par_pair_adapters="; fi ) +$( if [ ! -z ${VIASH_PAR_PAIR_FILTER+x} ]; then echo "${VIASH_PAR_PAIR_FILTER}" | sed "s#'#'\\"'\\"'#g;s#.*#par_pair_filter='&'#" ; else echo "# par_pair_filter="; fi ) +$( if [ ! -z ${VIASH_PAR_INTERLEAVED+x} ]; then echo "${VIASH_PAR_INTERLEAVED}" | sed "s#'#'\\"'\\"'#g;s#.*#par_interleaved='&'#" ; else echo "# par_interleaved="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "${VIASH_PAR_INPUT_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input_r2='&'#" ; else echo "# par_input_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_ERROR_RATE+x} ]; then echo "${VIASH_PAR_ERROR_RATE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_error_rate='&'#" ; else echo "# par_error_rate="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_INDELS+x} ]; then echo "${VIASH_PAR_NO_INDELS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_indels='&'#" ; else echo "# par_no_indels="; fi ) +$( if [ ! -z ${VIASH_PAR_TIMES+x} ]; then echo "${VIASH_PAR_TIMES}" | sed "s#'#'\\"'\\"'#g;s#.*#par_times='&'#" ; else echo "# par_times="; fi ) +$( if [ ! -z ${VIASH_PAR_OVERLAP+x} ]; then echo "${VIASH_PAR_OVERLAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_overlap='&'#" ; else echo "# par_overlap="; fi ) +$( if [ ! -z ${VIASH_PAR_MATCH_READ_WILDCARDS+x} ]; then echo "${VIASH_PAR_MATCH_READ_WILDCARDS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_match_read_wildcards='&'#" ; else echo "# par_match_read_wildcards="; fi ) +$( if [ ! -z ${VIASH_PAR_NO_MATCH_ADAPTER_WILDCARDS+x} ]; then echo "${VIASH_PAR_NO_MATCH_ADAPTER_WILDCARDS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_no_match_adapter_wildcards='&'#" ; else echo "# par_no_match_adapter_wildcards="; fi ) +$( if [ ! -z ${VIASH_PAR_ACTION+x} ]; then echo "${VIASH_PAR_ACTION}" | sed "s#'#'\\"'\\"'#g;s#.*#par_action='&'#" ; else echo "# par_action="; fi ) +$( if [ ! -z ${VIASH_PAR_REVCOMP+x} ]; then echo "${VIASH_PAR_REVCOMP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_revcomp='&'#" ; else echo "# par_revcomp="; fi ) +$( if [ ! -z ${VIASH_PAR_DEMULTIPLEX_MODE+x} ]; then echo "${VIASH_PAR_DEMULTIPLEX_MODE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_demultiplex_mode='&'#" ; else echo "# par_demultiplex_mode="; fi ) +$( if [ ! -z ${VIASH_PAR_CUT+x} ]; then echo "${VIASH_PAR_CUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cut='&'#" ; else echo "# par_cut="; fi ) +$( if [ ! -z ${VIASH_PAR_CUT_R2+x} ]; then echo "${VIASH_PAR_CUT_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_cut_r2='&'#" ; else echo "# par_cut_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_NEXTSEQ_TRIM+x} ]; then echo "${VIASH_PAR_NEXTSEQ_TRIM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_nextseq_trim='&'#" ; else echo "# par_nextseq_trim="; fi ) +$( if [ ! -z ${VIASH_PAR_QUALITY_CUTOFF+x} ]; then echo "${VIASH_PAR_QUALITY_CUTOFF}" | sed "s#'#'\\"'\\"'#g;s#.*#par_quality_cutoff='&'#" ; else echo "# par_quality_cutoff="; fi ) +$( if [ ! -z ${VIASH_PAR_QUALITY_CUTOFF_R2+x} ]; then echo "${VIASH_PAR_QUALITY_CUTOFF_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_quality_cutoff_r2='&'#" ; else echo "# par_quality_cutoff_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_QUALITY_BASE+x} ]; then echo "${VIASH_PAR_QUALITY_BASE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_quality_base='&'#" ; else echo "# par_quality_base="; fi ) +$( if [ ! -z ${VIASH_PAR_POLY_A+x} ]; then echo "${VIASH_PAR_POLY_A}" | sed "s#'#'\\"'\\"'#g;s#.*#par_poly_a='&'#" ; else echo "# par_poly_a="; fi ) +$( if [ ! -z ${VIASH_PAR_LENGTH+x} ]; then echo "${VIASH_PAR_LENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_length='&'#" ; else echo "# par_length="; fi ) +$( if [ ! -z ${VIASH_PAR_TRIM_N+x} ]; then echo "${VIASH_PAR_TRIM_N}" | sed "s#'#'\\"'\\"'#g;s#.*#par_trim_n='&'#" ; else echo "# par_trim_n="; fi ) +$( if [ ! -z ${VIASH_PAR_LENGTH_TAG+x} ]; then echo "${VIASH_PAR_LENGTH_TAG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_length_tag='&'#" ; else echo "# par_length_tag="; fi ) +$( if [ ! -z ${VIASH_PAR_STRIP_SUFFIX+x} ]; then echo "${VIASH_PAR_STRIP_SUFFIX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_strip_suffix='&'#" ; else echo "# par_strip_suffix="; fi ) +$( if [ ! -z ${VIASH_PAR_PREFIX+x} ]; then echo "${VIASH_PAR_PREFIX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_prefix='&'#" ; else echo "# par_prefix="; fi ) +$( if [ ! -z ${VIASH_PAR_SUFFIX+x} ]; then echo "${VIASH_PAR_SUFFIX}" | sed "s#'#'\\"'\\"'#g;s#.*#par_suffix='&'#" ; else echo "# par_suffix="; fi ) +$( if [ ! -z ${VIASH_PAR_RENAME+x} ]; then echo "${VIASH_PAR_RENAME}" | sed "s#'#'\\"'\\"'#g;s#.*#par_rename='&'#" ; else echo "# par_rename="; fi ) +$( if [ ! -z ${VIASH_PAR_ZERO_CAP+x} ]; then echo "${VIASH_PAR_ZERO_CAP}" | sed "s#'#'\\"'\\"'#g;s#.*#par_zero_cap='&'#" ; else echo "# par_zero_cap="; fi ) +$( if [ ! -z ${VIASH_PAR_MINIMUM_LENGTH+x} ]; then echo "${VIASH_PAR_MINIMUM_LENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_minimum_length='&'#" ; else echo "# par_minimum_length="; fi ) +$( if [ ! -z ${VIASH_PAR_MAXIMUM_LENGTH+x} ]; then echo "${VIASH_PAR_MAXIMUM_LENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_maximum_length='&'#" ; else echo "# par_maximum_length="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_N+x} ]; then echo "${VIASH_PAR_MAX_N}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_n='&'#" ; else echo "# par_max_n="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_EXPECTED_ERRORS+x} ]; then echo "${VIASH_PAR_MAX_EXPECTED_ERRORS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_expected_errors='&'#" ; else echo "# par_max_expected_errors="; fi ) +$( if [ ! -z ${VIASH_PAR_MAX_AVERAGE_ERROR_RATE+x} ]; then echo "${VIASH_PAR_MAX_AVERAGE_ERROR_RATE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_max_average_error_rate='&'#" ; else echo "# par_max_average_error_rate="; fi ) +$( if [ ! -z ${VIASH_PAR_DISCARD_TRIMMED+x} ]; then echo "${VIASH_PAR_DISCARD_TRIMMED}" | sed "s#'#'\\"'\\"'#g;s#.*#par_discard_trimmed='&'#" ; else echo "# par_discard_trimmed="; fi ) +$( if [ ! -z ${VIASH_PAR_DISCARD_UNTRIMMED+x} ]; then echo "${VIASH_PAR_DISCARD_UNTRIMMED}" | sed "s#'#'\\"'\\"'#g;s#.*#par_discard_untrimmed='&'#" ; else echo "# par_discard_untrimmed="; fi ) +$( if [ ! -z ${VIASH_PAR_DISCARD_CASAVA+x} ]; then echo "${VIASH_PAR_DISCARD_CASAVA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_discard_casava='&'#" ; else echo "# par_discard_casava="; fi ) +$( if [ ! -z ${VIASH_PAR_REPORT+x} ]; then echo "${VIASH_PAR_REPORT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_report='&'#" ; else echo "# par_report="; fi ) +$( if [ ! -z ${VIASH_PAR_JSON+x} ]; then echo "${VIASH_PAR_JSON}" | sed "s#'#'\\"'\\"'#g;s#.*#par_json='&'#" ; else echo "# par_json="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_FASTA+x} ]; then echo "${VIASH_PAR_FASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_fasta='&'#" ; else echo "# par_fasta="; fi ) +$( if [ ! -z ${VIASH_PAR_INFO_FILE+x} ]; then echo "${VIASH_PAR_INFO_FILE}" | sed "s#'#'\\"'\\"'#g;s#.*#par_info_file='&'#" ; else echo "# par_info_file="; fi ) +$( if [ ! -z ${VIASH_PAR_DEBUG+x} ]; then echo "${VIASH_PAR_DEBUG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_debug='&'#" ; else echo "# par_debug="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +function debug { + [[ "\\$par_debug" == "true" ]] && echo "DEBUG: \\$@" +} + +output_dir=\\$(dirname \\$par_output) +[[ ! -d \\$output_dir ]] && mkdir -p \\$output_dir + +# Init +########################################################### + +echo ">> Paired-end data or not?" + +mode="" +if [[ -z \\$par_input_r2 ]]; then + mode="se" + echo " Single end" + input="\\$par_input" +else + echo " Paired end" + mode="pe" + input="\\$par_input \\$par_input_r2" +fi + +# Adapter arguments +# - paired and single-end +# - string and fasta +########################################################### + +function add_flags { + local arg=\\$1 + local flag=\\$2 + local prefix=\\$3 + [[ -z \\$prefix ]] && prefix="" + + # This function should not be called if the input is empty + # but check for it just in case + if [[ -z \\$arg ]]; then + return + fi + + local output="" + IFS=';' read -r -a array <<< "\\$arg" + for a in "\\${array[@]}"; do + output="\\$output \\$flag \\$prefix\\$a" + done + echo \\$output +} + +debug ">> Parsing arguments dealing with adapters" +adapter_args=\\$(echo \\\\ + \\${par_adapter:+\\$(add_flags "\\$par_adapter" "--adapter")} \\\\ + \\${par_adapter_fasta:+\\$(add_flags "\\$par_adapter_fasta" "--adapter" "file:")} \\\\ + \\${par_front:+\\$(add_flags "\\$par_front" "--front")} \\\\ + \\${par_front_fasta:+\\$(add_flags "\\$par_front_fasta" "--front" "file:")} \\\\ + \\${par_anywhere:+\\$(add_flags "\\$par_anywhere" "--anywhere")} \\\\ + \\${par_anywhere_fasta:+\\$(add_flags "\\$par_anywhere_fasta" "--anywhere" "file:")} \\\\ + \\${par_adapter_r2:+\\$(add_flags "\\$par_adapter_r2" "-A")} \\\\ + \\${par_adapter_fasta_r2:+\\$(add_flags "\\$par_adapter_fasta_r2" "-A" "file:")} \\\\ + \\${par_front_r2:+\\$(add_flags "\\$par_front_r2" "-G")} \\\\ + \\${par_front_fasta_r2:+\\$(add_flags "\\$par_front_fasta_r2" "-G" "file:")} \\\\ + \\${par_anywhere_r2:+\\$(add_flags "\\$par_anywhere_r2" "-B")} \\\\ + \\${par_anywhere_fasta_r2:+\\$(add_flags "\\$par_anywhere_fasta_r2" "-B" "file:")} \\\\ +) + +debug "Arguments to cutadapt:" +debug "\\$adapter_args" +debug + +# Paired-end options +########################################################### +echo ">> Parsing arguments for paired-end reads" +[[ "\\$par_pair_adapters" == "false" ]] && unset par_pair_adapters +[[ "\\$par_interleaved" == "false" ]] && unset par_interleaved + +paired_args=\\$(echo \\\\ + \\${par_pair_adapters:+--pair-adapters} \\\\ + \\${par_pair_filter:+--pair-filter "\\${par_pair_filter}"} \\\\ + \\${par_interleaved:+--interleaved} +) +debug "Arguments to cutadapt:" +debug \\$paired_args +debug + +# Input arguments +########################################################### +echo ">> Parsing input arguments" +[[ "\\$par_no_indels" == "false" ]] && unset par_no_indels +[[ "\\$par_match_read_wildcards" == "false" ]] && unset par_match_read_wildcards +[[ "\\$par_no_match_adapter_wildcards" == "false" ]] && unset par_no_match_adapter_wildcards +[[ "\\$par_revcomp" == "false" ]] && unset par_revcomp + +input_args=\\$(echo \\\\ + \\${par_error_rate:+--error-rate "\\${par_error_rate}"} \\\\ + \\${par_no_indels:+--no-indels} \\\\ + \\${par_times:+--times "\\${par_times}"} \\\\ + \\${par_overlap:+--overlap "\\${par_overlap}"} \\\\ + \\${par_match_read_wildcards:+--match-read-wildcards} \\\\ + \\${par_no_match_adapter_wildcards:+--no-match-adapter-wildcards} \\\\ + \\${par_action:+--action="\\${par_action}"} \\\\ + \\${par_revcomp:+--revcomp} \\\\ +) +debug "Arguments to cutadapt:" +debug \\$input_args +debug + +# Read modifications +########################################################### +echo ">> Parsing read modification arguments" +[[ "\\$par_poly_a" == "false" ]] && unset par_poly_a +[[ "\\$par_trim_n" == "false" ]] && unset par_trim_n +[[ "\\$par_zero_cap" == "false" ]] && unset par_zero_cap + +mod_args=\\$(echo \\\\ + \\${par_cut:+--cut "\\${par_cut}"} \\\\ + \\${par_cut_r2:+--cut_r2 "\\${par_cut_r2}"} \\\\ + \\${par_nextseq_trim:+--nextseq-trim "\\${par_nextseq_trim}"} \\\\ + \\${par_quality_cutoff:+--quality-cutoff "\\${par_quality_cutoff}"} \\\\ + \\${par_quality_cutoff_r2:+-Q "\\${par_quality_cutoff_r2}"} \\\\ + \\${par_quality_base:+--quality-base "\\${par_quality_base}"} \\\\ + \\${par_poly_a:+--poly-a} \\\\ + \\${par_length:+--length "\\${par_length}"} \\\\ + \\${par_trim_n:+--trim-n} \\\\ + \\${par_length_tag:+--length-tag "\\${par_length_tag}"} \\\\ + \\${par_strip_suffix:+--strip-suffix "\\${par_strip_suffix}"} \\\\ + \\${par_prefix:+--prefix "\\${par_prefix}"} \\\\ + \\${par_suffix:+--suffix "\\${par_suffix}"} \\\\ + \\${par_rename:+--rename "\\${par_rename}"} \\\\ + \\${par_zero_cap:+--zero-cap} \\\\ +) +debug "Arguments to cutadapt:" +debug \\$mod_args +debug + +# Filtering of processed reads arguments +########################################################### +echo ">> Filtering of processed reads arguments" +[[ "\\$par_discard_trimmed" == "false" ]] && unset par_discard_trimmed +[[ "\\$par_discard_untrimmed" == "false" ]] && unset par_discard_untrimmed +[[ "\\$par_discard_casava" == "false" ]] && unset par_discard_casava + +# Parse and transform the minimum and maximum length arguments +[[ -z \\$par_minimum_length ]] + +filter_args=\\$(echo \\\\ + \\${par_minimum_length:+--minimum-length "\\${par_minimum_length}"} \\\\ + \\${par_maximum_length:+--maximum-length "\\${par_maximum_length}"} \\\\ + \\${par_max_n:+--max-n "\\${par_max_n}"} \\\\ + \\${par_max_expected_errors:+--max-expected-errors "\\${par_max_expected_errors}"} \\\\ + \\${par_max_average_error_rate:+--max-average-error-rate "\\${par_max_average_error_rate}"} \\\\ + \\${par_discard_trimmed:+--discard-trimmed} \\\\ + \\${par_discard_untrimmed:+--discard-untrimmed} \\\\ + \\${par_discard_casava:+--discard-casava} \\\\ +) +debug "Arguments to cutadapt:" +debug \\$filter_args +debug + +# Optional output arguments +########################################################### +echo ">> Optional arguments" +[[ "\\$par_json" == "false" ]] && unset par_json +[[ "\\$par_fasta" == "false" ]] && unset par_fasta +[[ "\\$par_info_file" == "false" ]] && unset par_info_file + +optional_output_args=\\$(echo \\\\ + \\${par_report:+--report "\\${par_report}"} \\\\ + \\${par_json:+--json "report.json"} \\\\ + \\${par_fasta:+--fasta} \\\\ + \\${par_info_file:+--info-file "info.txt"} \\\\ +) + +debug "Arguments to cutadapt:" +debug \\$optional_output_args +debug + +# Output arguments +# We write the output to a directory rather than +# individual files. +########################################################### + +if [[ -z \\$par_fasta ]]; then + ext="fastq" +else + ext="fasta" +fi + +demultiplex_mode="\\$par_demultiplex_mode" +if [[ \\$mode == "se" ]]; then + if [[ "\\$demultiplex_mode" == "unique_dual" ]] || [[ "\\$demultiplex_mode" == "combinatorial_dual" ]]; then + echo "Demultiplexing dual indexes is not possible with single-end data." + exit 1 + fi + prefix="trimmed_" + if [[ ! -z "\\$demultiplex_mode" ]]; then + prefix="{name}_" + fi + output_args=\\$(echo \\\\ + --output "\\$output_dir/\\${prefix}001.\\$ext" \\\\ + ) +else + demultiplex_indicator_r1='{name}_' + demultiplex_indicator_r2=\\$demultiplex_indicator_r1 + if [[ "\\$demultiplex_mode" == "combinatorial_dual" ]]; then + demultiplex_indicator_r1='{name1}_{name2}_' + demultiplex_indicator_r2='{name1}_{name2}_' + fi + prefix_r1="trimmed_" + prefix_r2="trimmed_" + if [[ ! -z "\\$demultiplex_mode" ]]; then + prefix_r1=\\$demultiplex_indicator_r1 + prefix_r2=\\$demultiplex_indicator_r2 + fi + output_args=\\$(echo \\\\ + --output "\\$output_dir/\\${prefix_r1}R1_001.\\$ext" \\\\ + --paired-output "\\$output_dir/\\${prefix_r2}R2_001.\\$ext" \\\\ + ) +fi + +debug "Arguments to cutadapt:" +debug \\$output_args +debug + +# Full CLI +# Set the --cores argument to 0 unless meta_cpus is set +########################################################### +echo ">> Running cutadapt" +par_cpus=0 +[[ ! -z \\$meta_cpus ]] && par_cpus=\\$meta_cpus + +cli=\\$(echo \\\\ + \\$input \\\\ + \\$adapter_args \\\\ + \\$paired_args \\\\ + \\$input_args \\\\ + \\$mod_args \\\\ + \\$filter_args \\\\ + \\$optional_output_args \\\\ + \\$output_args \\\\ + --cores \\$par_cpus +) + +debug ">> Full CLI to be run:" +debug cutadapt \\$cli | sed -e 's/--/\\\\r\\\\n --/g' +debug + +cutadapt \\$cli +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/biobox/cutadapt", + "tag" : "v0.3.1" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow.config b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow.config new file mode 100644 index 00000000..09aff55b --- /dev/null +++ b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'cutadapt' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'v0.3.1' + description = 'Cutadapt removes adapter sequences from high-throughput sequencing reads.\n' + author = 'Toni Verbeiren' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + + diff --git a/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow_schema.json b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow_schema.json new file mode 100644 index 00000000..2b7fb1b4 --- /dev/null +++ b/target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/nextflow_schema.json @@ -0,0 +1,775 @@ +{ +"$schema": "http://json-schema.org/draft-07/schema", +"title": "cutadapt", +"description": "Cutadapt removes adapter sequences from high-throughput sequencing reads.\n", +"type": "object", +"definitions": { + + + + "specify adapters for r1" : { + "title": "Specify Adapters for R1", + "type": "object", + "description": "No description", + "properties": { + + + "adapter": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n" + + } + + + , + "front": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n" + + } + + + , + "anywhere": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n" + + } + + +} +}, + + + "specify adapters using fasta files for r1" : { + "title": "Specify Adapters using Fasta files for R1", + "type": "object", + "description": "No description", + "properties": { + + + "adapter_fasta": { + "type": + "string", + "description": "Type: List of `file`, multiple_sep: `\";\"`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)", + "help_text": "Type: List of `file`, multiple_sep: `\";\"`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n" + + } + + + , + "front_fasta": { + "type": + "string", + "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)", + "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n" + + } + + + , + "anywhere_fasta": { + "type": + "string", + "description": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)", + "help_text": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n" + + } + + +} +}, + + + "specify adapters for r2" : { + "title": "Specify Adapters for R2", + "type": "object", + "description": "No description", + "properties": { + + + "adapter_r2": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n" + + } + + + , + "front_r2": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n" + + } + + + , + "anywhere_r2": { + "type": + "string", + "description": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)", + "help_text": "Type: List of `string`, multiple_sep: `\";\"`. Sequence of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n" + + } + + +} +}, + + + "specify adapters using fasta files for r2" : { + "title": "Specify Adapters using Fasta files for R2", + "type": "object", + "description": "No description", + "properties": { + + + "adapter_r2_fasta": { + "type": + "string", + "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read)", + "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 3\u0027 end (paired data:\nof the first read). The adapter and subsequent bases are\ntrimmed. If a \u0027$\u0027 character is appended (\u0027anchoring\u0027), the\nadapter is only found if it is a suffix of the read.\n" + + } + + + , + "front_r2_fasta": { + "type": + "string", + "description": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read)", + "help_text": "Type: `file`. Fasta file containing sequences of an adapter ligated to the 5\u0027 end (paired data:\nof the first read). The adapter and any preceding bases\nare trimmed. Partial matches at the 5\u0027 end are allowed. If\na \u0027^\u0027 character is prepended (\u0027anchoring\u0027), the adapter is\nonly found if it is a prefix of the read.\n" + + } + + + , + "anywhere_r2_fasta": { + "type": + "string", + "description": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read)", + "help_text": "Type: `file`. Fasta file containing sequences of an adapter that may be ligated to the 5\u0027 or 3\u0027\nend (paired data: of the first read). Both types of\nmatches as described under -a and -g are allowed. If the\nfirst base of the read is part of the match, the behavior\nis as with -g, otherwise as with -a. This option is mostly\nfor rescuing failed library preparations - do not use if\nyou know which end your adapter was ligated to!\n" + + } + + +} +}, + + + "paired-end options" : { + "title": "Paired-end options", + "type": "object", + "description": "No description", + "properties": { + + + "pair_adapters": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Treat adapters given with -a/-A etc", + "help_text": "Type: `boolean_true`, default: `false`. Treat adapters given with -a/-A etc. as pairs. Either both\nor none are removed from each read pair.\n" + , + "default":false + } + + + , + "pair_filter": { + "type": + "string", + "description": "Type: `string`, choices: ``any`, `both`, `first``. Which of the reads in a paired-end read have to match the\nfiltering criterion in order for the pair to be filtered", + "help_text": "Type: `string`, choices: ``any`, `both`, `first``. Which of the reads in a paired-end read have to match the\nfiltering criterion in order for the pair to be filtered.\n", + "enum": ["any", "both", "first"] + + + } + + + , + "interleaved": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Read and/or write interleaved paired-end reads", + "help_text": "Type: `boolean_true`, default: `false`. Read and/or write interleaved paired-end reads.\n" + , + "default":false + } + + +} +}, + + + "input parameters" : { + "title": "Input parameters", + "type": "object", + "description": "No description", + "properties": { + + + "input": { + "type": + "string", + "description": "Type: `file`, required. Input fastq file for single-end reads or R1 for paired-end reads", + "help_text": "Type: `file`, required. Input fastq file for single-end reads or R1 for paired-end reads.\n" + + } + + + , + "input_r2": { + "type": + "string", + "description": "Type: `file`. Input fastq file for R2 in the case of paired-end reads", + "help_text": "Type: `file`. Input fastq file for R2 in the case of paired-end reads.\n" + + } + + + , + "error_rate": { + "type": + "number", + "description": "Type: `double`, example: `0.1`. Maximum allowed error rate (if 0 \u003c= E \u003c 1), or absolute\nnumber of errors for full-length adapter match (if E is an\ninteger \u003e= 1)", + "help_text": "Type: `double`, example: `0.1`. Maximum allowed error rate (if 0 \u003c= E \u003c 1), or absolute\nnumber of errors for full-length adapter match (if E is an\ninteger \u003e= 1). Error rate = no. of errors divided by\nlength of matching region. Default: 0.1 (10%).\n" + + } + + + , + "no_indels": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Allow only mismatches in alignments", + "help_text": "Type: `boolean_true`, default: `false`. Allow only mismatches in alignments.\n" + , + "default":false + } + + + , + "times": { + "type": + "integer", + "description": "Type: `integer`, example: `1`. Remove up to COUNT adapters from each read", + "help_text": "Type: `integer`, example: `1`. Remove up to COUNT adapters from each read. Default: 1.\n" + + } + + + , + "overlap": { + "type": + "integer", + "description": "Type: `integer`, example: `3`. Require MINLENGTH overlap between read and adapter for an\nadapter to be found", + "help_text": "Type: `integer`, example: `3`. Require MINLENGTH overlap between read and adapter for an\nadapter to be found. The default is 3.\n" + + } + + + , + "match_read_wildcards": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Interpret IUPAC wildcards in reads", + "help_text": "Type: `boolean_true`, default: `false`. Interpret IUPAC wildcards in reads.\n" + , + "default":false + } + + + , + "no_match_adapter_wildcards": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Do not interpret IUPAC wildcards in adapters", + "help_text": "Type: `boolean_true`, default: `false`. Do not interpret IUPAC wildcards in adapters.\n" + , + "default":false + } + + + , + "action": { + "type": + "string", + "description": "Type: `string`, example: `trim`, choices: ``trim`, `retain`, `mask`, `lowercase`, `none``. What to do if a match was found", + "help_text": "Type: `string`, example: `trim`, choices: ``trim`, `retain`, `mask`, `lowercase`, `none``. What to do if a match was found. trim: trim adapter and\nup- or downstream sequence; retain: trim, but retain\nadapter; mask: replace with \u0027N\u0027 characters; lowercase:\nconvert to lowercase; none: leave unchanged.\nThe default is trim.\n", + "enum": ["trim", "retain", "mask", "lowercase", "none"] + + + } + + + , + "revcomp": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Check both the read and its reverse complement for adapter\nmatches", + "help_text": "Type: `boolean_true`, default: `false`. Check both the read and its reverse complement for adapter\nmatches. If match is on reverse-complemented version,\noutput that one.\n" + , + "default":false + } + + +} +}, + + + "demultiplexing options" : { + "title": "Demultiplexing options", + "type": "object", + "description": "No description", + "properties": { + + + "demultiplex_mode": { + "type": + "string", + "description": "Type: `string`, choices: ``single`, `unique_dual`, `combinatorial_dual``. Enable demultiplexing and set the mode for it", + "help_text": "Type: `string`, choices: ``single`, `unique_dual`, `combinatorial_dual``. Enable demultiplexing and set the mode for it.\nWith mode \u0027unique_dual\u0027, adapters from the first and second read are used,\nand the indexes from the reads are only used in pairs. This implies\n--pair_adapters.\nEnabling mode \u0027combinatorial_dual\u0027 allows all combinations of the sets of indexes\non R1 and R2. It is necessary to write each read pair to an output\nfile depending on the adapters found on both R1 and R2.\nMode \u0027single\u0027, uses indexes or barcodes located at the 5\u0027\nend of the R1 read (single). \n", + "enum": ["single", "unique_dual", "combinatorial_dual"] + + + } + + +} +}, + + + "read modifications" : { + "title": "Read modifications", + "type": "object", + "description": "No description", + "properties": { + + + "cut": { + "type": + "string", + "description": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (or R1 if paired; use --cut_r2\noption for R2)", + "help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (or R1 if paired; use --cut_r2\noption for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n" + + } + + + , + "cut_r2": { + "type": + "string", + "description": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (for R2)", + "help_text": "Type: List of `integer`, multiple_sep: `\";\"`. Remove LEN bases from each read (for R2). If LEN is positive, remove bases from the\nbeginning. If LEN is negative, remove bases from the end.\nCan be used twice if LENs have different signs. Applied\n*before* adapter trimming.\n" + + } + + + , + "nextseq_trim": { + "type": + "string", + "description": "Type: `string`. NextSeq-specific quality trimming (each read)", + "help_text": "Type: `string`. NextSeq-specific quality trimming (each read). Trims also\ndark cycles appearing as high-quality G bases.\n" + + } + + + , + "quality_cutoff": { + "type": + "string", + "description": "Type: `string`. Trim low-quality bases from 5\u0027 and/or 3\u0027 ends of each read\nbefore adapter removal", + "help_text": "Type: `string`. Trim low-quality bases from 5\u0027 and/or 3\u0027 ends of each read\nbefore adapter removal. Applied to both reads if data is\npaired. If one value is given, only the 3\u0027 end is trimmed.\nIf two comma-separated cutoffs are given, the 5\u0027 end is\ntrimmed with the first cutoff, the 3\u0027 end with the second.\n" + + } + + + , + "quality_cutoff_r2": { + "type": + "string", + "description": "Type: `string`. Quality-trimming cutoff for R2", + "help_text": "Type: `string`. Quality-trimming cutoff for R2. Default: same as for R1\n" + + } + + + , + "quality_base": { + "type": + "integer", + "description": "Type: `integer`, example: `33`. Assume that quality values in FASTQ are encoded as\nascii(quality + N)", + "help_text": "Type: `integer`, example: `33`. Assume that quality values in FASTQ are encoded as\nascii(quality + N). This needs to be set to 64 for some\nold Illumina FASTQ files. The default is 33.\n" + + } + + + , + "poly_a": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Trim poly-A tails", + "help_text": "Type: `boolean_true`, default: `false`. Trim poly-A tails" + , + "default":false + } + + + , + "length": { + "type": + "integer", + "description": "Type: `integer`. Shorten reads to LENGTH", + "help_text": "Type: `integer`. Shorten reads to LENGTH. Positive values remove bases at\nthe end while negative ones remove bases at the beginning.\nThis and the following modifications are applied after\nadapter trimming.\n" + + } + + + , + "trim_n": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Trim N\u0027s on ends of reads", + "help_text": "Type: `boolean_true`, default: `false`. Trim N\u0027s on ends of reads." + , + "default":false + } + + + , + "length_tag": { + "type": + "string", + "description": "Type: `string`, example: `length=`. Search for TAG followed by a decimal number in the\ndescription field of the read", + "help_text": "Type: `string`, example: `length=`. Search for TAG followed by a decimal number in the\ndescription field of the read. Replace the decimal number\nwith the correct length of the trimmed read. For example,\nuse --length-tag \u0027length=\u0027 to correct fields like\n\u0027length=123\u0027.\n" + + } + + + , + "strip_suffix": { + "type": + "string", + "description": "Type: `string`. Remove this suffix from read names if present", + "help_text": "Type: `string`. Remove this suffix from read names if present. Can be\ngiven multiple times.\n" + + } + + + , + "prefix": { + "type": + "string", + "description": "Type: `string`. Add this prefix to read names", + "help_text": "Type: `string`. Add this prefix to read names. Use {name} to insert the\nname of the matching adapter.\n" + + } + + + , + "suffix": { + "type": + "string", + "description": "Type: `string`. Add this suffix to read names; can also include {name}\n", + "help_text": "Type: `string`. Add this suffix to read names; can also include {name}\n" + + } + + + , + "rename": { + "type": + "string", + "description": "Type: `string`. Rename reads using TEMPLATE containing variables such as\n{id}, {adapter_name} etc", + "help_text": "Type: `string`. Rename reads using TEMPLATE containing variables such as\n{id}, {adapter_name} etc. (see documentation)\n" + + } + + + , + "zero_cap": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Change negative quality values to zero", + "help_text": "Type: `boolean_true`, default: `false`. Change negative quality values to zero." + , + "default":false + } + + +} +}, + + + "filtering of processed reads" : { + "title": "Filtering of processed reads", + "type": "object", + "description": "Filters are applied after above read modifications. Paired-end reads are\nalways discarded pairwise (see also --pair_filter).\n", + "properties": { + + + "minimum_length": { + "type": + "string", + "description": "Type: `string`, example: `0`. Discard reads shorter than LEN", + "help_text": "Type: `string`, example: `0`. Discard reads shorter than LEN. Default is 0.\nWhen trimming paired-end reads, the minimum lengths for R1 and R2 can be specified separately by separating them with a colon (:).\nIf the colon syntax is not used, the same minimum length applies to both reads, as discussed above.\nAlso, one of the values can be omitted to impose no restrictions.\nFor example, with -m 17:, the length of R1 must be at least 17, but the length of R2 is ignored.\n" + + } + + + , + "maximum_length": { + "type": + "string", + "description": "Type: `string`. Discard reads longer than LEN", + "help_text": "Type: `string`. Discard reads longer than LEN. Default: no limit.\nFor paired reads, see the remark for --minimum_length\n" + + } + + + , + "max_n": { + "type": + "string", + "description": "Type: `string`. Discard reads with more than COUNT \u0027N\u0027 bases", + "help_text": "Type: `string`. Discard reads with more than COUNT \u0027N\u0027 bases. If COUNT is\na number between 0 and 1, it is interpreted as a fraction\nof the read length.\n" + + } + + + , + "max_expected_errors": { + "type": + "string", + "description": "Type: `long`. Discard reads whose expected number of errors (computed\nfrom quality values) exceeds ERRORS", + "help_text": "Type: `long`. Discard reads whose expected number of errors (computed\nfrom quality values) exceeds ERRORS.\n" + + } + + + , + "max_average_error_rate": { + "type": + "string", + "description": "Type: `long`. as --max_expected_errors (see above), but divided by\nlength to account for reads of varying length", + "help_text": "Type: `long`. as --max_expected_errors (see above), but divided by\nlength to account for reads of varying length.\n" + + } + + + , + "discard_trimmed": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Discard reads that contain an adapter", + "help_text": "Type: `boolean_true`, default: `false`. Discard reads that contain an adapter. Use also -O to\navoid discarding too many randomly matching reads.\n" + , + "default":false + } + + + , + "discard_untrimmed": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Discard reads that do not contain an adapter", + "help_text": "Type: `boolean_true`, default: `false`. Discard reads that do not contain an adapter.\n" + , + "default":false + } + + + , + "discard_casava": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Discard reads that did not pass CASAVA filtering (header\nhas :Y:)", + "help_text": "Type: `boolean_true`, default: `false`. Discard reads that did not pass CASAVA filtering (header\nhas :Y:).\n" + , + "default":false + } + + +} +}, + + + "output parameters" : { + "title": "Output parameters", + "type": "object", + "description": "No description", + "properties": { + + + "report": { + "type": + "string", + "description": "Type: `string`, example: `full`, choices: ``full`, `minimal``. Which type of report to print: \u0027full\u0027 (default) or \u0027minimal\u0027", + "help_text": "Type: `string`, example: `full`, choices: ``full`, `minimal``. Which type of report to print: \u0027full\u0027 (default) or \u0027minimal\u0027.\n", + "enum": ["full", "minimal"] + + + } + + + , + "json": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Write report in JSON format to this file", + "help_text": "Type: `boolean_true`, default: `false`. Write report in JSON format to this file.\n" + , + "default":false + } + + + , + "output": { + "type": + "string", + "description": "Type: List of `file`, required, default: `$id.$key.output_*.fast[a,q]`, example: `fastq/*_001.fast[a,q]`, multiple_sep: `\";\"`. Glob pattern for matching the expected output files", + "help_text": "Type: List of `file`, required, default: `$id.$key.output_*.fast[a,q]`, example: `fastq/*_001.fast[a,q]`, multiple_sep: `\";\"`. Glob pattern for matching the expected output files.\nShould include `$output_dir`.\n" + , + "default":"$id.$key.output_*.fast[a,q]" + } + + + , + "fasta": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Output FASTA to standard output even on FASTQ input", + "help_text": "Type: `boolean_true`, default: `false`. Output FASTA to standard output even on FASTQ input.\n" + , + "default":false + } + + + , + "info_file": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Write information about each read and its adapter matches\ninto info", + "help_text": "Type: `boolean_true`, default: `false`. Write information about each read and its adapter matches\ninto info.txt in the output directory.\nSee the documentation for the file format.\n" + , + "default":false + } + + +} +}, + + + "debug" : { + "title": "Debug", + "type": "object", + "description": "No description", + "properties": { + + + "debug": { + "type": + "boolean", + "description": "Type: `boolean_true`, default: `false`. Print debug information", + "help_text": "Type: `boolean_true`, default: `false`. Print debug information" + , + "default":false + } + + +} +}, + + + "nextflow input-output arguments" : { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + + + "publish_dir": { + "type": + "string", + "description": "Type: `string`, required, example: `output/`. Path to an output directory", + "help_text": "Type: `string`, required, example: `output/`. Path to an output directory." + + } + + + , + "param_list": { + "type": + "string", + "description": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel", + "help_text": "Type: `string`, example: `my_params.yaml`. Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.\n\n* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ [\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027], [\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027] ]`.\n* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.\n* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]`.\n* A yaml blob can also be passed directly as a string. Example: `--param_list \"[ {\u0027id\u0027: \u0027foo\u0027, \u0027input\u0027: \u0027foo.txt\u0027}, {\u0027id\u0027: \u0027bar\u0027, \u0027input\u0027: \u0027bar.txt\u0027} ]\"`.\n\nWhen passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.", + "hidden": true + + } + + +} +} +}, +"allOf": [ + + { + "$ref": "#/definitions/specify adapters for r1" + }, + + { + "$ref": "#/definitions/specify adapters using fasta files for r1" + }, + + { + "$ref": "#/definitions/specify adapters for r2" + }, + + { + "$ref": "#/definitions/specify adapters using fasta files for r2" + }, + + { + "$ref": "#/definitions/paired-end options" + }, + + { + "$ref": "#/definitions/input parameters" + }, + + { + "$ref": "#/definitions/demultiplexing options" + }, + + { + "$ref": "#/definitions/read modifications" + }, + + { + "$ref": "#/definitions/filtering of processed reads" + }, + + { + "$ref": "#/definitions/output parameters" + }, + + { + "$ref": "#/definitions/debug" + }, + + { + "$ref": "#/definitions/nextflow input-output arguments" + } +] +} diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/.config.vsh.yaml b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/.config.vsh.yaml new file mode 100644 index 00000000..be617241 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/.config.vsh.yaml @@ -0,0 +1,216 @@ +name: "concat_text" +version: "v0.3.0" +authors: +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +- name: "Dries Schaumont" + roles: + - "reviewer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input" + description: "A list of (gzipped) text files." + info: null + example: + - "input?.txt.gz" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "boolean_true" + name: "--gzip_output" + description: "Should the output be zipped?" + info: null + direction: "input" + - type: "file" + name: "--output" + description: "File to write the output to, optionally gzipped." + info: null + example: + - "output.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +summary: "Concatenate a number of text files" +description: "Concatenate a number of text files, handle gzipped text files gracefully\ + \ and\noptionally gzip the output text file.\n\nThis component is useful for concatening\ + \ fastq files from different lanes, for instance.\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: + improvements: "This component could be improved in 2 ways:\n 1. Allow for a mix\ + \ of zipped and plain input files\n 2. Allow to specify a compression algorithm\ + \ for the output\n" +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/craftbox" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "alpine:latest" + target_registry: "images.viash-hub.com" + target_tag: "v0.3.0" + namespace_separator: "/" + setup: + - type: "apk" + packages: + - "bash" + - "procps" + - "file" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/concat_text/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/concat_text" + executable: "target/nextflow/concat_text/main.nf" + viash_version: "0.9.4" + git_commit: "a1801c5139bbcda244e06affa66d16f7abd5e124" + git_remote: "https://github.com/viash-hub/craftbox" + git_tag: "v0.1.0-11-ga1801c5" +package_config: + name: "craftbox" + version: "v0.3.0" + summary: "A collection of custom-tailored scripts and applied utilities built with\ + \ Viash.\n" + description: "`craftbox` is a curated collection of custom scripts and utilities\ + \ designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles,\ + \ `craftbox` components aim for **reusability**, **reproducibility**, and adherence\ + \ to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow\ + \ Ready:** Components are built to run directly via the command line or be smoothly\ + \ integrated into Nextflow workflows.\n* **Custom Implementations:** Contains\ + \ scripts and tools developed for particular tasks that may not be found in broader\ + \ collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear\ + \ documentation for components and their parameters.\n * Full exposure of underlying\ + \ script/tool arguments for fine-grained control.\n * Containerized (Docker)\ + \ to ensure dependency management and a consistent, reproducible runtime environment.\n\ + \ * Unit tested where applicable to ensure components function as expected.\n" + info: null + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'v0.3.0'" + keywords: + - "scripts" + - "custom" + - "implementations" + - "utilities" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/craftbox" + issue_tracker: "https://github.com/viash-hub/craftbox/issues" diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/main.nf b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/main.nf new file mode 100644 index 00000000..18f7a853 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/main.nf @@ -0,0 +1,3868 @@ +// concat_text v0.3.0 +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Toni Verbeiren (author, maintainer) +// * Dries Schaumont (reviewer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "concat_text", + "version" : "v0.3.0", + "authors" : [ + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + }, + { + "name" : "Dries Schaumont", + "roles" : [ + "reviewer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "A list of (gzipped) text files.", + "example" : [ + "input?.txt.gz" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "boolean_true", + "name" : "--gzip_output", + "description" : "Should the output be zipped?", + "direction" : "input" + }, + { + "type" : "file", + "name" : "--output", + "description" : "File to write the output to, optionally gzipped.", + "example" : [ + "output.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + } + ], + "summary" : "Concatenate a number of text files", + "description" : "Concatenate a number of text files, handle gzipped text files gracefully and\noptionally gzip the output text file.\n\nThis component is useful for concatening fastq files from different lanes, for instance.\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "info" : { + "improvements" : "This component could be improved in 2 ways:\n 1. Allow for a mix of zipped and plain input files\n 2. Allow to specify a compression algorithm for the output\n" + }, + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/craftbox" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "alpine:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "v0.3.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apk", + "packages" : [ + "bash", + "procps", + "file" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/concat_text/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/concat_text", + "viash_version" : "0.9.4", + "git_commit" : "a1801c5139bbcda244e06affa66d16f7abd5e124", + "git_remote" : "https://github.com/viash-hub/craftbox", + "git_tag" : "v0.1.0-11-ga1801c5" + }, + "package_config" : { + "name" : "craftbox", + "version" : "v0.3.0", + "summary" : "A collection of custom-tailored scripts and applied utilities built with Viash.\n", + "description" : "`craftbox` is a curated collection of custom scripts and utilities designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles, `craftbox` components aim for **reusability**, **reproducibility**, and adherence to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow Ready:** Components are built to run directly via the command line or be smoothly integrated into Nextflow workflows.\n* **Custom Implementations:** Contains scripts and tools developed for particular tasks that may not be found in broader collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear documentation for components and their parameters.\n * Full exposure of underlying script/tool arguments for fine-grained control.\n * Containerized (Docker) to ensure dependency management and a consistent, reproducible runtime environment.\n * Unit tested where applicable to ensure components function as expected.\n", + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'v0.3.0'" + ], + "keywords" : [ + "scripts", + "custom", + "implementations", + "utilities" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/craftbox", + "issue_tracker" : "https://github.com/viash-hub/craftbox/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_GZIP_OUTPUT+x} ]; then echo "${VIASH_PAR_GZIP_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_gzip_output='&'#" ; else echo "# par_gzip_output="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/usr/bin/env bash + +set -euo pipefail + +TMPDIR=\\$(mktemp -d "\\$meta_temp_dir/concat_text-XXXXXX") +function clean_up { + [[ -d "\\$TMPDIR" ]] && rm -r "\\$TMPDIR" +} +trap clean_up EXIT + +par_input="\\$(echo "\\$par_input" | tr ';' ' ')" + +echo -n ">> Check if input is gzipped... " +set +eo pipefail +file \\$par_input | grep -q 'gzip' +is_zipped="\\$?" +set -euo pipefail +[[ "\\$is_zipped" == "0" ]] && echo "yes" || echo "no" + +if [[ "\\$is_zipped" == "0" ]]; then + echo ">> zcat gzipped files" + zcat \\$par_input > \\$TMPDIR/contents +else + echo ">> cat plain files" + cat \\$par_input > \\$TMPDIR/contents +fi + +if [ "\\$par_gzip_output" == true ]; then + echo ">> Zip output file" + gzip \\$TMPDIR/contents + mv \\$TMPDIR/contents.gz \\$par_output +else + mv \\$TMPDIR/contents \\$par_output +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/craftbox/concat_text", + "tag" : "v0.3.0" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow.config b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow.config new file mode 100644 index 00000000..00643d65 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'concat_text' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'v0.3.0' + description = 'Concatenate a number of text files, handle gzipped text files gracefully and\noptionally gzip the output text file.\n\nThis component is useful for concatening fastq files from different lanes, for instance.\n' + author = 'Toni Verbeiren, Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + + diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow_schema.json b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow_schema.json new file mode 100644 index 00000000..f38d953e --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/nextflow_schema.json @@ -0,0 +1,68 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "concat_text", + "description": "Concatenate a number of text files, handle gzipped text files gracefully and\noptionally gzip the output text file.\n\nThis component is useful for concatening fastq files from different lanes, for instance.\n", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "A list of (gzipped) text files.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"input?.txt.gz\"]`. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "gzip_output": { + "type": "boolean", + "description": "Should the output be zipped?", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + }, + "output": { + "type": "string", + "format": "path", + "description": "File to write the output to, optionally gzipped.", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.output.txt\"`, direction: `output`, example: `\"output.txt\"`. ", + "default": "$id.$key.output.txt" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/.config.vsh.yaml b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/.config.vsh.yaml new file mode 100644 index 00000000..1ddb6b6f --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/.config.vsh.yaml @@ -0,0 +1,194 @@ +name: "move_files_to_directory" +version: "v0.3.0" +authors: +- name: "Dorien Roosen" + roles: + - "maintainer" + info: + links: + email: "dorien@data-intuitive.com" + github: "dorien-er" + linkedin: "dorien-roosen" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "Paths of the files that will be copied into the output directory." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Path to output directory" + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "boolean_true" + name: "--keep_symbolic_links" + alternatives: + - "-d" + description: "Preserve symbolic links." + info: null + direction: "input" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +summary: "Publish one or multiple files to the same directory" +description: "This component copies one or multiple files to the same destination\ + \ directory, creating the output directory if it doesn't exist." +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/craftbox" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:latest" + target_registry: "images.viash-hub.com" + target_tag: "v0.3.0" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/move_files_to_directory/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/move_files_to_directory" + executable: "target/nextflow/move_files_to_directory/main.nf" + viash_version: "0.9.4" + git_commit: "a1801c5139bbcda244e06affa66d16f7abd5e124" + git_remote: "https://github.com/viash-hub/craftbox" + git_tag: "v0.1.0-11-ga1801c5" +package_config: + name: "craftbox" + version: "v0.3.0" + summary: "A collection of custom-tailored scripts and applied utilities built with\ + \ Viash.\n" + description: "`craftbox` is a curated collection of custom scripts and utilities\ + \ designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles,\ + \ `craftbox` components aim for **reusability**, **reproducibility**, and adherence\ + \ to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow\ + \ Ready:** Components are built to run directly via the command line or be smoothly\ + \ integrated into Nextflow workflows.\n* **Custom Implementations:** Contains\ + \ scripts and tools developed for particular tasks that may not be found in broader\ + \ collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear\ + \ documentation for components and their parameters.\n * Full exposure of underlying\ + \ script/tool arguments for fine-grained control.\n * Containerized (Docker)\ + \ to ensure dependency management and a consistent, reproducible runtime environment.\n\ + \ * Unit tested where applicable to ensure components function as expected.\n" + info: null + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'v0.3.0'" + keywords: + - "scripts" + - "custom" + - "implementations" + - "utilities" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/craftbox" + issue_tracker: "https://github.com/viash-hub/craftbox/issues" diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/main.nf b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/main.nf new file mode 100644 index 00000000..eda882f3 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/main.nf @@ -0,0 +1,3828 @@ +// move_files_to_directory v0.3.0 +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dorien Roosen (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "move_files_to_directory", + "version" : "v0.3.0", + "authors" : [ + { + "name" : "Dorien Roosen", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dorien@data-intuitive.com", + "github" : "dorien-er", + "linkedin" : "dorien-roosen" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Paths of the files that will be copied into the output directory.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Path to output directory", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "boolean_true", + "name" : "--keep_symbolic_links", + "alternatives" : [ + "-d" + ], + "description" : "Preserve symbolic links.", + "direction" : "input" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + } + ], + "summary" : "Publish one or multiple files to the same directory", + "description" : "This component copies one or multiple files to the same destination directory, creating the output directory if it doesn't exist.", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/craftbox" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + } + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "debian:latest", + "target_registry" : "images.viash-hub.com", + "target_tag" : "v0.3.0", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/move_files_to_directory/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/move_files_to_directory", + "viash_version" : "0.9.4", + "git_commit" : "a1801c5139bbcda244e06affa66d16f7abd5e124", + "git_remote" : "https://github.com/viash-hub/craftbox", + "git_tag" : "v0.1.0-11-ga1801c5" + }, + "package_config" : { + "name" : "craftbox", + "version" : "v0.3.0", + "summary" : "A collection of custom-tailored scripts and applied utilities built with Viash.\n", + "description" : "`craftbox` is a curated collection of custom scripts and utilities designed to tackle context-specific tasks.\n\nEmphasizing the Viash principles, `craftbox` components aim for **reusability**, **reproducibility**, and adherence to **best practices**. Key features generally include:\n\n* **Standalone & Nextflow Ready:** Components are built to run directly via the command line or be smoothly integrated into Nextflow workflows.\n* **Custom Implementations:** Contains scripts and tools developed for particular tasks that may not be found in broader collections.\n* **High Quality Standards (promoted by Viash):**\n * Clear documentation for components and their parameters.\n * Full exposure of underlying script/tool arguments for fine-grained control.\n * Containerized (Docker) to ensure dependency management and a consistent, reproducible runtime environment.\n * Unit tested where applicable to ensure components function as expected.\n", + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'v0.3.0'" + ], + "keywords" : [ + "scripts", + "custom", + "implementations", + "utilities" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/craftbox", + "issue_tracker" : "https://github.com/viash-hub/craftbox/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +set -eo pipefail + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_KEEP_SYMBOLIC_LINKS+x} ]; then echo "${VIASH_PAR_KEEP_SYMBOLIC_LINKS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_keep_symbolic_links='&'#" ; else echo "# par_keep_symbolic_links="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + + +if [[ ! -d "\\$par_output" ]]; then + mkdir -p "\\$par_output" +fi + +extra_params=( ) + +if [ "\\$par_keep_symbolic_links" == "true" ]; then + extra_params+=( "-d" ) +fi + +# Process multiple input files +IFS=";" read -ra input_files <<< "\\$par_input" +for file in "\\${input_files[@]}"; do + # Check if the file exists before copying + if [[ -f "\\$file" ]]; then + + cp \\${extra_params[@]} "\\$file" "\\$par_output/" + echo "Copied \\$file to \\$par_output/" + else + echo "Warning: Input file \\$file does not exist, skipping" + fi +done +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/craftbox/move_files_to_directory", + "tag" : "v0.3.0" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow.config b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow.config new file mode 100644 index 00000000..b038ed20 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'move_files_to_directory' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'v0.3.0' + description = 'This component copies one or multiple files to the same destination directory, creating the output directory if it doesn\'t exist.' + author = 'Dorien Roosen' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + + diff --git a/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow_schema.json b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow_schema.json new file mode 100644 index 00000000..7ea5f606 --- /dev/null +++ b/target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/nextflow_schema.json @@ -0,0 +1,58 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "move_files_to_directory", + "description": "This component copies one or multiple files to the same destination directory, creating the output directory if it doesn't exist.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths of the files that will be copied into the output directory.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Path to output directory", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output\"`, direction: `output`. ", + "default": "$id.$key.output" + }, + "keep_symbolic_links": { + "type": "boolean", + "description": "Preserve symbolic links.", + "help_text": "Type: `boolean_true`, multiple: `False`, default: `false`. ", + "default": false + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/executable/eset/create_eset/.config.vsh.yaml b/target/executable/eset/create_eset/.config.vsh.yaml new file mode 100644 index 00000000..9bd8161f --- /dev/null +++ b/target/executable/eset/create_eset/.config.vsh.yaml @@ -0,0 +1,259 @@ +name: "create_eset" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "author" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--pDataFile" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fDataFile" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--mappingDir" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--poolName" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + info: null + default: + - "eset.$id.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pData.tsv" +- type: "file" + path: "fData.tsv" +- type: "file" + path: "mapping_dir" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "data.table" + - "nlcv" + bioc: + - "Seurat" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_eset/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/eset/create_eset" + executable: "target/executable/eset/create_eset/create_eset" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/eset/create_eset/_viash.yaml b/target/executable/eset/create_eset/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/eset/create_eset/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/eset/create_eset/create_eset b/target/executable/eset/create_eset/create_eset new file mode 100755 index 00000000..1d88ab67 --- /dev/null +++ b/target/executable/eset/create_eset/create_eset @@ -0,0 +1,1646 @@ +#!/usr/bin/env bash + +# create_eset updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Marijke Van Moerbeke (author) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="create_eset" +VIASH_META_FUNCTIONALITY_NAME="create_eset" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:24.04 +ENTRYPOINT [] +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("Seurat", quietly = TRUE)) BiocManager::install("Seurat")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("data.table", "nlcv"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component eset create_eset" +LABEL org.opencontainers.image.created="2025-09-02T13:33:11Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "create_eset updatecraftbox" + echo "" + echo "Arguments:" + echo " --pDataFile" + echo " type: file, required parameter, file must exist" + echo "" + echo " --fDataFile" + echo " type: file, required parameter, file must exist" + echo "" + echo " --mappingDir" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --poolName" + echo " type: string, required parameter" + echo "" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " default: eset.\$id.rds" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "create_eset updatecraftbox" + exit + ;; + --pDataFile) + [ -n "$VIASH_PAR_PDATAFILE" ] && ViashError Bad arguments for option \'--pDataFile\': \'$VIASH_PAR_PDATAFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PDATAFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --pDataFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --pDataFile=*) + [ -n "$VIASH_PAR_PDATAFILE" ] && ViashError Bad arguments for option \'--pDataFile=*\': \'$VIASH_PAR_PDATAFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PDATAFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --fDataFile) + [ -n "$VIASH_PAR_FDATAFILE" ] && ViashError Bad arguments for option \'--fDataFile\': \'$VIASH_PAR_FDATAFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FDATAFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fDataFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fDataFile=*) + [ -n "$VIASH_PAR_FDATAFILE" ] && ViashError Bad arguments for option \'--fDataFile=*\': \'$VIASH_PAR_FDATAFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_FDATAFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --mappingDir) + if [ -z "$VIASH_PAR_MAPPINGDIR" ]; then + VIASH_PAR_MAPPINGDIR="$2" + else + VIASH_PAR_MAPPINGDIR="$VIASH_PAR_MAPPINGDIR;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --mappingDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --mappingDir=*) + if [ -z "$VIASH_PAR_MAPPINGDIR" ]; then + VIASH_PAR_MAPPINGDIR=$(ViashRemoveFlags "$1") + else + VIASH_PAR_MAPPINGDIR="$VIASH_PAR_MAPPINGDIR;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --poolName) + [ -n "$VIASH_PAR_POOLNAME" ] && ViashError Bad arguments for option \'--poolName\': \'$VIASH_PAR_POOLNAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POOLNAME="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --poolName. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --poolName=*) + [ -n "$VIASH_PAR_POOLNAME" ] && ViashError Bad arguments for option \'--poolName=*\': \'$VIASH_PAR_POOLNAME\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_POOLNAME=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/eset/create_eset:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_PDATAFILE+x} ]; then + ViashError '--pDataFile' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_FDATAFILE+x} ]; then + ViashError '--fDataFile' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_MAPPINGDIR+x} ]; then + ViashError '--mappingDir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_POOLNAME+x} ]; then + ViashError '--poolName' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_PDATAFILE" ] && [ ! -e "$VIASH_PAR_PDATAFILE" ]; then + ViashError "Input file '$VIASH_PAR_PDATAFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_FDATAFILE" ] && [ ! -e "$VIASH_PAR_FDATAFILE" ]; then + ViashError "Input file '$VIASH_PAR_FDATAFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_MAPPINGDIR" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_MAPPINGDIR; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_PDATAFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PDATAFILE")" ) + VIASH_PAR_PDATAFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_PDATAFILE") +fi +if [ ! -z "$VIASH_PAR_FDATAFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_FDATAFILE")" ) + VIASH_PAR_FDATAFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_FDATAFILE") +fi +if [ ! -z "$VIASH_PAR_MAPPINGDIR" ]; then + VIASH_TEST_MAPPINGDIR=() + IFS=';' + for var in $VIASH_PAR_MAPPINGDIR; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_MAPPINGDIR+=( "$var" ) + done + VIASH_PAR_MAPPINGDIR=$(IFS=';' ; echo "${VIASH_TEST_MAPPINGDIR[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-create_eset-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +library(Biobase) +library(data.table) +library(nlcv) +library(Matrix) +library(Seurat) + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "pDataFile" = $( if [ ! -z ${VIASH_PAR_PDATAFILE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PDATAFILE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "fDataFile" = $( if [ ! -z ${VIASH_PAR_FDATAFILE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_FDATAFILE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "mappingDir" = $( if [ ! -z ${VIASH_PAR_MAPPINGDIR+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_MAPPINGDIR" | sed "s#['\\]#\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "poolName" = $( if [ ! -z ${VIASH_PAR_POOLNAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_POOLNAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + + +Read10X <- function(data_dir = NULL, gene_column = 2, unique_features = TRUE) { + full.data <- list() + for (i in seq_along(along.with = data_dir)) { + run <- data_dir[i] + if (!dir.exists(paths = run)) { + stop("Directory provided does not exist") + } + barcode.loc <- file.path(run, "barcodes.tsv") + gene.loc <- file.path(run, "features.tsv") + features.loc <- file.path(run, "features.tsv.gz") + matrix.loc <- file.path(run, "matrix.mtx") + pre_ver_3 <- file.exists(gene.loc) + if (!pre_ver_3) { + addgz <- function(s) { + return(paste0(s, ".gz")) + } + barcode.loc <- addgz(s = barcode.loc) + matrix.loc <- addgz(s = matrix.loc) + } + if (!file.exists(barcode.loc)) { + stop("Barcode file missing") + } + if (!pre_ver_3 && !file.exists(features.loc)) { + stop("Gene name or features file missing") + } + if (!file.exists(matrix.loc)) { + stop("Expression matrix file missing") + } + data <- readMM(file = matrix.loc) + cell.names <- readLines(barcode.loc) + if (all(grepl(pattern = "\\\\-1\$", x = cell.names))) { + cell.names <- as.vector(x = as.character(x = sapply(X = cell.names, + FUN = ExtractField, field = 1, delim = "-"))) + } + if (is.null(x = names(x = data_dir))) { + if (i < 2) { + colnames(x = data) <- cell.names + } + else { + colnames(x = data) <- paste0(i, "_", cell.names) + } + } + else { + colnames(x = data) <- paste0(names(x = data_dir)[i], + "_", cell.names) + } + feature.names <- read.delim(file = ifelse(test = pre_ver_3, + yes = gene.loc, no = features.loc), header = FALSE, + stringsAsFactors = FALSE) + if (any(is.na(x = feature.names[, gene_column]))) { + warning("Some features names are NA. Replacing NA names with ID from the opposite column requested", + call. = FALSE, immediate. = TRUE) + na.features <- which(x = is.na(x = feature.names[, + gene_column])) + replacement.column <- ifelse(test = gene_column == + 2, yes = 1, no = 2) + feature.names[na.features, gene_column] <- feature.names[na.features, + replacement.column] + } + if (unique_features) { + fcols = ncol(x = feature.names) + if (fcols < gene_column) { + stop(paste0("gene_column was set to ", gene_column, + " but feature.tsv.gz (or genes.tsv) only has ", + fcols, " columns.", " Try setting the gene_column ", + "argument to a value <= to ", + fcols, ".")) + } + rownames(x = data) <- make.unique(names = feature.names[, + gene_column]) + } + if (ncol(x = feature.names) > 2) { + data_types <- factor(x = feature.names\$V3) + lvls <- levels(x = data_types) + if (length(x = lvls) > 1 && length(x = full.data) == 0) { + message(paste0("10X data contains more than one type and is ", + "being returned as a list containing matrices ", + "of each type.")) + } + expr_name <- "Gene Expression" + if (expr_name %in% lvls) { + lvls <- c(expr_name, lvls[-which(x = lvls == + expr_name)]) + } + data <- lapply(X = lvls, FUN = function(l) { + return(data[data_types == l, , drop = FALSE]) + }) + names(x = data) <- lvls + } else { + data <- list(data) + } + full.data[[length(x = full.data) + 1]] <- data + } + list_of_data <- list() + for (j in 1:length(x = full.data[[1]])) { + list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, + FUN = \`[[\`, j)) + list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix") + } + names(x = list_of_data) <- names(x = full.data[[1]]) + if (length(x = list_of_data) == 1) { + return(list_of_data[[1]]) + } else { + return(list_of_data) + } +} + +match_features <- function(exprs_matrix, fdata) { + + identical_features <- all(rownames(exprs_matrix) == rownames(fdata)) + + if (nrow(exprs_matrix) != nrow(fdata) || !identical_features) { + message(paste0("Features in 'fData' and expression matrix differ. ", + "Only matching features are returned.")) + } + + features <- intersect(rownames(exprs_matrix), rownames(fdata)) + exprs_matrix <- exprs_matrix[which(rownames(exprs_matrix) %in% features), ] + fdata <- fdata[which(rownames(fdata) %in% features), ] + + fdata[, seq_len(ncol(fdata))] <- lapply(fdata[, seq_len(ncol(fdata)), drop = FALSE], as.character) + # order features in exprs mat according to fdata + exprs_matrix <- exprs_matrix[match(rownames(fdata), rownames(exprs_matrix)), ] + + list(exprs_matrix = exprs_matrix, fdata = fdata) + +} + + +create_pdata <- function(sample_file, pool_name, barcodes) { + cols_to_remove <- c("SampleFileName", "Output", "Measure", "Strandedness") + pData <- sample_file[, !colnames(sample_file) %in% cols_to_remove, + drop = FALSE] + rownames(pData) <- lapply(sample_file\$WellBC, + \\(x) paste(pool_name, x, sep = "_")) + # pData[, ] <- lapply(pData, as.factor) + pData\$PoolName <- pool_name + pData <- pData[match(barcodes, pData\$WellBC), ] + return(pData) +} + +check_sample_file <- function(mapping_dir, sample_file){ + + message("Checking sample annotation:") + + requireNamespace("tools") + mapping_dir <- unlist(lapply(mapping_dir, function(x) { + if (!dir.exists(x)) { + stop(sprintf(paste0("Could not find directory ", + "provided in 'mappingDir' argument (%s)."), x)) + } + tools::file_path_as_absolute(x) + })) + + + # additional check for STARsolo + check_STARsolo_output <- function(x) { + files <- c("barcodes.tsv", "features.tsv", "matrix.mtx") + test <- list.files(x) %in% c(files, paste0(files, ".gz")) + length(test) != 0 && all(test) + } + + + if (!"WellBC" %in% colnames(sample_file)) { + stop(paste0("STARsolo output is used. The sample annotation must ", + "contain 'WellBC' column providing cell barcodes.")) + } + + mapping_dir <- unique(mapping_dir) + all_STARsolo_files_present <- all( + unlist( + lapply(mapping_dir, function(x) { + check_STARsolo_output(x) + }) + ) + ) + if (!all_STARsolo_files_present) { + stop(paste0("Could not find files: 'barcodes', 'features' and 'matrix'", + " for STARsolo output. Please check 'mappingDir' argument.")) + } + + message("- 'SampleFileName' column - OK") + + + + list(sample_expression_files = mapping_dir) +} + +create_exprs_matrix <- function(exprs_matrix_path, exprs_file_paths, + output, measure, col_names, cell_barcodes) { + + read_matrix <- Read10X(data_dir = exprs_file_paths, gene_column = 1) + # keep index of feature names containing "_" because Seurat + #changes them to "-" and they no longer match with fdata[, "gene_id"] + idx <- grep("_", rownames(read_matrix)) + + requireNamespace("Seurat") + seurat_object <- Seurat::CreateSeuratObject(counts = read_matrix) + + exprs_matrix <- as.matrix(seurat_object[['RNA']]\$counts) + # replace "-" with "_" for features with "_" + # before converting to Seurat object + rownames(exprs_matrix)[idx] <- gsub("-", "_", rownames(exprs_matrix)[idx]) + requireNamespace("stringr") + exprs_matrix <- exprs_matrix[, stringr::str_detect(colnames(exprs_matrix), + paste(cell_barcodes, collapse = "|"))] + + + # check if rownames are ENSEMBL and remove version suffix + isENSEMBL <- all(grepl("ENS", rownames(exprs_matrix))) + if (isENSEMBL) { + # do not use gsub("(.+)[.]\\\\d+", "\\\\1", rownames(exprs_matrix)), + # so that ENS000000.1_PAR_Y can be kept + rownames(exprs_matrix) <- gsub("\\\\.\\\\d+\$", "", rownames(exprs_matrix)) + } + + + colnames(exprs_matrix) <- col_names + + exprs_matrix +} + +create_eset <- function(feature_annotation_path, + sample_annotation_path, + mapping_dir, + barcodes, + output_path, + pool_name, + exprs_matrix_path = NULL, + path = NULL, + add_eset_annotation = NULL) { + if (!file.exists(feature_annotation_path)) { + stop("Could not find feature annotation at '", feature_annotation_path, "'") + } + + if (!file.exists(sample_annotation_path)) { + stop("Could not find sample annotation at '", sample_annotation_path, "'") + } + + if(!is.null(exprs_matrix_path)) { + if(!file.exists(exprs_matrix_path)) { + stop("Could not find expression matrix at '", exprs_matrix_path, "'") + } + } + + if(!is.null(path)) { + if(!dir.exists(path)) { + stop("Provided 'path': '", path, "' does not exist.") + } + } + + ##### Import annotation files ##### + message("Importing feature annotation") + fdata_file <- read.table(feature_annotation_path, header = TRUE, + sep = "\\t", quote = "\\"", + comment.char = "", stringsAsFactors = FALSE) + + # for backwards compatibility + if("ENSEMBL" %in% colnames(fdata_file) && !all(grepl("ENS", fdata_file[, "ENSEMBL"])) & !"gene_id" %in% colnames(fdata_file)) { + colnames(fdata_file)[which(colnames(fdata_file) == "ENSEMBL")] <- "gene_id" + } + + # Check gene annotation + if(!"gene_id" %in% colnames(fdata_file)) + stop("'gene_id' column with unique feature identifiers must be present in 'feature_annotation_path'.") + + # check if duplicated ids are present + if(any(duplicated(fdata_file\$gene_id))) + stop("Duplicated features ids are not allowed. Please check the 'gene_id' column in 'feature_annotation_path'.") + + message("Importing sample annotation") + sample_file <- read.table(sample_annotation_path, header = TRUE, + sep = "\\t", quote = "\\"", + comment.char = "", stringsAsFactors = FALSE) + # Check sample annotation + check_sample_file_list <- check_sample_file(mapping_dir = mapping_dir, + sample_file = sample_file) + output <- "STARsolo" + measure <- "counts" + sample_expression_files <- check_sample_file_list\$sample_expression_files + + ##### Create phenodata ##### + pdata_eset <- create_pdata(sample_file = sample_file, pool_name = pool_name, + barcodes = barcodes) + + ##### Create expression matrix ##### + message("Creating expression matrix") + + exprs_matrix_eset <- create_exprs_matrix( + exprs_matrix_path = exprs_matrix_path, + exprs_file_paths = sample_expression_files, + output = output, + measure = measure, + col_names = rownames(pdata_eset), + cell_barcodes = barcodes + ) + + + ##### Create featuredata ##### + message("Creating feature data") + + fdata_eset <- fdata_file + rownames(fdata_eset) <- fdata_eset[, "gene_id"] + + # intersect features between exprs matrix and fdata + feature_files <- match_features(exprs_matrix = exprs_matrix_eset, + fdata = fdata_eset) + + fdata_eset <- feature_files\$fdata + exprs_matrix_eset <- feature_files\$exprs_matrix + + ##### Create eSet ##### + message("Creating eset") + + if (nrow(pdata_eset) != ncol(exprs_matrix_eset)) { + stop("nrow(pData) and ncol(exprsMatrix) differ") + } + + if (nrow(fdata_eset) != nrow(exprs_matrix_eset)) { + stop("nrow(fData) and nrow(exprsMatrix) differ") + } + + if (!all(rownames(pdata_eset) == colnames(exprs_matrix_eset))) { + stop("rownames(pData) and colnames(exprsMatrix) differ") + } + + if (!all(rownames(fdata_eset) == rownames(exprs_matrix_eset))) { + stop("rownames(fData) and rownames(exprsMatrix) differ") + } + + if (!inherits(exprs_matrix_eset, "matrix")) { + stop("exprsMatrix must be of class 'matrix'") + } + + + + additional_info <- paste0("Additional information about eSet \\n", + " Expression matrix created from ", + output, " output. \\n", + " Expression matrix contains non-transformed ", + ifelse(output %in% c("STAR", "STARsolo"), + "counts", + ifelse(measure == "expected_count", + "counts", measure)), ".") + + + if (isTRUE(!is.null(add_eset_annotation) & + is.character(add_eset_annotation))) { + additional_info <- paste0(additional_info, "\\n", " ", add_eset_annotation) + } + + fdata_eset <- new("AnnotatedDataFrame", data = fdata_eset) + pdata_eset <- new("AnnotatedDataFrame", data = pdata_eset) + + requireNamespace("Biobase") + eset <- Biobase::ExpressionSet(assayData = exprs_matrix_eset, + phenoData = pdata_eset, + featureData = fdata_eset, + annotation = additional_info) + + eset <- eset[, colSums(exprs(eset)) != 0] + saveRDS(eset, file = output_path) + + message(paste0("eset created succesfully for ", ncol(eset), + " samples and ", nrow(eset), + " genes and saved at ", output_path, ".")) + + eset +} + + +p_data_file <- par\$pDataFile +f_data_file <- par\$fDataFile +pool_name <- par\$poolName +mapping_dir <- lapply(par\$mappingDir, + \\(x) file.path(x, "Solo.out", "Gene", "raw")) + +get_barcode_from_mapping_dir <- function(raw_dir) { + barcodes_file <- file.path(raw_dir, "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + raw_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", raw_dir)) + } + return(barcodes) +} + +barcodes <- lapply(mapping_dir, get_barcode_from_mapping_dir) + +print(paste0("mappingDir: ", mapping_dir)) +print(paste0("pDataFile: ", p_data_file)) +print(paste0("fDataFile: ", f_data_file)) +print(paste0("poolName: ", pool_name)) +print(paste0("barcodes: ", barcodes)) + + + +# CREATE ESET WITH RAW UMI COUNTS + +eset <- create_eset(feature_annotation_path = f_data_file, + sample_annotation_path = p_data_file, + mapping_dir = mapping_dir, + barcodes = barcodes, + output_path = par\$output, + pool_name = pool_name, + path = NULL, + exprs_matrix_path = NULL) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_PDATAFILE" ]; then + VIASH_PAR_PDATAFILE=$(ViashDockerStripAutomount "$VIASH_PAR_PDATAFILE") + fi + if [ ! -z "$VIASH_PAR_FDATAFILE" ]; then + VIASH_PAR_FDATAFILE=$(ViashDockerStripAutomount "$VIASH_PAR_FDATAFILE") + fi + if [ ! -z "$VIASH_PAR_MAPPINGDIR" ]; then + unset VIASH_TEST_MAPPINGDIR + IFS=';' + for var in $VIASH_PAR_MAPPINGDIR; do + unset IFS + if [ -z "$VIASH_TEST_MAPPINGDIR" ]; then + VIASH_TEST_MAPPINGDIR="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_MAPPINGDIR="$VIASH_TEST_MAPPINGDIR;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_MAPPINGDIR="$VIASH_TEST_MAPPINGDIR" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/eset/create_eset/nextflow_labels.config b/target/executable/eset/create_eset/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/eset/create_eset/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/eset/create_fdata/.config.vsh.yaml b/target/executable/eset/create_fdata/.config.vsh.yaml new file mode 100644 index 00000000..b2d4a1d6 --- /dev/null +++ b/target/executable/eset/create_fdata/.config.vsh.yaml @@ -0,0 +1,236 @@ +name: "create_fdata" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--gtf" + description: "Genome annotation file in GTF format." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Tab-delimited text file containing information about the 'gene'\ + \ or 'transcript'\nentries from the input GTF file. The 'transcript' entries\ + \ are used in case the source\nof the GTF was 'refGene' or 'ncbiRefSeq'. \n" + info: null + default: + - "fData.$id.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "create_fdata.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a fdata file\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test_annotation.gtf" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_fdata/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/eset/create_fdata" + executable: "target/executable/eset/create_fdata/create_fdata" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/eset/create_fdata/_viash.yaml b/target/executable/eset/create_fdata/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/eset/create_fdata/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/eset/create_fdata/create_fdata b/target/executable/eset/create_fdata/create_fdata new file mode 100755 index 00000000..b9dc0e6e --- /dev/null +++ b/target/executable/eset/create_fdata/create_fdata @@ -0,0 +1,1245 @@ +#!/usr/bin/env bash + +# create_fdata updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Marijke Van Moerbeke (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="create_fdata" +VIASH_META_FUNCTIONALITY_NAME="create_fdata" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pandas" + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component eset create_fdata" +LABEL org.opencontainers.image.created="2025-09-02T13:33:09Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "create_fdata updatecraftbox" + echo "" + echo "Create a fdata file" + echo "" + echo "Arguments:" + echo " --gtf" + echo " type: file, required parameter, file must exist" + echo " Genome annotation file in GTF format." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " default: fData.\$id.txt" + echo " Tab-delimited text file containing information about the 'gene' or" + echo " 'transcript'" + echo " entries from the input GTF file. The 'transcript' entries are used in" + echo " case the source" + echo " of the GTF was 'refGene' or 'ncbiRefSeq'." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "create_fdata updatecraftbox" + exit + ;; + --gtf) + [ -n "$VIASH_PAR_GTF" ] && ViashError Bad arguments for option \'--gtf\': \'$VIASH_PAR_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GTF="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gtf. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gtf=*) + [ -n "$VIASH_PAR_GTF" ] && ViashError Bad arguments for option \'--gtf=*\': \'$VIASH_PAR_GTF\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GTF=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/eset/create_fdata:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_GTF+x} ]; then + ViashError '--gtf' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="fData.\$id.txt" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_GTF" ] && [ ! -e "$VIASH_PAR_GTF" ]; then + ViashError "Input file '$VIASH_PAR_GTF' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_GTF" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GTF")" ) + VIASH_PAR_GTF=$(ViashDockerAutodetectMount "$VIASH_PAR_GTF") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-create_fdata-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import logging +import pandas as pd +import numpy as np +from textwrap import fill + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'gtf': $( if [ ! -z ${VIASH_PAR_GTF+x} ]; then echo "r'${VIASH_PAR_GTF//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def read_gtf(gtf_path: str) -> pd.DataFrame: + logger.info("Reading %s", gtf_path) + result = pd.read_csv(gtf_path, sep="\\t", + header=None, names=("seqname", "source", + "feature", "start", "end", + "score", "strand", "frame", + "attribute"), + dtype={ + "seqname": pd.StringDtype(), + "source": pd.StringDtype(), + "feature": pd.StringDtype(), + "start": pd.Int64Dtype(), + "end": pd.Int64Dtype(), + "score": pd.StringDtype(), + "strand": pd.CategoricalDtype(categories=["+", "-"], + ordered=False), + "frame": pd.StringDtype(), + "attribute": pd.StringDtype(), + }, + comment='#' + ) + logger.info("Done reading %s. Found %d GTF entries ", par["gtf"], result.shape[0]) + logger.info("GTF file is providing information for the following chromosomes: \\n%s", + fill(", ".join(result['seqname'].unique()), width=100)) + logger.info("The following sources were specified in the GTF file:\\n%s", + ", ".join(result["source"].unique())) + return result + + +def parse_attributes(attributes_series: pd.Series): + attribute_dict = dict() + attributes_list = [attr.strip().split(" ") + for attr in attributes_series["attribute"].strip(";").split(";")] + for (attr_name, attr_value) in attributes_list: + attribute_dict.setdefault(attr_name, []).append(attr_value.strip('"')) + attribute_dict = {attr_name: "|".join(attr_value) + for attr_name, attr_value in attribute_dict.items()} + return pd.Series(attribute_dict) + + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\\t{param}: {param_val}\\n' for param, param_val in par.items()] + logger.info("Parameters:\\n%s", "".join(parameters_str).rstrip()) + gtf_file = read_gtf(par["gtf"]) + sources = set(source for source in gtf_file["source"].unique() if source != "ERCC") + specific_gtf = False + feature = "gene" + if len(sources) == 1 and (source := sources[0]) \\ + and (source == "refGene" or source == "ncbiRefSeq"): + feature = "transcript" + specific_gtf = True + logger.info("Found specific GTF from %s, forcing filtering on feature type %s", source, feature) + logger.info("Filtering GTF entries for feature type '%s'.", feature) + gtf_file = gtf_file[gtf_file["feature"] == feature] + logger.info("After filtering %d entries are left.", gtf_file.shape[0]) + logger.info("Parsing the GTF attributes") + annotation = gtf_file[["attribute"]].apply(parse_attributes, result_type="expand", axis=1) + logger.info("Found the following attributes in the GTF:\\n%s", ", ".join(annotation.columns)) + annotation = pd.concat([gtf_file.drop(["attribute"], axis=1), annotation], axis=1) + if specific_gtf: + logger.info("Because the source of the GTF is either 'ncbiRefSeq' or 'refGene', which" + "caused forced filtering based on %s, the duplicate genes still need to be dropped.", + feature) + annotation = annotation.drop_duplicates(subset=("gene_id", "gene_name"), keep=False) + logger.info("After dropping duplicates, %d entries are left", annotation.shape[0]) + + # detect ensembl ids + # some GTF files contain version in ENSEMBL, e.g. ENS00000000046319.1 + # we remove the version, because the annotation packages don't contain the version + if "gene_id" in annotation.columns: + logger.info("'gene_id' column was detected in attributes. Performing extra parsing of ENSEMBL ids.") + annotation["ENSEMBL_with_version"] = annotation["gene_id"].where(annotation["gene_id"].str.startswith("ENS")) + annotation["ENSEMBL"] = annotation["ENSEMBL_with_version"].str.replace(r"\\.\\d+\$", "", regex=True) + annotation["gene_id"] = annotation["gene_id"].str.replace(r"\\.\\d+\$", "", regex=True) + + possible_name_columns = ("Name", "name", "gene_name") + found_columns = list(filter(lambda col_name: col_name in annotation, possible_name_columns)) + # The following code allows to select a value for the SYMBOL column based on the first non-na column + if found_columns: + logger.info("Found one the following columns: %s; which can be used to populate the SYMBOL column", + ", ".join(possible_name_columns)) + # For each row (gtf entry), get the name of the first column that actually holds a value. + column_to_get = annotation.loc[:,found_columns].apply(pd.Series.first_valid_index, axis=1) + counts_per_column = column_to_get.value_counts(dropna=False).to_dict() + counts_per_column_str = [f'\\t{col}: {counts}\\n' for col, counts in counts_per_column.items()] + logger.info("Frequencies of the origin for the entries in the SYMBOL column:\\n%s", + "".join(counts_per_column_str).rstrip()) + # If all columns hold NA for a certain row, first_valid_index will return None. + # Just use the name of the first column. + column_to_get = column_to_get.fillna(found_columns[0]) + # We now have a list one column name per row, use it so select the values + # Loc cannot be used here because 1 value per row is required, + # and loc will select for each row all the columns in columns_to_get + idx, cols = pd.factorize(column_to_get) + symbol_values = annotation.reindex(cols, axis=1).to_numpy()[np.arange(len(annotation)), idx] + annotation["SYMBOL"] = symbol_values + logger.info("Dropping unused columns") + annotation = annotation.drop(["score", "source", "frame", "feature"], axis=1) + logger.info("Looking for duplicate rows and removing them. Starting with %i entries", annotation.shape[0]) + annotation = annotation.drop_duplicates(keep="first", ignore_index=True) + logger.info("After removing duplicates: %i entries", annotation.shape[0]) + logger.info("Writing to %s", par["output"]) + annotation.to_csv(par["output"], sep="\\t", header=True, index=False, na_rep="NA") + # Do these checks *after* writing the csv in order to be able to check the data + logger.info("Checking for unique gene IDs") + if not annotation["gene_id"].is_unique: + raise ValueError("Values from the 'gene_id' column are not unique after processing!") + logger.info("%s finished", meta['name']) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_GTF" ]; then + VIASH_PAR_GTF=$(ViashDockerStripAutomount "$VIASH_PAR_GTF") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/eset/create_fdata/nextflow_labels.config b/target/executable/eset/create_fdata/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/eset/create_fdata/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/eset/create_pdata/.config.vsh.yaml b/target/executable/eset/create_pdata/.config.vsh.yaml new file mode 100644 index 00000000..be349a9b --- /dev/null +++ b/target/executable/eset/create_pdata/.config.vsh.yaml @@ -0,0 +1,250 @@ +name: "create_pdata" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--star_stats_file" + description: "Tab-delimited text file containing statistics (per column) that\ + \ were generated\nfrom the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).\n\ + Each entry (row) in the file describes the values for one well (barcode).\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChromPool" + description: "Pivot table in tsv format of the combined nrReadsNrGenesPerChrom\ + \ files from STAR. \nDescribes per chromosome (as columns) the number of reads,\ + \ as well as the total number \nof reads per cell barcode and the percentage\ + \ of nuclear, ERCC and mitochondrial\nreads.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + info: null + default: + - "pData.$id.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "create_pdata.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a pdata file by combining the mapping statistics \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "nrReadsNrGenesPerChromPool.txt" +- type: "file" + path: "starLogs.txt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_pdata/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/eset/create_pdata" + executable: "target/executable/eset/create_pdata/create_pdata" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/eset/create_pdata/_viash.yaml b/target/executable/eset/create_pdata/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/eset/create_pdata/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/eset/create_pdata/create_pdata b/target/executable/eset/create_pdata/create_pdata new file mode 100755 index 00000000..a9a167db --- /dev/null +++ b/target/executable/eset/create_pdata/create_pdata @@ -0,0 +1,1207 @@ +#!/usr/bin/env bash + +# create_pdata updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Marijke Van Moerbeke (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="create_pdata" +VIASH_META_FUNCTIONALITY_NAME="create_pdata" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pandas" + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component eset create_pdata" +LABEL org.opencontainers.image.created="2025-09-02T13:33:11Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "create_pdata updatecraftbox" + echo "" + echo "Create a pdata file by combining the mapping statistics" + echo "" + echo "Arguments:" + echo " --star_stats_file" + echo " type: file, required parameter, file must exist" + echo " Tab-delimited text file containing statistics (per column) that were" + echo " generated" + echo " from the STAR log files (Log.final.out, Summary.csv," + echo " ReadsPerGene.out.tab)." + echo " Each entry (row) in the file describes the values for one well" + echo " (barcode)." + echo "" + echo " --nrReadsNrGenesPerChromPool" + echo " type: file, required parameter, file must exist" + echo " Pivot table in tsv format of the combined nrReadsNrGenesPerChrom files" + echo " from STAR." + echo " Describes per chromosome (as columns) the number of reads, as well as" + echo " the total number" + echo " of reads per cell barcode and the percentage of nuclear, ERCC and" + echo " mitochondrial" + echo " reads." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " default: pData.\$id.txt" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "create_pdata updatecraftbox" + exit + ;; + --star_stats_file) + [ -n "$VIASH_PAR_STAR_STATS_FILE" ] && ViashError Bad arguments for option \'--star_stats_file\': \'$VIASH_PAR_STAR_STATS_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_STATS_FILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_stats_file. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_stats_file=*) + [ -n "$VIASH_PAR_STAR_STATS_FILE" ] && ViashError Bad arguments for option \'--star_stats_file=*\': \'$VIASH_PAR_STAR_STATS_FILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_STATS_FILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nrReadsNrGenesPerChromPool) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChromPool\': \'$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChromPool. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChromPool=*) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChromPool=*\': \'$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/eset/create_pdata:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_STAR_STATS_FILE+x} ]; then + ViashError '--star_stats_file' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL+x} ]; then + ViashError '--nrReadsNrGenesPerChromPool' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="pData.\$id.txt" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_STAR_STATS_FILE" ] && [ ! -e "$VIASH_PAR_STAR_STATS_FILE" ]; then + ViashError "Input file '$VIASH_PAR_STAR_STATS_FILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && [ ! -e "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + ViashError "Input file '$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_STAR_STATS_FILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_STAR_STATS_FILE")" ) + VIASH_PAR_STAR_STATS_FILE=$(ViashDockerAutodetectMount "$VIASH_PAR_STAR_STATS_FILE") +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL")" ) + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashDockerAutodetectMount "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-create_pdata-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +from itertools import batched +import pandas as pd +import logging + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'star_stats_file': $( if [ ! -z ${VIASH_PAR_STAR_STATS_FILE+x} ]; then echo "r'${VIASH_PAR_STAR_STATS_FILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'nrReadsNrGenesPerChromPool': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\\t{param}: {param_val}\\n' for param, param_val in par.items()] + logger.info("Parameters:\\n%s", "".join(parameters_str).rstrip()) + logger.info("Reading %s", par["star_stats_file"]) + star_log_stats = pd.read_csv(par["star_stats_file"], sep="\\t", index_col=0) + logger.info("STAR log statics file contains information for the following barcodes: %s", + ", ".join(star_log_stats.index)) + logger.info("Reading %s", par["nrReadsNrGenesPerChromPool"]) + reads_and_genes_per_chr_stats = pd.read_csv(par["nrReadsNrGenesPerChromPool"], sep="\\t", index_col=0) + logger.info("Reads per gene and chromosome table contains information for the following barcodes: %s", + ", ".join(reads_and_genes_per_chr_stats.index)) + logger.info("Filtering mapping statistics file columns.") + cols_to_keep = ("WellID", "NumberOfMTReads", "pctMT", "NumberOfERCCReads", + "pctERCC", "NumberOfChromReads", "pctChrom") + try: + reads_and_genes_per_chr_stats = reads_and_genes_per_chr_stats.loc[:,cols_to_keep] + except KeyError as e: + raise KeyError("When trying to subset the reads per genes and chromosomes file, " + "a column was missing. Available columns in the file: " + f"{', '.join(reads_and_genes_per_chr_stats.columns)}.") from e + # Each barcode should be present. An alternative approach could be to just + # do the concatenation and check for NA values that are filled for non-overlapping + # index values, but there are already NA values present in the dataframes + if not star_log_stats.index.sort_values().equals(reads_and_genes_per_chr_stats.index.sort_values()): + raise ValueError("Error while combining two log files. It seems that the entries (barcodes) " + f"do not fully overlap. Barcodes in '{par['star_stats_file']}: " + f"{', '.join(reads_and_genes_per_chr_stats.index)}. Barcodes in " + f"'{par['nrReadsNrGenesPerChromPool']}': " + f"{', '.join(star_log_stats.index)}") + combined_stats = pd.concat([reads_and_genes_per_chr_stats, star_log_stats], axis=1) + logger.info("Summary of final output:\\n%s\\n", + "\\n".join(repr(combined_stats.loc[:,columns].describe()) + for columns in batched(combined_stats.columns, 3))) + logger.info("Writing to %s", par["output"]) + combined_stats.reset_index("WellBC").to_csv(par["output"], sep="\\t", header=True, index=False) + logger.info("Finished %s.", meta["name"]) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_STAR_STATS_FILE" ]; then + VIASH_PAR_STAR_STATS_FILE=$(ViashDockerStripAutomount "$VIASH_PAR_STAR_STATS_FILE") + fi + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashDockerStripAutomount "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/eset/create_pdata/nextflow_labels.config b/target/executable/eset/create_pdata/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/eset/create_pdata/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml b/target/executable/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml new file mode 100644 index 00000000..3ea8b43e --- /dev/null +++ b/target/executable/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml @@ -0,0 +1,208 @@ +name: "check_eset" +namespace: "integration_test_components/htrnaseq" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--eset" + description: "Path to an ExpressionSet object." + info: null + example: + - "eset.rds" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_output" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "This component test the ExpressionSet object as output by the main pipeline." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bioconductor/bioconductor_docker:3.19" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "bit64" + bioc: + - "Biobase" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integration_test_components/htrnaseq/check_eset" + executable: "target/executable/integration_test_components/htrnaseq/check_eset/check_eset" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/integration_test_components/htrnaseq/check_eset/_viash.yaml b/target/executable/integration_test_components/htrnaseq/check_eset/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/integration_test_components/htrnaseq/check_eset/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/integration_test_components/htrnaseq/check_eset/check_eset b/target/executable/integration_test_components/htrnaseq/check_eset/check_eset new file mode 100755 index 00000000..84510b04 --- /dev/null +++ b/target/executable/integration_test_components/htrnaseq/check_eset/check_eset @@ -0,0 +1,1338 @@ +#!/usr/bin/env bash + +# check_eset updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="check_eset" +VIASH_META_FUNCTIONALITY_NAME="check_eset" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM bioconductor/bioconductor_docker:3.19 +ENTRYPOINT [] +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("Biobase", quietly = TRUE)) BiocManager::install("Biobase")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("bit64"), repos = "https://cran.rstudio.com")' + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component integration_test_components/htrnaseq check_eset" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "check_eset updatecraftbox" + echo "" + echo "This component test the ExpressionSet object as output by the main pipeline." + echo "" + echo "Inputs:" + echo " --eset" + echo " type: file, required parameter, file must exist" + echo " example: eset.rds" + echo " Path to an ExpressionSet object." + echo "" + echo " --star_output" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "check_eset updatecraftbox" + exit + ;; + --eset) + [ -n "$VIASH_PAR_ESET" ] && ViashError Bad arguments for option \'--eset\': \'$VIASH_PAR_ESET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESET="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --eset. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --eset=*) + [ -n "$VIASH_PAR_ESET" ] && ViashError Bad arguments for option \'--eset=*\': \'$VIASH_PAR_ESET\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESET=$(ViashRemoveFlags "$1") + shift 1 + ;; + --star_output) + if [ -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_PAR_STAR_OUTPUT="$2" + else + VIASH_PAR_STAR_OUTPUT="$VIASH_PAR_STAR_OUTPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_output=*) + if [ -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_PAR_STAR_OUTPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_STAR_OUTPUT="$VIASH_PAR_STAR_OUTPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/integration_test_components/htrnaseq/check_eset:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_ESET+x} ]; then + ViashError '--eset' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then + ViashError '--star_output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_ESET" ] && [ ! -e "$VIASH_PAR_ESET" ]; then + ViashError "Input file '$VIASH_PAR_ESET' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_ESET" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ESET")" ) + VIASH_PAR_ESET=$(ViashDockerAutodetectMount "$VIASH_PAR_ESET") +fi +if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_TEST_STAR_OUTPUT=() + IFS=';' + for var in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_STAR_OUTPUT+=( "$var" ) + done + VIASH_PAR_STAR_OUTPUT=$(IFS=';' ; echo "${VIASH_TEST_STAR_OUTPUT[*]}") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-check_eset-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "eset" = $( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ESET" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "star_output" = $( if [ ! -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_STAR_OUTPUT" | sed "s#['\\]#\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END +library(Biobase) +library(testthat) +library(Matrix) + +sample_1_result <- readRDS(par\$eset) +expected_sample_names <- c( + "sample_one_AACAAGGTAC", "sample_one_AACAATCAGG", "sample_one_AACACCTAGT", + "sample_one_AACAGGCAAT", "sample_one_AACATGGAGA", "sample_one_AACATTACCG", + "sample_one_AACCAGCCAG", "sample_one_AACCAGTTGA", "sample_one_AACCGCGACT", + "sample_one_AACCGGAAGG", "sample_one_AACCGGCGTA", "sample_one_AACCTAGTCC", + "sample_one_AACCTCATAG", "sample_one_AACGTAAGCT", "sample_one_AACTCTACAC", + "sample_one_AACTGTGTCA", "sample_one_AAGACGGATT", "sample_one_AAGATCGGCG", + "sample_one_AAGATGTCCA", "sample_one_AAGCATATGG", "sample_one_AAGCGATGTT", + "sample_one_AAGCGTTCAG", "sample_one_AAGCTCACCT", "sample_one_AAGGCATGCG", + "sample_one_AAGGTCTGGA", "sample_one_AAGTTAGCGC", "sample_one_AAGTTCCTTG", + "sample_one_AATACCGGTA", "sample_one_AATAGCCACA", "sample_one_AATCACGCGA", + "sample_one_AATCCATCTG", "sample_one_AATCCGCTCC", "sample_one_AATCCTACCA", + "sample_one_AATCGTCCGC", "sample_one_AATGAACACG", "sample_one_AATGACCTTC", + "sample_one_AATGAGAGCA", "sample_one_AATGTCAGTG", "sample_one_AATTAGGCCG", + "sample_one_AATTGCGATG", "sample_one_ACAACAGTCG", "sample_one_ACAACCATAC", + "sample_one_ACAACGGAGC", "sample_one_ACAAGCGCGA", "sample_one_ACACAATCTC", + "sample_one_ACACAGTGAA", "sample_one_ACACCGAATT", "sample_one_ACACGCAGTA", + "sample_one_ACACGGTCCT", "sample_one_ACACTTGCTG", "sample_one_ACAGTGCCAA", + "sample_one_ACATGTGTGC", "sample_one_ACCAGGACCA", "sample_one_ACCATAACAC", + "sample_one_ACCGAACCGT", "sample_one_ACCGAGAGTC", "sample_one_ACCGGTACAG", + "sample_one_ACCGTACTTC", "sample_one_ACCTCCGACA", "sample_one_ACCTCTCTCC", + "sample_one_ACCTGTCCGA", "sample_one_ACCTTATGTG", "sample_one_ACGAATGACA", + "sample_one_ACGCCTCAAC", "sample_one_ACGCCTTCGT", "sample_one_ACGCTGGATA", + "sample_one_ACGGTCCGTT", "sample_one_ACGTAGGCAC", "sample_one_ACGTGCTGAT", + "sample_one_ACTCCAAGCC", "sample_one_ACTGGCGCAT", "sample_one_ACTGGCTTCC", + "sample_one_ACTTAACTGC", "sample_one_ACTTCATCAC", "sample_one_ACTTCGTTGA", + "sample_one_ACTTCTCCTG", "sample_one_ACTTGAGGAA", "sample_one_ACTTGTAAGG", + "sample_one_AGAACCACGG", "sample_one_AGAAGCAATC", "sample_one_AGACCGTTAT", + "sample_one_AGACTAGCAT", "sample_one_AGAGATGCAG", "sample_one_AGAGCTTACA", + "sample_one_AGAGTGTAAC", "sample_one_AGAGTTCTGC", "sample_one_AGATAGTGCT", + "sample_one_AGCAATGCGC", "sample_one_AGCATGTCAT", "sample_one_AGCCACTAGC", + "sample_one_AGCCAGAATA", "sample_one_AGCCAGCTCT", "sample_one_AGCGATAACG", + "sample_one_AGCGTACAAT", "sample_one_AGCTATTCCA", "sample_one_AGCTCCTCAG", + "sample_one_AGGAGGCATA", "sample_one_AGGCGTCTGT", "sample_one_AGTAACTCAC", + "sample_one_AGTAAGCGTT", "sample_one_AGTCTGTACG", "sample_one_AGTGCAATGT", + "sample_one_ATAAGGTGCA", "sample_one_ATACACGACA", "sample_one_ATAGGCCATT", + "sample_one_ATATCCGCAT", "sample_one_ATCAGCACTT", "sample_one_ATCAGCGAGG", + "sample_one_ATCCAATACG", "sample_one_ATCCGCTGTG", "sample_one_ATCCGTCCAT", + "sample_one_ATCGACGGCT", "sample_one_ATCGCGATTA", "sample_one_ATCGGTAGGC", + "sample_one_ATCTAAGGAG", "sample_one_ATGACGGTAA", "sample_one_ATGACTCAGT", + "sample_one_ATGCACCGGA", "sample_one_ATGCGGACTG", "sample_one_ATGCTTCCTA", + "sample_one_ATGGACCAAC", "sample_one_ATGGTCTTAG", "sample_one_ATGGTGAGCG", + "sample_one_ATGTGGAAGC", "sample_one_ATTATCGGAC", "sample_one_ATTCGGAACA", + "sample_one_CAACAATCCA", "sample_one_CAAGAAGCAT", "sample_one_CAAGATGAGG", + "sample_one_CAAGCCAACG", "sample_one_CAAGTGGATC", "sample_one_CACAGTTCAT", + "sample_one_CACGAGTCTG", "sample_one_CACGCTCCAA", "sample_one_CACTGAGCAC", + "sample_one_CAGATCAATG", "sample_one_CAGTGCTCTT", "sample_one_CAGTTAAGCA", + "sample_one_CATAGCTATC", "sample_one_CATCACCACC", "sample_one_CATGTACGCC", + "sample_one_CATTACACTG", "sample_one_CATTCGACGA", "sample_one_CCAACTATGG", + "sample_one_CCAAGGAGTT", "sample_one_CCAATTGTTC", "sample_one_CCACAAGTGC", + "sample_one_CCAGCTTAGT", "sample_one_CCATAACTTG", "sample_one_CCATACTGAC", + "sample_one_CCATAGATCA", "sample_one_CCATGTGCTT", "sample_one_CCATTCAGCG", + "sample_one_CCGAACAAGC", "sample_one_CCGAACCTAA", "sample_one_CCGAAGACCT", + "sample_one_CCGAATAGTG", "sample_one_CCGACTTCTC", "sample_one_CCGATCCACT", + "sample_one_CCGATGATAC", "sample_one_CCGCGTTATG", "sample_one_CCGCTAGCTT", + "sample_one_CCGGAGTATC", "sample_one_CCGGCCAATT", "sample_one_CCGGTCTCTA", + "sample_one_CCGTACGATG", "sample_one_CCGTCAGAAC", "sample_one_CCTAGACACG", + "sample_one_CCTAGTTGAG", "sample_one_CCTATTCTGT", "sample_one_CCTCAACCGA", + "sample_one_CCTCCATAAG", "sample_one_CCTGATGCCA", "sample_one_CCTGCAATAC", + "sample_one_CCTTGTATTC", "sample_one_CGAGATCTCT", "sample_one_CGAGGAACAA", + "sample_one_CGATAACCGC", "sample_one_CGATCCTGTG", "sample_one_CGCCAACCAT", + "sample_one_CGCCAGTGTT", "sample_one_CGCCTTGTAC", "sample_one_CGCGGATTCA", + "sample_one_CGCTTAAGGC", "sample_one_CGCTTACTAA", "sample_one_CGCTTCTTGG", + "sample_one_CGGAAGCTGT", "sample_one_CGGAATACAC", "sample_one_CGGAGATTGG", + "sample_one_CGGAGCTCAA", "sample_one_CGGATCGGTA", "sample_one_CGGATTCTAG", + "sample_one_CGGCAACTTA", "sample_one_CGGCTCATCA", "sample_one_CGGTCGTATT", + "sample_one_CGGTGACATC", "sample_one_CGTAACGGAT", "sample_one_CGTAAGATTC", + "sample_one_CGTACTGTAA", "sample_one_CGTAGAAGAC", "sample_one_CGTCCTAGGA", + "sample_one_CGTCGGCAAT", "sample_one_CGTGAGTTAT", "sample_one_CGTGTCAAGC", + "sample_one_CTAACTTCAG", "sample_one_CTAATAGCGT", "sample_one_CTACACCAGG", + "sample_one_CTAGCACAAT", "sample_one_CTATGAACGG", "sample_one_CTCAAGGACC", + "sample_one_CTCACCTGTC", "sample_one_CTCCTATTGT", "sample_one_CTCGCAACGT", + "sample_one_CTCGTGCCTA", "sample_one_CTGGATTGAC", "sample_one_CTGTAGTCAG", + "sample_one_CTGTCGCTTC", "sample_one_CTGTCTGTGT", "sample_one_CTTCATATCG", + "sample_one_CTTGCTGACG", "sample_one_GAAGGATTAG", "sample_one_GAATCGAGCC", + "sample_one_GACCATCTAA", "sample_one_GACGACCACA", "sample_one_GAGACATCTT", + "sample_one_GAGCGAGTCA", "sample_one_GAGTAGACCA", "sample_one_GATACGCTTA", + "sample_one_GATAGACTGT", "sample_one_GATAGAGGCG", "sample_one_GATAGGTCAA", + "sample_one_GATATCAGGA", "sample_one_GATCTCATTC", "sample_one_GATCTGGTCG", + "sample_one_GATGAGTGAC", "sample_one_GATGGATACA", "sample_one_GATGTGACAG", + "sample_one_GATTAAGTCC", "sample_one_GATTGCACGC", "sample_one_GCAAGCGAAT", + "sample_one_GCAATGTAAG", "sample_one_GCACACTATA", "sample_one_GCACTCGGAA", + "sample_one_GCACTGCGTT", "sample_one_GCACTTAATC", "sample_one_GCAGGAGATG", + "sample_one_GCAGTACTGG", "sample_one_GCATATGAGT", "sample_one_GCATCCGATC", + "sample_one_GCCAAGTACA", "sample_one_GCCACGATTC", "sample_one_GCCATAGGTT", + "sample_one_GCCATATCGA", "sample_one_GCCGTCAATA", "sample_one_GCCTGGACAT", + "sample_one_GCGTAATTAC", "sample_one_GCTATTATCC", "sample_one_GCTCAGTAAT", + "sample_one_GCTGCTTATA", "sample_one_GGAATAAGCA", "sample_one_GGACGATGCT", + "sample_one_GGCATCGTGA", "sample_one_GGCATTATTG", "sample_one_GGCCGAGATT", + "sample_one_GGCGCTATAA", "sample_one_GGCGTTAAGT", "sample_one_GGCTATTGAT", + "sample_one_GGCTGCTACT", "sample_one_GGTAATGTGT", "sample_one_GGTGGTTGGA", + "sample_one_GGTGTTCACC", "sample_one_GGTTAGATCT", "sample_one_GGTTATGGCG", + "sample_one_GGTTCACTGG", "sample_one_GGTTGTGCAA", "sample_one_GTAACCAGTA", + "sample_one_GTAACCTTGG", "sample_one_GTAAGAACCT", "sample_one_GTAAGGCTCC", + "sample_one_GTAATCCACG", "sample_one_GTATTGTGGA", "sample_one_GTCCGCATCA", + "sample_one_GTCCTTCGGT", "sample_one_GTCGCTCTCT", "sample_one_GTCGGTGACA", + "sample_one_GTCTCGAGTG", "sample_one_GTCTCTTAAG", "sample_one_GTCTTCCGAG", + "sample_one_GTGACTATAC", "sample_one_GTGGTTAATG", "sample_one_GTGTGCCTGT", + "sample_one_GTGTGTGTCC", "sample_one_GTTCATTGCC", "sample_one_GTTCCGGTGA", + "sample_one_GTTCGTCGAA", "sample_one_GTTGAATTGG", "sample_one_GTTGATCCGC", + "sample_one_GTTGTATGCT", "sample_one_TAACCGTAGC", "sample_one_TAACGTCGAT", + "sample_one_TAAGGTACGG", "sample_one_TACGGACATA", "sample_one_TACTACCGCC", + "sample_one_TACTGTCAAG", "sample_one_TAGCGAACGC", "sample_one_TAGCGCCAAC", + "sample_one_TAGGACGCCT", "sample_one_TAGGTTGCAA", "sample_one_TAGTAGTCTC", + "sample_one_TAGTCCGCTG", "sample_one_TAGTGGAACT", "sample_one_TATCATGCAG", + "sample_one_TATCGTTACG", "sample_one_TCAAGTGCAG", "sample_one_TCACAGATAC", + "sample_one_TCACCGCCTA", "sample_one_TCACGCCACT", "sample_one_TCACGTTGGC", + "sample_one_TCATTGTCCA", "sample_one_TCCACACTAG", "sample_one_TCCACGGTCA", + "sample_one_TCCACTCGCT", "sample_one_TCCGACTAAC", "sample_one_TCCGTTATCT", + "sample_one_TCCTAAGAGA", "sample_one_TCCTCTAGTA", "sample_one_TCGAAGCATT", + "sample_one_TCGAGAGAGC", "sample_one_TCGCACTTGA", "sample_one_TCGCCTACTG", + "sample_one_TCGCGTAGCA", "sample_one_TCGGCGTTAA", "sample_one_TCTACATCCG", + "sample_one_TCTCCACATT", "sample_one_TCTCTCCTAT", "sample_one_TCTTGCTCGG", + "sample_one_TGAACTAACC", "sample_one_TGAAGAAGGT", "sample_one_TGAGCGTTCC", + "sample_one_TGAGTACGTA", "sample_one_TGGAATGGAG", "sample_one_TGTCATTCGC", + "sample_one_TGTGCTTCAG", "sample_one_TGTTCAGGAT", "sample_one_TTACACACGT", + "sample_one_TTACTGTGAC", "sample_one_TTATAGGAGG", "sample_one_TTATCGCGTT", + "sample_one_TTATGCCGCG", "sample_one_TTCACGGAAG", "sample_one_TTCAGGAGTA", + "sample_one_TTCCATCGAG", "sample_one_TTCGAGTGAT", "sample_one_TTCTGTACCT", + "sample_one_TTGGCAATTC", "sample_one_TTGGCTCCAC", "sample_one_TTGGTAACAG", + "sample_one_TTGGTCAGTA", "sample_one_TTGTCGGCCA", "sample_one_TTGTGTTCGA" +) +stopifnot(identical(sampleNames(sample_1_result), expected_sample_names)) + +expected_var_labels <- c( + "WellBC", + "WellID", + "NumberOfMTReads", + "pctMT", + "NumberOfERCCReads", + "pctERCC", + "NumberOfChromReads", + "pctChrom", + "NumberOfInputReads", + "NumberOfMappedReads", + "PctMappedReads", + "NumberOfReadsMappedToMultipleLoci", + "PectOfReadsMappedToMultipleLoci", + "NumberOfReadsMappedToTooManyLoci", + "PectOfReadsMappedToTooManyLoci", + "NumberOfReadsUnmappedTooManyMismatches", + "PectOfReadsUnmappedTooManyMismatches", + "NumberOfReadsUnmappedTooShort", + "PectOfReadsUnmappedTooShort", + "NumberOfReadsUnmappedOther", + "PectOfReadsUnmappedOther", + "ReadsWithValidBarcodes", + "SequencingSaturation", + "Q30BasesInCB.UMI", + "ReadsMappedToTranscriptome.Unique.MultipeGenes", + "EstimatedNumberOfCells", + "FractionOfReadsInCells", + "MeanReadsPerCell", + "NumberOfUMIs", + "NumberOfGenes", + "NumberOfCountedReads", + "PoolName" +) +stopifnot(identical(varLabels(sample_1_result), expected_var_labels)) + +read_mm <- function(mapping_dir) { + market_matrix_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "matrix.mtx") + result <- readMM(market_matrix_file) + feature_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "features.tsv") + features <- read.table(feature_file, sep = "\\t", header = FALSE, + col.names = c("ID", "Name", "Type"))\$ID + rownames(result) <- gsub("\\\\.\\\\d+\$", "", features) + barcodes_file <- file.path(mapping_dir, + "Solo.out", "Gene", "raw", "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + mapping_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", mapping_dir)) + } + colnames(result) <- paste0("sample_one_", barcodes) + return(result) +} +expected_matrices <- lapply(par\$star_output, read_mm) +expected_matrix <- as.matrix(do.call(cbind, expected_matrices)) +result_counts <- exprs(sample_1_result) +stopifnot(length(setdiff(colnames(expected_matrix), + colnames(exprs(sample_1_result)))) == 0) +stopifnot(length(setdiff(rownames(expected_matrix), + rownames(exprs(sample_1_result)))) == 0) +expected_matrix_sorted <- expected_matrix[, colnames(exprs(sample_1_result))] +stopifnot(identical(exprs(sample_1_result), expected_matrix_sorted)) +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_ESET" ]; then + VIASH_PAR_ESET=$(ViashDockerStripAutomount "$VIASH_PAR_ESET") + fi + if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + unset VIASH_TEST_STAR_OUTPUT + IFS=';' + for var in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + if [ -z "$VIASH_TEST_STAR_OUTPUT" ]; then + VIASH_TEST_STAR_OUTPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_STAR_OUTPUT="$VIASH_TEST_STAR_OUTPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_STAR_OUTPUT="$VIASH_TEST_STAR_OUTPUT" + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/executable/integration_test_components/htrnaseq/check_eset/nextflow_labels.config b/target/executable/integration_test_components/htrnaseq/check_eset/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/integration_test_components/htrnaseq/check_eset/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml new file mode 100644 index 00000000..30339dc6 --- /dev/null +++ b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml @@ -0,0 +1,217 @@ +name: "check_cutadapt_output" +namespace: "integration_test_components/well_demultiplexing" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--fastq_r1" + description: "Path to the forward reads to test." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--fastq_r2" + description: "Path to the reverse reads to test." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--ids" + description: "Well IDs for the corresponding fastq input" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "This component test the cutadapt output from the well_demultiplex subworkflow." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "dnaio" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output" + executable: "target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/check_cutadapt_output" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/check_cutadapt_output b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/check_cutadapt_output new file mode 100755 index 00000000..05510ba4 --- /dev/null +++ b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/check_cutadapt_output @@ -0,0 +1,1265 @@ +#!/usr/bin/env bash + +# check_cutadapt_output updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="check_cutadapt_output" +VIASH_META_FUNCTIONALITY_NAME="check_cutadapt_output" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "dnaio" + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component integration_test_components/well_demultiplexing check_cutadapt_output" +LABEL org.opencontainers.image.created="2025-09-02T13:33:11Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "check_cutadapt_output updatecraftbox" + echo "" + echo "This component test the cutadapt output from the well_demultiplex subworkflow." + echo "" + echo "Inputs:" + echo " --fastq_r1" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Path to the forward reads to test." + echo "" + echo " --fastq_r2" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Path to the reverse reads to test." + echo "" + echo " --ids" + echo " type: string, required parameter, multiple values allowed" + echo " Well IDs for the corresponding fastq input" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "check_cutadapt_output updatecraftbox" + exit + ;; + --fastq_r1) + if [ -z "$VIASH_PAR_FASTQ_R1" ]; then + VIASH_PAR_FASTQ_R1="$2" + else + VIASH_PAR_FASTQ_R1="$VIASH_PAR_FASTQ_R1;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fastq_r1. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fastq_r1=*) + if [ -z "$VIASH_PAR_FASTQ_R1" ]; then + VIASH_PAR_FASTQ_R1=$(ViashRemoveFlags "$1") + else + VIASH_PAR_FASTQ_R1="$VIASH_PAR_FASTQ_R1;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --fastq_r2) + if [ -z "$VIASH_PAR_FASTQ_R2" ]; then + VIASH_PAR_FASTQ_R2="$2" + else + VIASH_PAR_FASTQ_R2="$VIASH_PAR_FASTQ_R2;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --fastq_r2. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --fastq_r2=*) + if [ -z "$VIASH_PAR_FASTQ_R2" ]; then + VIASH_PAR_FASTQ_R2=$(ViashRemoveFlags "$1") + else + VIASH_PAR_FASTQ_R2="$VIASH_PAR_FASTQ_R2;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --ids) + if [ -z "$VIASH_PAR_IDS" ]; then + VIASH_PAR_IDS="$2" + else + VIASH_PAR_IDS="$VIASH_PAR_IDS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --ids. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --ids=*) + if [ -z "$VIASH_PAR_IDS" ]; then + VIASH_PAR_IDS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_IDS="$VIASH_PAR_IDS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/integration_test_components/well_demultiplexing/check_cutadapt_output:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_FASTQ_R1+x} ]; then + ViashError '--fastq_r1' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_FASTQ_R2+x} ]; then + ViashError '--fastq_r2' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_IDS+x} ]; then + ViashError '--ids' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_FASTQ_R1" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_FASTQ_R1; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_FASTQ_R2" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_FASTQ_R2; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_FASTQ_R1" ]; then + VIASH_TEST_FASTQ_R1=() + IFS=';' + for var in $VIASH_PAR_FASTQ_R1; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_FASTQ_R1+=( "$var" ) + done + VIASH_PAR_FASTQ_R1=$(IFS=';' ; echo "${VIASH_TEST_FASTQ_R1[*]}") +fi +if [ ! -z "$VIASH_PAR_FASTQ_R2" ]; then + VIASH_TEST_FASTQ_R2=() + IFS=';' + for var in $VIASH_PAR_FASTQ_R2; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_FASTQ_R2+=( "$var" ) + done + VIASH_PAR_FASTQ_R2=$(IFS=';' ; echo "${VIASH_TEST_FASTQ_R2[*]}") +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-check_cutadapt_output-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import dnaio +from operator import itemgetter +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'fastq_r1': $( if [ ! -z ${VIASH_PAR_FASTQ_R1+x} ]; then echo "r'${VIASH_PAR_FASTQ_R1//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'fastq_r2': $( if [ ! -z ${VIASH_PAR_FASTQ_R2+x} ]; then echo "r'${VIASH_PAR_FASTQ_R2//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'ids': $( if [ ! -z ${VIASH_PAR_IDS+x} ]; then echo "r'${VIASH_PAR_IDS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +def assert_number_of_reads(reads): + expected_number_of_reads = { + "SRR14730301__A1": 165, + "SRR14730301__B1": 194, + "SRR14730302__A1": 141, + "SRR14730302__B1": 213, + "SRR14730302__unknown": 99646, + "SRR14730301__unknown": 99641, + } + for input_id, expected_reads in expected_number_of_reads.items(): + num_reads = len(reads[input_id]) + assert num_reads == expected_reads, \\ + f"Expected number of ouput reads for {input_id} to be {expected_reads}, was {num_reads}." + + +def string_difference(string1, string2): + result = 0 + for char1, char2 in zip(string1, string2, strict=True): + if char1.lower() != char2.lower(): + result += 1 + return result + + +def assert_barcodes_not_removed(reads): + barcodes = { + "SRR14730301__A1": "ACACCGAATT", + "SRR14730302__A1": "ACACCGAATT", + "SRR14730301__B1": "GGCTATTGAT", + "SRR14730302__B1": "GGCTATTGAT" + } + for sample_id, barcode in barcodes.items(): + sample_reads = reads[sample_id] + forward_reads = map(itemgetter(0), sample_reads) + for i, forward_read in enumerate(forward_reads): + read_sequence = forward_read.sequence + read_barcode_start = read_sequence[: len(barcode)] + # A 10% difference is allowed. + assert string_difference(read_barcode_start, barcode) <= (0.1 * len(barcode)), \\ + (f"Expected barcode {barcode} to be present for sample {sample_id} " + f"in read {i}. Found {read_barcode_start}") + +def create_input_mapping(sample_ids, inputs_r1, inputs_r2): + return {sample_id: [input_r1, input_r2] + for sample_id, input_r1, input_r2 + in zip(sample_ids, inputs_r1, inputs_r2, strict=True)} + +def read_input_files(input_mapping): + expected_keys = {"SRR14730301__A1", "SRR14730301__B1", + "SRR14730302__A1", "SRR14730302__B1", + "SRR14730301__unknown", "SRR14730302__unknown"} + difference = set(input_mapping.keys()) - expected_keys + assert not difference, f"Found unexpected output id(s): {difference}" + result = {} + for input_id, input_files in input_mapping.items(): + input_r1, input_r2 = input_files + # This reads the files into memory, + # but they are reasonably small + with dnaio.open(input_r1) as r1_reads, dnaio.open(input_r2) as r2_reads: + for r1_read, r2_read in zip(r1_reads, r2_reads, strict=True): + result.setdefault(input_id, []).append((r1_read, r2_read)) + return result + + +def main(par): + inputs = create_input_mapping(par["ids"], par["fastq_r1"], par["fastq_r2"]) + reads = read_input_files(inputs) + assert_number_of_reads(reads) + assert_barcodes_not_removed(reads) + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_FASTQ_R1" ]; then + unset VIASH_TEST_FASTQ_R1 + IFS=';' + for var in $VIASH_PAR_FASTQ_R1; do + unset IFS + if [ -z "$VIASH_TEST_FASTQ_R1" ]; then + VIASH_TEST_FASTQ_R1="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_FASTQ_R1="$VIASH_TEST_FASTQ_R1;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_FASTQ_R1="$VIASH_TEST_FASTQ_R1" + fi + if [ ! -z "$VIASH_PAR_FASTQ_R2" ]; then + unset VIASH_TEST_FASTQ_R2 + IFS=';' + for var in $VIASH_PAR_FASTQ_R2; do + unset IFS + if [ -z "$VIASH_TEST_FASTQ_R2" ]; then + VIASH_TEST_FASTQ_R2="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_FASTQ_R2="$VIASH_TEST_FASTQ_R2;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_FASTQ_R2="$VIASH_TEST_FASTQ_R2" + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +exit 0 diff --git a/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/io/publish_fastqs/.config.vsh.yaml b/target/executable/io/publish_fastqs/.config.vsh.yaml new file mode 100644 index 00000000..6b78943c --- /dev/null +++ b/target/executable/io/publish_fastqs/.config.vsh.yaml @@ -0,0 +1,192 @@ +name: "publish_fastqs" +namespace: "io" +version: "updatecraftbox" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input" + description: "Directory to write fastq data to" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output" + info: null + default: + - "$id" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "code.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Publish the fastq files per well" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/io/publish_fastqs/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/io/publish_fastqs" + executable: "target/executable/io/publish_fastqs/publish_fastqs" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/io/publish_fastqs/_viash.yaml b/target/executable/io/publish_fastqs/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/io/publish_fastqs/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/io/publish_fastqs/nextflow_labels.config b/target/executable/io/publish_fastqs/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/io/publish_fastqs/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/io/publish_fastqs/publish_fastqs b/target/executable/io/publish_fastqs/publish_fastqs new file mode 100755 index 00000000..1ab07b96 --- /dev/null +++ b/target/executable/io/publish_fastqs/publish_fastqs @@ -0,0 +1,1146 @@ +#!/usr/bin/env bash + +# publish_fastqs updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="publish_fastqs" +VIASH_META_FUNCTIONALITY_NAME="publish_fastqs" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM debian:stable-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.description="Companion container for running component io publish_fastqs" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "publish_fastqs updatecraftbox" + echo "" + echo "Publish the fastq files per well" + echo "" + echo "Input arguments:" + echo " --input" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Directory to write fastq data to" + echo "" + echo "Output arguments:" + echo " --output" + echo " type: file, output, file must exist" + echo " default: \$id" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "publish_fastqs updatecraftbox" + exit + ;; + --input) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT="$2" + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + if [ -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT="$VIASH_PAR_INPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/io/publish_fastqs:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT+x} ]; then + ViashError '--input' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="\$id" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_TEST_INPUT=() + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT+=( "$var" ) + done + VIASH_PAR_INPUT=$(IFS=';' ; echo "${VIASH_TEST_INPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-publish_fastqs-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +echo "Publishing \$par_input -> \$par_output" + +echo +echo "Creating directory if it does not exist:" +mkdir -p "\$par_output" && echo "\$par_output created" + +echo +echo "Copying files..." +IFS=";" read -ra input <<<\$par_input + +for i in "\${input[@]}"; do + cp -rL "\$i" "\$par_output/" +done +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + unset VIASH_TEST_INPUT + IFS=';' + for var in $VIASH_PAR_INPUT; do + unset IFS + if [ -z "$VIASH_TEST_INPUT" ]; then + VIASH_TEST_INPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT="$VIASH_TEST_INPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT="$VIASH_TEST_INPUT" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/io/publish_results/.config.vsh.yaml b/target/executable/io/publish_results/.config.vsh.yaml new file mode 100644 index 00000000..412ddbd3 --- /dev/null +++ b/target/executable/io/publish_results/.config.vsh.yaml @@ -0,0 +1,332 @@ +name: "publish_results" +namespace: "io" +version: "updatecraftbox" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--star_output" + description: "Output from mapping with STAR" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--eset" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--f_data" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--p_data" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--html_report" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--run_params" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output directory" + description: "Determines the name of output directories\n" + arguments: + - type: "file" + name: "--star_output_dir" + info: null + default: + - "star_output" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom_dir" + info: null + default: + - "nrReadsNrGenesPerChrom" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics_dir" + info: null + default: + - "starLogs" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--eset_dir" + info: null + default: + - "esets" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--f_data_dir" + info: null + default: + - "fData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--p_data_dir" + info: null + default: + - "pData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Output file arguments" + description: "Determines the name of output files" + arguments: + - type: "file" + name: "--run_params_output" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--html_report_output" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "code.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Publish the results" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/io/publish_results/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/io/publish_results" + executable: "target/executable/io/publish_results/publish_results" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/io/publish_results/_viash.yaml b/target/executable/io/publish_results/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/io/publish_results/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/io/publish_results/nextflow_labels.config b/target/executable/io/publish_results/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/io/publish_results/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/io/publish_results/publish_results b/target/executable/io/publish_results/publish_results new file mode 100755 index 00000000..d9c80406 --- /dev/null +++ b/target/executable/io/publish_results/publish_results @@ -0,0 +1,1824 @@ +#!/usr/bin/env bash + +# publish_results updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="publish_results" +VIASH_META_FUNCTIONALITY_NAME="publish_results" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM debian:stable-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +LABEL org.opencontainers.image.description="Companion container for running component io publish_results" +LABEL org.opencontainers.image.created="2025-09-02T13:33:11Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "publish_results updatecraftbox" + echo "" + echo "Publish the results" + echo "" + echo "Input arguments:" + echo " --star_output" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Output from mapping with STAR" + echo "" + echo " --nrReadsNrGenesPerChrom" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --star_qc_metrics" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --eset" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --f_data" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --p_data" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --html_report" + echo " type: file, required parameter, file must exist" + echo "" + echo " --run_params" + echo " type: file, required parameter, file must exist" + echo "" + echo "Output directory:" + echo " Determines the name of output directories" + echo "" + echo " --star_output_dir" + echo " type: file, output, file must exist" + echo " default: star_output" + echo "" + echo " --nrReadsNrGenesPerChrom_dir" + echo " type: file, output, file must exist" + echo " default: nrReadsNrGenesPerChrom" + echo "" + echo " --star_qc_metrics_dir" + echo " type: file, output, file must exist" + echo " default: starLogs" + echo "" + echo " --eset_dir" + echo " type: file, output, file must exist" + echo " default: esets" + echo "" + echo " --f_data_dir" + echo " type: file, output, file must exist" + echo " default: fData" + echo "" + echo " --p_data_dir" + echo " type: file, output, file must exist" + echo " default: pData" + echo "" + echo "Output file arguments:" + echo " Determines the name of output files" + echo "" + echo " --run_params_output" + echo " type: file, output, file must exist" + echo "" + echo " --html_report_output" + echo " type: file, output, file must exist" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "publish_results updatecraftbox" + exit + ;; + --star_output) + if [ -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_PAR_STAR_OUTPUT="$2" + else + VIASH_PAR_STAR_OUTPUT="$VIASH_PAR_STAR_OUTPUT;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_output=*) + if [ -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_PAR_STAR_OUTPUT=$(ViashRemoveFlags "$1") + else + VIASH_PAR_STAR_OUTPUT="$VIASH_PAR_STAR_OUTPUT;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --nrReadsNrGenesPerChrom) + if [ -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM="$2" + else + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_PAR_NRREADSNRGENESPERCHROM;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChrom. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChrom=*) + if [ -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM=$(ViashRemoveFlags "$1") + else + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_PAR_NRREADSNRGENESPERCHROM;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --star_qc_metrics) + if [ -z "$VIASH_PAR_STAR_QC_METRICS" ]; then + VIASH_PAR_STAR_QC_METRICS="$2" + else + VIASH_PAR_STAR_QC_METRICS="$VIASH_PAR_STAR_QC_METRICS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_qc_metrics. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_qc_metrics=*) + if [ -z "$VIASH_PAR_STAR_QC_METRICS" ]; then + VIASH_PAR_STAR_QC_METRICS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_STAR_QC_METRICS="$VIASH_PAR_STAR_QC_METRICS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --eset) + if [ -z "$VIASH_PAR_ESET" ]; then + VIASH_PAR_ESET="$2" + else + VIASH_PAR_ESET="$VIASH_PAR_ESET;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --eset. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --eset=*) + if [ -z "$VIASH_PAR_ESET" ]; then + VIASH_PAR_ESET=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ESET="$VIASH_PAR_ESET;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --f_data) + if [ -z "$VIASH_PAR_F_DATA" ]; then + VIASH_PAR_F_DATA="$2" + else + VIASH_PAR_F_DATA="$VIASH_PAR_F_DATA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --f_data. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --f_data=*) + if [ -z "$VIASH_PAR_F_DATA" ]; then + VIASH_PAR_F_DATA=$(ViashRemoveFlags "$1") + else + VIASH_PAR_F_DATA="$VIASH_PAR_F_DATA;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --p_data) + if [ -z "$VIASH_PAR_P_DATA" ]; then + VIASH_PAR_P_DATA="$2" + else + VIASH_PAR_P_DATA="$VIASH_PAR_P_DATA;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --p_data. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --p_data=*) + if [ -z "$VIASH_PAR_P_DATA" ]; then + VIASH_PAR_P_DATA=$(ViashRemoveFlags "$1") + else + VIASH_PAR_P_DATA="$VIASH_PAR_P_DATA;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --html_report) + [ -n "$VIASH_PAR_HTML_REPORT" ] && ViashError Bad arguments for option \'--html_report\': \'$VIASH_PAR_HTML_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HTML_REPORT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --html_report. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --html_report=*) + [ -n "$VIASH_PAR_HTML_REPORT" ] && ViashError Bad arguments for option \'--html_report=*\': \'$VIASH_PAR_HTML_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HTML_REPORT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --run_params) + [ -n "$VIASH_PAR_RUN_PARAMS" ] && ViashError Bad arguments for option \'--run_params\': \'$VIASH_PAR_RUN_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_PARAMS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --run_params. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_params=*) + [ -n "$VIASH_PAR_RUN_PARAMS" ] && ViashError Bad arguments for option \'--run_params=*\': \'$VIASH_PAR_RUN_PARAMS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_PARAMS=$(ViashRemoveFlags "$1") + shift 1 + ;; + --star_output_dir) + [ -n "$VIASH_PAR_STAR_OUTPUT_DIR" ] && ViashError Bad arguments for option \'--star_output_dir\': \'$VIASH_PAR_STAR_OUTPUT_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_OUTPUT_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_output_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_output_dir=*) + [ -n "$VIASH_PAR_STAR_OUTPUT_DIR" ] && ViashError Bad arguments for option \'--star_output_dir=*\': \'$VIASH_PAR_STAR_OUTPUT_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_OUTPUT_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nrReadsNrGenesPerChrom_dir) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChrom_dir\': \'$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROM_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChrom_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChrom_dir=*) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChrom_dir=*\': \'$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROM_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --star_qc_metrics_dir) + [ -n "$VIASH_PAR_STAR_QC_METRICS_DIR" ] && ViashError Bad arguments for option \'--star_qc_metrics_dir\': \'$VIASH_PAR_STAR_QC_METRICS_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_QC_METRICS_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_qc_metrics_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_qc_metrics_dir=*) + [ -n "$VIASH_PAR_STAR_QC_METRICS_DIR" ] && ViashError Bad arguments for option \'--star_qc_metrics_dir=*\': \'$VIASH_PAR_STAR_QC_METRICS_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_STAR_QC_METRICS_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --eset_dir) + [ -n "$VIASH_PAR_ESET_DIR" ] && ViashError Bad arguments for option \'--eset_dir\': \'$VIASH_PAR_ESET_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESET_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --eset_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --eset_dir=*) + [ -n "$VIASH_PAR_ESET_DIR" ] && ViashError Bad arguments for option \'--eset_dir=*\': \'$VIASH_PAR_ESET_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ESET_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --f_data_dir) + [ -n "$VIASH_PAR_F_DATA_DIR" ] && ViashError Bad arguments for option \'--f_data_dir\': \'$VIASH_PAR_F_DATA_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_F_DATA_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --f_data_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --f_data_dir=*) + [ -n "$VIASH_PAR_F_DATA_DIR" ] && ViashError Bad arguments for option \'--f_data_dir=*\': \'$VIASH_PAR_F_DATA_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_F_DATA_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --p_data_dir) + [ -n "$VIASH_PAR_P_DATA_DIR" ] && ViashError Bad arguments for option \'--p_data_dir\': \'$VIASH_PAR_P_DATA_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_P_DATA_DIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --p_data_dir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --p_data_dir=*) + [ -n "$VIASH_PAR_P_DATA_DIR" ] && ViashError Bad arguments for option \'--p_data_dir=*\': \'$VIASH_PAR_P_DATA_DIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_P_DATA_DIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --run_params_output) + [ -n "$VIASH_PAR_RUN_PARAMS_OUTPUT" ] && ViashError Bad arguments for option \'--run_params_output\': \'$VIASH_PAR_RUN_PARAMS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_PARAMS_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --run_params_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --run_params_output=*) + [ -n "$VIASH_PAR_RUN_PARAMS_OUTPUT" ] && ViashError Bad arguments for option \'--run_params_output=*\': \'$VIASH_PAR_RUN_PARAMS_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUN_PARAMS_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --html_report_output) + [ -n "$VIASH_PAR_HTML_REPORT_OUTPUT" ] && ViashError Bad arguments for option \'--html_report_output\': \'$VIASH_PAR_HTML_REPORT_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HTML_REPORT_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --html_report_output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --html_report_output=*) + [ -n "$VIASH_PAR_HTML_REPORT_OUTPUT" ] && ViashError Bad arguments for option \'--html_report_output=*\': \'$VIASH_PAR_HTML_REPORT_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_HTML_REPORT_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/io/publish_results:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then + ViashError '--star_output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then + ViashError '--nrReadsNrGenesPerChrom' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_STAR_QC_METRICS+x} ]; then + ViashError '--star_qc_metrics' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_ESET+x} ]; then + ViashError '--eset' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_F_DATA+x} ]; then + ViashError '--f_data' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_P_DATA+x} ]; then + ViashError '--p_data' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_HTML_REPORT+x} ]; then + ViashError '--html_report' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_RUN_PARAMS+x} ]; then + ViashError '--run_params' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_STAR_OUTPUT_DIR+x} ]; then + VIASH_PAR_STAR_OUTPUT_DIR="star_output" +fi +if [ -z ${VIASH_PAR_NRREADSNRGENESPERCHROM_DIR+x} ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM_DIR="nrReadsNrGenesPerChrom" +fi +if [ -z ${VIASH_PAR_STAR_QC_METRICS_DIR+x} ]; then + VIASH_PAR_STAR_QC_METRICS_DIR="starLogs" +fi +if [ -z ${VIASH_PAR_ESET_DIR+x} ]; then + VIASH_PAR_ESET_DIR="esets" +fi +if [ -z ${VIASH_PAR_F_DATA_DIR+x} ]; then + VIASH_PAR_F_DATA_DIR="fData" +fi +if [ -z ${VIASH_PAR_P_DATA_DIR+x} ]; then + VIASH_PAR_P_DATA_DIR="pData" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_STAR_QC_METRICS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_STAR_QC_METRICS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_ESET" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_ESET; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_F_DATA" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_F_DATA; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_P_DATA" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_P_DATA; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_HTML_REPORT" ] && [ ! -e "$VIASH_PAR_HTML_REPORT" ]; then + ViashError "Input file '$VIASH_PAR_HTML_REPORT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_RUN_PARAMS" ] && [ ! -e "$VIASH_PAR_RUN_PARAMS" ]; then + ViashError "Input file '$VIASH_PAR_RUN_PARAMS' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_STAR_OUTPUT_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_STAR_OUTPUT_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_STAR_OUTPUT_DIR")" +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR")" +fi +if [ ! -z "$VIASH_PAR_STAR_QC_METRICS_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_STAR_QC_METRICS_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_STAR_QC_METRICS_DIR")" +fi +if [ ! -z "$VIASH_PAR_ESET_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_ESET_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_ESET_DIR")" +fi +if [ ! -z "$VIASH_PAR_F_DATA_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_F_DATA_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_F_DATA_DIR")" +fi +if [ ! -z "$VIASH_PAR_P_DATA_DIR" ] && [ ! -d "$(dirname "$VIASH_PAR_P_DATA_DIR")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_P_DATA_DIR")" +fi +if [ ! -z "$VIASH_PAR_RUN_PARAMS_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_RUN_PARAMS_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_RUN_PARAMS_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_HTML_REPORT_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_HTML_REPORT_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_HTML_REPORT_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + VIASH_TEST_STAR_OUTPUT=() + IFS=';' + for var in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_STAR_OUTPUT+=( "$var" ) + done + VIASH_PAR_STAR_OUTPUT=$(IFS=';' ; echo "${VIASH_TEST_STAR_OUTPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_TEST_NRREADSNRGENESPERCHROM=() + IFS=';' + for var in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_NRREADSNRGENESPERCHROM+=( "$var" ) + done + VIASH_PAR_NRREADSNRGENESPERCHROM=$(IFS=';' ; echo "${VIASH_TEST_NRREADSNRGENESPERCHROM[*]}") +fi +if [ ! -z "$VIASH_PAR_STAR_QC_METRICS" ]; then + VIASH_TEST_STAR_QC_METRICS=() + IFS=';' + for var in $VIASH_PAR_STAR_QC_METRICS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_STAR_QC_METRICS+=( "$var" ) + done + VIASH_PAR_STAR_QC_METRICS=$(IFS=';' ; echo "${VIASH_TEST_STAR_QC_METRICS[*]}") +fi +if [ ! -z "$VIASH_PAR_ESET" ]; then + VIASH_TEST_ESET=() + IFS=';' + for var in $VIASH_PAR_ESET; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_ESET+=( "$var" ) + done + VIASH_PAR_ESET=$(IFS=';' ; echo "${VIASH_TEST_ESET[*]}") +fi +if [ ! -z "$VIASH_PAR_F_DATA" ]; then + VIASH_TEST_F_DATA=() + IFS=';' + for var in $VIASH_PAR_F_DATA; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_F_DATA+=( "$var" ) + done + VIASH_PAR_F_DATA=$(IFS=';' ; echo "${VIASH_TEST_F_DATA[*]}") +fi +if [ ! -z "$VIASH_PAR_P_DATA" ]; then + VIASH_TEST_P_DATA=() + IFS=';' + for var in $VIASH_PAR_P_DATA; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_P_DATA+=( "$var" ) + done + VIASH_PAR_P_DATA=$(IFS=';' ; echo "${VIASH_TEST_P_DATA[*]}") +fi +if [ ! -z "$VIASH_PAR_HTML_REPORT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_HTML_REPORT")" ) + VIASH_PAR_HTML_REPORT=$(ViashDockerAutodetectMount "$VIASH_PAR_HTML_REPORT") +fi +if [ ! -z "$VIASH_PAR_RUN_PARAMS" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_RUN_PARAMS")" ) + VIASH_PAR_RUN_PARAMS=$(ViashDockerAutodetectMount "$VIASH_PAR_RUN_PARAMS") +fi +if [ ! -z "$VIASH_PAR_STAR_OUTPUT_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_STAR_OUTPUT_DIR")" ) + VIASH_PAR_STAR_OUTPUT_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_STAR_OUTPUT_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_STAR_OUTPUT_DIR" ) +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR")" ) + VIASH_PAR_NRREADSNRGENESPERCHROM_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ) +fi +if [ ! -z "$VIASH_PAR_STAR_QC_METRICS_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_STAR_QC_METRICS_DIR")" ) + VIASH_PAR_STAR_QC_METRICS_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_STAR_QC_METRICS_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_STAR_QC_METRICS_DIR" ) +fi +if [ ! -z "$VIASH_PAR_ESET_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_ESET_DIR")" ) + VIASH_PAR_ESET_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_ESET_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_ESET_DIR" ) +fi +if [ ! -z "$VIASH_PAR_F_DATA_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_F_DATA_DIR")" ) + VIASH_PAR_F_DATA_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_F_DATA_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_F_DATA_DIR" ) +fi +if [ ! -z "$VIASH_PAR_P_DATA_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_P_DATA_DIR")" ) + VIASH_PAR_P_DATA_DIR=$(ViashDockerAutodetectMount "$VIASH_PAR_P_DATA_DIR") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_P_DATA_DIR" ) +fi +if [ ! -z "$VIASH_PAR_RUN_PARAMS_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_RUN_PARAMS_OUTPUT")" ) + VIASH_PAR_RUN_PARAMS_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_RUN_PARAMS_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_RUN_PARAMS_OUTPUT" ) +fi +if [ ! -z "$VIASH_PAR_HTML_REPORT_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_HTML_REPORT_OUTPUT")" ) + VIASH_PAR_HTML_REPORT_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_HTML_REPORT_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_HTML_REPORT_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-publish_results-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then echo "${VIASH_PAR_STAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_star_output='&'#" ; else echo "# par_star_output="; fi ) +$( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "${VIASH_PAR_NRREADSNRGENESPERCHROM}" | sed "s#'#'\"'\"'#g;s#.*#par_nrReadsNrGenesPerChrom='&'#" ; else echo "# par_nrReadsNrGenesPerChrom="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_QC_METRICS+x} ]; then echo "${VIASH_PAR_STAR_QC_METRICS}" | sed "s#'#'\"'\"'#g;s#.*#par_star_qc_metrics='&'#" ; else echo "# par_star_qc_metrics="; fi ) +$( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo "${VIASH_PAR_ESET}" | sed "s#'#'\"'\"'#g;s#.*#par_eset='&'#" ; else echo "# par_eset="; fi ) +$( if [ ! -z ${VIASH_PAR_F_DATA+x} ]; then echo "${VIASH_PAR_F_DATA}" | sed "s#'#'\"'\"'#g;s#.*#par_f_data='&'#" ; else echo "# par_f_data="; fi ) +$( if [ ! -z ${VIASH_PAR_P_DATA+x} ]; then echo "${VIASH_PAR_P_DATA}" | sed "s#'#'\"'\"'#g;s#.*#par_p_data='&'#" ; else echo "# par_p_data="; fi ) +$( if [ ! -z ${VIASH_PAR_HTML_REPORT+x} ]; then echo "${VIASH_PAR_HTML_REPORT}" | sed "s#'#'\"'\"'#g;s#.*#par_html_report='&'#" ; else echo "# par_html_report="; fi ) +$( if [ ! -z ${VIASH_PAR_RUN_PARAMS+x} ]; then echo "${VIASH_PAR_RUN_PARAMS}" | sed "s#'#'\"'\"'#g;s#.*#par_run_params='&'#" ; else echo "# par_run_params="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_OUTPUT_DIR+x} ]; then echo "${VIASH_PAR_STAR_OUTPUT_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_star_output_dir='&'#" ; else echo "# par_star_output_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM_DIR+x} ]; then echo "${VIASH_PAR_NRREADSNRGENESPERCHROM_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_nrReadsNrGenesPerChrom_dir='&'#" ; else echo "# par_nrReadsNrGenesPerChrom_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_QC_METRICS_DIR+x} ]; then echo "${VIASH_PAR_STAR_QC_METRICS_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_star_qc_metrics_dir='&'#" ; else echo "# par_star_qc_metrics_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_ESET_DIR+x} ]; then echo "${VIASH_PAR_ESET_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_eset_dir='&'#" ; else echo "# par_eset_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_F_DATA_DIR+x} ]; then echo "${VIASH_PAR_F_DATA_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_f_data_dir='&'#" ; else echo "# par_f_data_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_P_DATA_DIR+x} ]; then echo "${VIASH_PAR_P_DATA_DIR}" | sed "s#'#'\"'\"'#g;s#.*#par_p_data_dir='&'#" ; else echo "# par_p_data_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_RUN_PARAMS_OUTPUT+x} ]; then echo "${VIASH_PAR_RUN_PARAMS_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_run_params_output='&'#" ; else echo "# par_run_params_output="; fi ) +$( if [ ! -z ${VIASH_PAR_HTML_REPORT_OUTPUT+x} ]; then echo "${VIASH_PAR_HTML_REPORT_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_html_report_output='&'#" ; else echo "# par_html_report_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +echo "Publishing results to multiple output directories" + +# Create output directories for multiple files +echo "Creating output directories..." + +path_pars=( + par_star_output_dir + par_nrReadsNrGenesPerChrom_dir + par_star_qc_metrics_dir + par_eset_dir + par_f_data_dir + par_p_data_dir + par_html_report_output + par_run_params_output +) + +for par in \${path_pars[@]}; do + curr_val="\${!par}" + new_value=\$(realpath --canonicalize-missing "\$curr_val") + declare -g "\$par=\$new_value" +done + +mkdir -p "\$par_star_output_dir" && echo "\$par_star_output_dir created" +mkdir -p "\$par_nrReadsNrGenesPerChrom_dir" && echo "\$par_nrReadsNrGenesPerChrom_dir created" +mkdir -p "\$par_star_qc_metrics_dir" && echo "\$par_star_qc_metrics_dir created" +mkdir -p "\$par_eset_dir" && echo "\$par_eset_dir created" +mkdir -p "\$par_f_data_dir" && echo "\$par_f_data_dir created" +mkdir -p "\$par_p_data_dir" && echo "\$par_p_data_dir created" + +echo +echo "Copying STAR output files..." +IFS=";" read -ra star_output <<<\$par_star_output +for i in "\${star_output[@]}"; do + echo "Copying \$i to \$par_star_output_dir/" + cp -rL "\$i" "\$par_star_output_dir/" +done + +echo +echo "Copying nrReadsNrGenesPerChrom files..." +IFS=";" read -ra nrReadsNrGenesPerChrom <<<\$par_nrReadsNrGenesPerChrom +for i in "\${nrReadsNrGenesPerChrom[@]}"; do + echo "Copying \$i to \$par_nrReadsNrGenesPerChrom_dir/" + cp -rL "\$i" "\$par_nrReadsNrGenesPerChrom_dir/" +done + +echo +echo "Copying STAR QC metrics files..." +IFS=";" read -ra star_qc_metrics <<<\$par_star_qc_metrics +for i in "\${star_qc_metrics[@]}"; do + echo "Copying \$i to \$par_star_qc_metrics_dir/" + cp -rL "\$i" "\$par_star_qc_metrics_dir/" +done + +echo +echo "Copying eset files..." +IFS=";" read -ra eset <<<\$par_eset +for i in "\${eset[@]}"; do + echo "Copying \$i to \$par_eset_dir/" + cp -rL "\$i" "\$par_eset_dir/" +done + +echo +echo "Copying f_data files..." +IFS=";" read -ra f_data <<<\$par_f_data +for i in "\${f_data[@]}"; do + echo "Copying \$i to \$par_f_data_dir/" + cp -rL "\$i" "\$par_f_data_dir/" +done + +echo +echo "Copying p_data files..." +IFS=";" read -ra p_data <<<\$par_p_data +for i in "\${p_data[@]}"; do + echo "Copying \$i to \$par_p_data_dir/" + cp -rL "\$i" "\$par_p_data_dir/" +done + +echo +echo "Copying single files directly..." +mkdir -p \$(dirname "\$par_html_report_output") +echo "Copying \$par_html_report to \$par_html_report_output" +cp -L "\$par_html_report" "\$par_html_report_output" + +echo "Copying \$par_run_params to \$par_run_params_output" +mkdir -p \$(dirname "\$par_run_params_output") +cp -L "\$par_run_params" "\$par_run_params_output" + +echo +echo "Publishing completed successfully!" +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_STAR_OUTPUT" ]; then + unset VIASH_TEST_STAR_OUTPUT + IFS=';' + for var in $VIASH_PAR_STAR_OUTPUT; do + unset IFS + if [ -z "$VIASH_TEST_STAR_OUTPUT" ]; then + VIASH_TEST_STAR_OUTPUT="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_STAR_OUTPUT="$VIASH_TEST_STAR_OUTPUT;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_STAR_OUTPUT="$VIASH_TEST_STAR_OUTPUT" + fi + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + unset VIASH_TEST_NRREADSNRGENESPERCHROM + IFS=';' + for var in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + if [ -z "$VIASH_TEST_NRREADSNRGENESPERCHROM" ]; then + VIASH_TEST_NRREADSNRGENESPERCHROM="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_NRREADSNRGENESPERCHROM="$VIASH_TEST_NRREADSNRGENESPERCHROM;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_TEST_NRREADSNRGENESPERCHROM" + fi + if [ ! -z "$VIASH_PAR_STAR_QC_METRICS" ]; then + unset VIASH_TEST_STAR_QC_METRICS + IFS=';' + for var in $VIASH_PAR_STAR_QC_METRICS; do + unset IFS + if [ -z "$VIASH_TEST_STAR_QC_METRICS" ]; then + VIASH_TEST_STAR_QC_METRICS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_STAR_QC_METRICS="$VIASH_TEST_STAR_QC_METRICS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_STAR_QC_METRICS="$VIASH_TEST_STAR_QC_METRICS" + fi + if [ ! -z "$VIASH_PAR_ESET" ]; then + unset VIASH_TEST_ESET + IFS=';' + for var in $VIASH_PAR_ESET; do + unset IFS + if [ -z "$VIASH_TEST_ESET" ]; then + VIASH_TEST_ESET="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_ESET="$VIASH_TEST_ESET;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_ESET="$VIASH_TEST_ESET" + fi + if [ ! -z "$VIASH_PAR_F_DATA" ]; then + unset VIASH_TEST_F_DATA + IFS=';' + for var in $VIASH_PAR_F_DATA; do + unset IFS + if [ -z "$VIASH_TEST_F_DATA" ]; then + VIASH_TEST_F_DATA="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_F_DATA="$VIASH_TEST_F_DATA;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_F_DATA="$VIASH_TEST_F_DATA" + fi + if [ ! -z "$VIASH_PAR_P_DATA" ]; then + unset VIASH_TEST_P_DATA + IFS=';' + for var in $VIASH_PAR_P_DATA; do + unset IFS + if [ -z "$VIASH_TEST_P_DATA" ]; then + VIASH_TEST_P_DATA="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_P_DATA="$VIASH_TEST_P_DATA;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_P_DATA="$VIASH_TEST_P_DATA" + fi + if [ ! -z "$VIASH_PAR_HTML_REPORT" ]; then + VIASH_PAR_HTML_REPORT=$(ViashDockerStripAutomount "$VIASH_PAR_HTML_REPORT") + fi + if [ ! -z "$VIASH_PAR_RUN_PARAMS" ]; then + VIASH_PAR_RUN_PARAMS=$(ViashDockerStripAutomount "$VIASH_PAR_RUN_PARAMS") + fi + if [ ! -z "$VIASH_PAR_STAR_OUTPUT_DIR" ]; then + VIASH_PAR_STAR_OUTPUT_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_STAR_OUTPUT_DIR") + fi + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR") + fi + if [ ! -z "$VIASH_PAR_STAR_QC_METRICS_DIR" ]; then + VIASH_PAR_STAR_QC_METRICS_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_STAR_QC_METRICS_DIR") + fi + if [ ! -z "$VIASH_PAR_ESET_DIR" ]; then + VIASH_PAR_ESET_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_ESET_DIR") + fi + if [ ! -z "$VIASH_PAR_F_DATA_DIR" ]; then + VIASH_PAR_F_DATA_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_F_DATA_DIR") + fi + if [ ! -z "$VIASH_PAR_P_DATA_DIR" ]; then + VIASH_PAR_P_DATA_DIR=$(ViashDockerStripAutomount "$VIASH_PAR_P_DATA_DIR") + fi + if [ ! -z "$VIASH_PAR_RUN_PARAMS_OUTPUT" ]; then + VIASH_PAR_RUN_PARAMS_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_RUN_PARAMS_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_HTML_REPORT_OUTPUT" ]; then + VIASH_PAR_HTML_REPORT_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_HTML_REPORT_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_STAR_OUTPUT_DIR" ] && [ ! -e "$VIASH_PAR_STAR_OUTPUT_DIR" ]; then + ViashError "Output file '$VIASH_PAR_STAR_OUTPUT_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ] && [ ! -e "$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR" ]; then + ViashError "Output file '$VIASH_PAR_NRREADSNRGENESPERCHROM_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_STAR_QC_METRICS_DIR" ] && [ ! -e "$VIASH_PAR_STAR_QC_METRICS_DIR" ]; then + ViashError "Output file '$VIASH_PAR_STAR_QC_METRICS_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_ESET_DIR" ] && [ ! -e "$VIASH_PAR_ESET_DIR" ]; then + ViashError "Output file '$VIASH_PAR_ESET_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_F_DATA_DIR" ] && [ ! -e "$VIASH_PAR_F_DATA_DIR" ]; then + ViashError "Output file '$VIASH_PAR_F_DATA_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_P_DATA_DIR" ] && [ ! -e "$VIASH_PAR_P_DATA_DIR" ]; then + ViashError "Output file '$VIASH_PAR_P_DATA_DIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_RUN_PARAMS_OUTPUT" ] && [ ! -e "$VIASH_PAR_RUN_PARAMS_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_RUN_PARAMS_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_HTML_REPORT_OUTPUT" ] && [ ! -e "$VIASH_PAR_HTML_REPORT_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_HTML_REPORT_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/parallel_map/.config.vsh.yaml b/target/executable/parallel_map/.config.vsh.yaml new file mode 100644 index 00000000..f2286fab --- /dev/null +++ b/target/executable/parallel_map/.config.vsh.yaml @@ -0,0 +1,338 @@ +name: "parallel_map" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input_r1" + description: "Input FASTQ files for the forward reads. All FASTQ file names must\ + \ start with the prefix '{well_id}_R1', where\n'well_id' can be found as the\ + \ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\ + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided\ + \ to the 'input_r2' argument,\nmeaning that their 'well_id' prefix must match.\ + \ The number of items provided for 'input_r1' must be equal\nto the number of\ + \ items for 'input_r2'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Input FASTQ files for the reverse reads. All FASTQ file names must\ + \ start with the prefix '{well_id}_R2', where\n'well_id' can be found as the\ + \ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\ + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided\ + \ to the 'input_r1' argument,\nmeaning that their 'well_id' prefix must match.\ + \ The number of items provided for 'input_r1' must be equal\nto the number of\ + \ items for 'input_r2'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--genomeDir" + description: "Reference genome to match to. Can be generated from genomic FASTA\ + \ sequences and a genome annotation\nby using STAR with '--runMode genomeGenerate'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + description: "FASTA file where each entry specifies a unique barcode sequence\ + \ present at the start of the forward input reads\n(input_r1). The IDs of each\ + \ barcode (the start of the FASTA headers up until the first whitespace character)\ + \ must\nmatch with the start of one input FASTQ pair.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Barcode arguments" + arguments: + - type: "integer" + name: "--umiLength" + description: "Length of the Unique Molecular Identifiers (UMI). The UMI are expected\ + \ to be located after the barcodes in the\nforwards reads.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--limitBAMsortRAM" + info: null + default: + - "10000000000" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Runtime arguments" + arguments: + - type: "integer" + name: "--runThreadN" + description: "Number of threads to use for a single STAR execution." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output" + description: "A list of output folders which are the result of using STAR to map\ + \ each input FASTQ pair STAR to the reference genome.\nThe order of the items\ + \ DO NOT match with the order of the entries in the barcodes FASTA file or the\ + \ input FASTQ pairs. \n" + info: null + default: + - "./*" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--joblog" + description: "Where to store the log file listing all the jobs." + info: null + default: + - "execution_log.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "STAR" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference\ + \ (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "wget" + - "automake" + - "make" + - "gcc" + - "g++" + - "zlib1g-dev" + - "parallel" + - "file" + - "seqkit" + interactive: false + - type: "docker" + copy: + - "STAR /usr/local/bin/$STAR_BINARY" + build_args: + - "STAR_V=2.7.6a" + env: + - "STAR_SOURCE=\"https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz\"" + - "STAR_TARGET=\"/app/star-$STAR_V.tar.gz\"" + - "STAR_INSTALL_DIR=\"/app/STAR-$STAR_V\"" + - "STAR_BINARY=STAR" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/parallel_map/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/parallel_map" + executable: "target/executable/parallel_map/parallel_map" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/parallel_map/STAR b/target/executable/parallel_map/STAR new file mode 100755 index 00000000..331a838d Binary files /dev/null and b/target/executable/parallel_map/STAR differ diff --git a/target/executable/parallel_map/_viash.yaml b/target/executable/parallel_map/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/parallel_map/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/parallel_map/nextflow_labels.config b/target/executable/parallel_map/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/parallel_map/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/parallel_map/parallel_map b/target/executable/parallel_map/parallel_map new file mode 100755 index 00000000..e45588a6 --- /dev/null +++ b/target/executable/parallel_map/parallel_map @@ -0,0 +1,1744 @@ +#!/usr/bin/env bash + +# parallel_map updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Toni Verbeiren (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="parallel_map" +VIASH_META_FUNCTIONALITY_NAME="parallel_map" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM debian:stable-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps wget automake make gcc g++ zlib1g-dev parallel file seqkit && \ + rm -rf /var/lib/apt/lists/* + +ARG STAR_V +ENV STAR_SOURCE="https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz" +ENV STAR_TARGET="/app/star-$STAR_V.tar.gz" +ENV STAR_INSTALL_DIR="/app/STAR-$STAR_V" +ENV STAR_BINARY=STAR +COPY STAR /usr/local/bin/$STAR_BINARY +LABEL org.opencontainers.image.authors="Dries Schaumont, Toni Verbeiren" +LABEL org.opencontainers.image.description="Companion container for running component parallel_map" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "--build-arg 'STAR_V=2.7.6a'" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "parallel_map updatecraftbox" + echo "" + echo "Map wells in batch, using STAR" + echo "Spliced Transcripts Alignment to a Reference (C) Alexander Dobin" + echo "https://github.com/alexdobin/STAR" + echo "" + echo "Input arguments:" + echo " --input_r1" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Input FASTQ files for the forward reads. All FASTQ file names must start" + echo " with the prefix '{well_id}_R1', where" + echo " 'well_id' can be found as the sequence identifier in the barcodes FASTA" + echo " file (see 'barcodesFasta' argument)." + echo " For each FASTQ file, a matching FASTQ file for the reverse reads must be" + echo " provided to the 'input_r2' argument," + echo " meaning that their 'well_id' prefix must match. The number of items" + echo " provided for 'input_r1' must be equal" + echo " to the number of items for 'input_r2'." + echo "" + echo " --input_r2" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Input FASTQ files for the reverse reads. All FASTQ file names must start" + echo " with the prefix '{well_id}_R2', where" + echo " 'well_id' can be found as the sequence identifier in the barcodes FASTA" + echo " file (see 'barcodesFasta' argument)." + echo " For each FASTQ file, a matching FASTQ file for the reverse reads must be" + echo " provided to the 'input_r1' argument," + echo " meaning that their 'well_id' prefix must match. The number of items" + echo " provided for 'input_r1' must be equal" + echo " to the number of items for 'input_r2'." + echo "" + echo " --genomeDir" + echo " type: file, required parameter, file must exist" + echo " Reference genome to match to. Can be generated from genomic FASTA" + echo " sequences and a genome annotation" + echo " by using STAR with '--runMode genomeGenerate'." + echo "" + echo " --barcodesFasta" + echo " type: file, required parameter, file must exist" + echo " FASTA file where each entry specifies a unique barcode sequence present" + echo " at the start of the forward input reads" + echo " (input_r1). The IDs of each barcode (the start of the FASTA headers up" + echo " until the first whitespace character) must" + echo " match with the start of one input FASTQ pair." + echo "" + echo "Barcode arguments:" + echo " --umiLength" + echo " type: integer, required parameter" + echo " Length of the Unique Molecular Identifiers (UMI). The UMI are expected" + echo " to be located after the barcodes in the" + echo " forwards reads." + echo "" + echo " --limitBAMsortRAM" + echo " type: string" + echo " default: 10000000000" + echo "" + echo "Runtime arguments:" + echo " --runThreadN" + echo " type: integer" + echo " default: 1" + echo " Number of threads to use for a single STAR execution." + echo "" + echo "Output arguments:" + echo " --output" + echo " type: file, required parameter, multiple values allowed, output, file" + echo "must exist" + echo " default: ./*" + echo " A list of output folders which are the result of using STAR to map each" + echo " input FASTQ pair STAR to the reference genome." + echo " The order of the items DO NOT match with the order of the entries in the" + echo " barcodes FASTA file or the input FASTQ pairs." + echo "" + echo " --joblog" + echo " type: file, output, file must exist" + echo " default: execution_log.txt" + echo " Where to store the log file listing all the jobs." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "parallel_map updatecraftbox" + exit + ;; + --input_r1) + if [ -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_PAR_INPUT_R1="$2" + else + VIASH_PAR_INPUT_R1="$VIASH_PAR_INPUT_R1;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_r1. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_r1=*) + if [ -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_PAR_INPUT_R1=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_R1="$VIASH_PAR_INPUT_R1;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --input_r2) + if [ -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_PAR_INPUT_R2="$2" + else + VIASH_PAR_INPUT_R2="$VIASH_PAR_INPUT_R2;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input_r2. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input_r2=*) + if [ -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_PAR_INPUT_R2=$(ViashRemoveFlags "$1") + else + VIASH_PAR_INPUT_R2="$VIASH_PAR_INPUT_R2;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --genomeDir) + [ -n "$VIASH_PAR_GENOMEDIR" ] && ViashError Bad arguments for option \'--genomeDir\': \'$VIASH_PAR_GENOMEDIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMEDIR="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --genomeDir. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --genomeDir=*) + [ -n "$VIASH_PAR_GENOMEDIR" ] && ViashError Bad arguments for option \'--genomeDir=*\': \'$VIASH_PAR_GENOMEDIR\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_GENOMEDIR=$(ViashRemoveFlags "$1") + shift 1 + ;; + --barcodesFasta) + [ -n "$VIASH_PAR_BARCODESFASTA" ] && ViashError Bad arguments for option \'--barcodesFasta\': \'$VIASH_PAR_BARCODESFASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODESFASTA="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcodesFasta. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcodesFasta=*) + [ -n "$VIASH_PAR_BARCODESFASTA" ] && ViashError Bad arguments for option \'--barcodesFasta=*\': \'$VIASH_PAR_BARCODESFASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODESFASTA=$(ViashRemoveFlags "$1") + shift 1 + ;; + --umiLength) + [ -n "$VIASH_PAR_UMILENGTH" ] && ViashError Bad arguments for option \'--umiLength\': \'$VIASH_PAR_UMILENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMILENGTH="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --umiLength. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --umiLength=*) + [ -n "$VIASH_PAR_UMILENGTH" ] && ViashError Bad arguments for option \'--umiLength=*\': \'$VIASH_PAR_UMILENGTH\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMILENGTH=$(ViashRemoveFlags "$1") + shift 1 + ;; + --limitBAMsortRAM) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --limitBAMsortRAM. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --limitBAMsortRAM=*) + [ -n "$VIASH_PAR_LIMITBAMSORTRAM" ] && ViashError Bad arguments for option \'--limitBAMsortRAM=*\': \'$VIASH_PAR_LIMITBAMSORTRAM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_LIMITBAMSORTRAM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --runThreadN) + [ -n "$VIASH_PAR_RUNTHREADN" ] && ViashError Bad arguments for option \'--runThreadN\': \'$VIASH_PAR_RUNTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNTHREADN="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --runThreadN. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --runThreadN=*) + [ -n "$VIASH_PAR_RUNTHREADN" ] && ViashError Bad arguments for option \'--runThreadN=*\': \'$VIASH_PAR_RUNTHREADN\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_RUNTHREADN=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --joblog) + [ -n "$VIASH_PAR_JOBLOG" ] && ViashError Bad arguments for option \'--joblog\': \'$VIASH_PAR_JOBLOG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_JOBLOG="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --joblog. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --joblog=*) + [ -n "$VIASH_PAR_JOBLOG" ] && ViashError Bad arguments for option \'--joblog=*\': \'$VIASH_PAR_JOBLOG\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_JOBLOG=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/parallel_map:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_INPUT_R1+x} ]; then + ViashError '--input_r1' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_INPUT_R2+x} ]; then + ViashError '--input_r2' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_GENOMEDIR+x} ]; then + ViashError '--genomeDir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_BARCODESFASTA+x} ]; then + ViashError '--barcodesFasta' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_UMILENGTH+x} ]; then + ViashError '--umiLength' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then + VIASH_PAR_LIMITBAMSORTRAM="10000000000" +fi +if [ -z ${VIASH_PAR_RUNTHREADN+x} ]; then + VIASH_PAR_RUNTHREADN="1" +fi +if [ -z ${VIASH_PAR_JOBLOG+x} ]; then + VIASH_PAR_JOBLOG="execution_log.txt" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT_R1; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_INPUT_R2; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_GENOMEDIR" ] && [ ! -e "$VIASH_PAR_GENOMEDIR" ]; then + ViashError "Input file '$VIASH_PAR_GENOMEDIR' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_BARCODESFASTA" ] && [ ! -e "$VIASH_PAR_BARCODESFASTA" ]; then + ViashError "Input file '$VIASH_PAR_BARCODESFASTA' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_UMILENGTH" ]]; then + if ! [[ "$VIASH_PAR_UMILENGTH" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--umiLength' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_RUNTHREADN" ]]; then + if ! [[ "$VIASH_PAR_RUNTHREADN" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--runThreadN' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_PAR_OUTPUT" ]]; then + if ! [[ "$VIASH_PAR_OUTPUT" =~ \* ]]; then + ViashError '--output' has to be a path containing a wildcard, e.g. 'output_*.txt'. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi +if [ ! -z "$VIASH_PAR_JOBLOG" ] && [ ! -d "$(dirname "$VIASH_PAR_JOBLOG")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_JOBLOG")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + VIASH_TEST_INPUT_R1=() + IFS=';' + for var in $VIASH_PAR_INPUT_R1; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT_R1+=( "$var" ) + done + VIASH_PAR_INPUT_R1=$(IFS=';' ; echo "${VIASH_TEST_INPUT_R1[*]}") +fi +if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + VIASH_TEST_INPUT_R2=() + IFS=';' + for var in $VIASH_PAR_INPUT_R2; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_INPUT_R2+=( "$var" ) + done + VIASH_PAR_INPUT_R2=$(IFS=';' ; echo "${VIASH_TEST_INPUT_R2[*]}") +fi +if [ ! -z "$VIASH_PAR_GENOMEDIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENOMEDIR")" ) + VIASH_PAR_GENOMEDIR=$(ViashDockerAutodetectMount "$VIASH_PAR_GENOMEDIR") +fi +if [ ! -z "$VIASH_PAR_BARCODESFASTA" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODESFASTA")" ) + VIASH_PAR_BARCODESFASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODESFASTA") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_TEST_OUTPUT=() + IFS=';' + for var in $VIASH_PAR_OUTPUT; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_OUTPUT+=( "$var" ) + VIASH_CHOWN_VARS+=( "$var" ) + done + VIASH_PAR_OUTPUT=$(IFS=';' ; echo "${VIASH_TEST_OUTPUT[*]}") +fi +if [ ! -z "$VIASH_PAR_JOBLOG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_JOBLOG")" ) + VIASH_PAR_JOBLOG=$(ViashDockerAutodetectMount "$VIASH_PAR_JOBLOG") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_JOBLOG" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-parallel_map-XXXXXX").sh +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT_R1+x} ]; then echo "${VIASH_PAR_INPUT_R1}" | sed "s#'#'\"'\"'#g;s#.*#par_input_r1='&'#" ; else echo "# par_input_r1="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "${VIASH_PAR_INPUT_R2}" | sed "s#'#'\"'\"'#g;s#.*#par_input_r2='&'#" ; else echo "# par_input_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOMEDIR+x} ]; then echo "${VIASH_PAR_GENOMEDIR}" | sed "s#'#'\"'\"'#g;s#.*#par_genomeDir='&'#" ; else echo "# par_genomeDir="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODESFASTA+x} ]; then echo "${VIASH_PAR_BARCODESFASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_barcodesFasta='&'#" ; else echo "# par_barcodesFasta="; fi ) +$( if [ ! -z ${VIASH_PAR_UMILENGTH+x} ]; then echo "${VIASH_PAR_UMILENGTH}" | sed "s#'#'\"'\"'#g;s#.*#par_umiLength='&'#" ; else echo "# par_umiLength="; fi ) +$( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "${VIASH_PAR_LIMITBAMSORTRAM}" | sed "s#'#'\"'\"'#g;s#.*#par_limitBAMsortRAM='&'#" ; else echo "# par_limitBAMsortRAM="; fi ) +$( if [ ! -z ${VIASH_PAR_RUNTHREADN+x} ]; then echo "${VIASH_PAR_RUNTHREADN}" | sed "s#'#'\"'\"'#g;s#.*#par_runThreadN='&'#" ; else echo "# par_runThreadN="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\"'\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_JOBLOG+x} ]; then echo "${VIASH_PAR_JOBLOG}" | sed "s#'#'\"'\"'#g;s#.*#par_joblog='&'#" ; else echo "# par_joblog="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\"'\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\"'\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\"'\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\"'\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\"'\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\"'\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +set -eo pipefail + +# Check if wildcard character is present in output folder template +printf "Checking if output folder template (\$par_output) contains a single wildcard character '*'. " +output_glob_character="\${par_output//[^\\*]}" +if [[ "\${#output_glob_character}" -ne "1" ]]; then + echo "The value for --output must contain exactly one '*' character. Exiting..." + exit 1 +else + echo "Done, wildcard character found!" +fi + +# Split the delimited strings into arrays +IFS=';' read -r -a input_r1 <<< "\$par_input_r1" +IFS=';' read -r -a input_r2 <<< "\$par_input_r2" + +# Read barcodes FASTQ +# seqkit will make sure to take the leading non-whitespace as sequence identifier (ID) +# Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header. +readarray -t well_ids < <(seqkit seq --name "\$par_barcodesFasta" ) +readarray -t barcodes < <(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "\$par_barcodesFasta") + +# Function to test for unique values in array +function arrayContainsUniqueValues { + # Pass the argument by reference + local -n arr=\$1 + # Create a temporary associative array + # in order to use its uniqueness of keys + # 'declare' in a function is automatically local + declare -A uniq_tmp + for item in "\${arr[@]}"; do + uniq_tmp[\$item]=0 # assigning a placeholder + done + local unique_array_values=(\${!uniq_tmp[@]}) + if [ "\${#unique_array_values[@]}" -eq "\${#arr[@]}" ]; then + return + fi + false +} +arrayContainsUniqueValues barcodes +is_array_unique_exit_code=\$? +if ! (exit \$is_array_unique_exit_code); then + echo "The provided barcodes should be unique!" + echo "Values: \$par_barcodes" + exit 1 +fi + +# Check that the number of values provided for the fastq files are the same. +num_r1_inputs="\${#input_r1[@]}" +num_r2_inputs="\${#input_r2[@]}" + +if [ ! "\$num_r1_inputs" -eq "\$num_r2_inputs" ]; then + echo "The number of values for arguments "\\ + "'input_r1' (\$num_r1_inputs) and 'input_r2' (\$num_r2_inputs) "\\ + "should be the same." + exit 1 +else + echo "Checked if the same as the number of R1 FASTQ (\$num_r1_inputs) and R2 FASTQ files "\\ + "(\$num_r2_inputs) were provided. Seems OK!" +fi + +# Loop over the well IDs and match them to the input FASTQ files +# The FASTQ file names should have the format {well_id}_R(1|2).fastq, +# which is the output format that the cutadapt component uses for demultiplexing. +# sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order +# of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file). +declare -a sorted_input_r1=() +declare -a sorted_input_r2=() +for barcode_index in "\${!barcodes[@]}"; do + barcode="\${barcodes[\$barcode_index]}" + well_id="\${well_ids[\$barcode_index]}" + echo "Finding FASTQ files for barcode \${barcode}, well ID '\${well_id}'." + # The FASTQ files for a particular barcode must match the following regex: + input_file_regex="^\${well_id}_R[1-2]" + + for r1_index in "\${!input_r1[@]}"; do + r1_file_path=\${input_r1[\$r1_index]} + r2_file_path=\${input_r2[\$r1_index]} + # Get the file names from the full path + r1_file_name=\$(basename -- "\$r1_file_path") + r2_file_name=\$(basename -- "\$r2_file_path") + + # Check if the file names match the regex + if [[ \$r1_file_name =~ \$input_file_regex ]]; then + echo "Matched with \$r1_file_name and \$r2_file_name." + # If the R1 FASTQ file matched the regex, + # the R2 file must have also been matched + if ! [[ \$r2_file_name =~ \$input_file_regex ]]; then + echo "File \${r1_file_name} matched with regex \${input_file_regex} "\\ + "but \${r2_file_name} did not! Make sure that the order of "\\ + "the R1 and R2 input files match." + exit 1 + fi + # Add the + sorted_input_r1+=("\$r1_file_path") + sorted_input_r2+=("\$r2_file_path") + # Do not continue looking for more files for this barcode + # '2' to affect the *outer* loop (which indeed loops barcodes)! + continue 2 + fi + done + echo "Did not find FASTQ files files for well \${well_id}! "\\ + "Make sure that the input files have the correct file name format."\\ + "Input files: \${input_r1[@]}" + exit 1 +done + + +# Define the function that will be used to run a single job +function _run() { + local par_UMIlength="\$1" + local par_output="\$2" + local par_genomeDir="\$3" + local par_limitBAMsortRAM="\$4" + local par_runThreadN="\$5" + local barcode="\$6" + local input_R1="\$7" + local input_R2="\$8" + + local barcode_length="\${#barcode}" + local umi_start="\$((\$barcode_length + 1))" + + set -eo pipefail + + echo <<-EOF + Processing \$barcode + For the following inputs (lanes): + "\$star_readFilesIn + EOF + + echo "Writing barcode '\$barcode' to \$barcode.txt and using it as input". + # Note that there is no possible conflict between jobs here + # because the barcodes are unique (and the barcode is part of the name + # of the file). + echo "\$barcode" > "\$barcode.txt" + + local dir="\${par_output//\\*/\$barcode}/" + echo "Setting output for barcode '\$barcode' to '\$dir'." + mkdir -p "\$dir" + + # check if files are compressed + local TMPDIR=\$(mktemp -d "\$meta_temp_dir/parallel_map-\$barcode-XXXXXX") + function clean_up { + [[ -d "\$TMPDIR" ]] && rm -r "\$TMPDIR" + } + trap clean_up RETURN + + # Decompress the input files when needed + # NOTE: for some reason, using STAR's --readFilesCommand does not always work + # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html) + # and this requires a filesystem that supports this. Another cause might be that the input files + # are symlinks. When testing this, using '--readFilesCommand "zcat"' + # always produced empty BAM files, but also a succesfull exit code (0) so the problem is not reported. + # However, the logs showed the following error: "gzip -: unexpected end of file". + + function is_gzipped { + printf "Checking if input '\$1' (barcode '\$barcode') is gzipped... " + if file "\$1" | grep -q 'gzip'; then + echo "Done, detected compressed file." + return + fi + echo "Done, file does not need decompression." + false + } + + # Resolve symbolic links to actual file paths + input_R1=\$(realpath \$input_R1) + input_R2=\$(realpath \$input_R2) + + if is_gzipped \$input_R1; then + local compressed_file_name_r1="\$(basename -- \$input_R1)" + local uncompressed_file_r1="\$TMPDIR/\${compressed_file_name_r1%.gz}" + printf "Unpacking input to \$uncompressed_file_r1... " + zcat "\$input_R1" > "\$uncompressed_file_r1" + echo "Decompression done." + else + local uncompressed_file_r1="\$input_R1" + fi + + if is_gzipped \$input_R2; then + local compressed_file_name_r2="\$(basename -- \$input_R2)" + local uncompressed_file_r2="\$TMPDIR/\${compressed_file_name_r2%.gz}" + printf "Unpacking input to \$uncompressed_file_r2... " + zcat "\$input_R2" > "\$uncompressed_file_r2" + echo "Decompression done." + else + local uncompressed_file_r2="\$input_R2" + fi + + local n_input_lines_r1=\$(wc -l < "\$uncompressed_file_r1") + local n_input_lines_r2=\$(wc -l < "\$uncompressed_file_r2") + + printf "Checking if length of input file mates match. " + if (( \$n_input_lines_r1 != n_input_lines_r2 )); then + echo "The length of file \$input_R1 (\$n_input_lines_r1) does not match with \$input_R2 (\$n_input_lines_r2)" + return 1 + else + echo "Seems OK, \$n_input_lines_r1 input lines." + fi + echo "Starting STAR for barcode '\$barcode'" + # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length. + # By default in this mode, STAR will look for the cell barcode and the UMI int the last files specified with --readFilesIn + # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI. + # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires the clipping + # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do. + STAR \\ + --readFilesIn "\$uncompressed_file_r2" "\$uncompressed_file_r1" \\ + --soloType Droplet \\ + --quantMode GeneCounts \\ + --genomeLoad LoadAndKeep \\ + --limitBAMsortRAM "\$par_limitBAMsortRAM" \\ + --runThreadN "\$par_runThreadN" \\ + --outFilterMultimapNmax 1 \\ + --outSAMtype BAM SortedByCoordinate \\ + --soloCBstart 1 \\ + --readFilesType "Fastx" \\ + --soloCBlen "\$barcode_length" \\ + --soloUMIstart "\$umi_start" \\ + --soloUMIlen "\$par_UMIlength" \\ + --soloBarcodeReadLength 0 \\ + --soloStrand Unstranded \\ + --soloFeatures Gene \\ + --genomeDir "\$par_genomeDir" \\ + --outReadsUnmapped Fastx \\ + --outSAMunmapped Within \\ + --outSAMattributes NH HI nM AS CR UR CB UB GX GN \\ + --soloCBwhitelist "\$barcode.txt" \\ + --outFileNamePrefix "\$dir" \\ + --outTmpDir "\$TMPDIR/STARtemp/" + + printf "Done running STAR. " + # Check if the number of processed reads is equal to the number of input reads + local n_input_reads=\$((\$n_input_lines_r1 / 4)) + local nr_output_reads=\$(grep -Po "Number\\ of\\ input\\ reads \\\\|\\W*\\K\\d+" "\$dir/Log.final.out") + if (( \$nr_output_reads != \$n_input_reads )); then + echo "Not all input reads were processed for barcode \$barcode." + return 1 + else + echo "Processed \$nr_output_reads reads for barcode \$barcode". + fi + + printf "Making sure that the output has the proper permissions." + find "\$dir" -type d -exec chmod o+x {} \\; + chmod -R o+r "\$dir" + echo "Done" +} + +# Export the function - requires bash +export -f _run + +# Load reference genome +echo "Loading reference genome" +STAR --genomeLoad LoadAndExit --genomeDir "\$par_genomeDir" + +# Run the concurrent jobs using GNU parallel + +# Make sure that parallel uses the correct shell +export PARALLEL_SHELL="/bin/bash" + +# Some notes: +# --halt now,fail=1: instruct parallel to exit when a job has failed and kill remaining running jobs. +# +# ::: is a special syntax for GNU parallel to delineate inputs +# If multiple ::: are given, each group will be treated as an input source, and all combinations of input +# sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c) +# The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input +# sources will be read. +parallel_cmd=("parallel" "--jobs" "80%" "--verbose" "--memfree" "2G" + "--tmpdir" "\$meta_temp_dir" + "--retry-failed" "--retries" "4" "--halt" "soon,fail=1" + "--joblog" "\$par_joblog" "_run" "{}") + +# Arguments for which there is one value, so these will not create extra jobs +parallel_cmd+=(":::" "\$par_umiLength" ":::" "\$par_output" ":::" "\$par_genomeDir" ":::" "\$par_limitBAMsortRAM" ":::" "\$par_runThreadN") + +# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected +# Thus, these argument lists should have the same length. +parallel_cmd+=(":::" "\${barcodes[@]}" ":::+" "\${sorted_input_r1[@]}" ":::+" "\${sorted_input_r2[@]}") + +set +eo pipefail +"\${parallel_cmd[@]}" +exit_code=\$? +set -eo pipefail + +echo "GNU parallel finished!" + +# Unload reference +printf "Unloading reference genome. " +STAR --genomeLoad Remove --genomeDir "\$par_genomeDir" +echo "Done!" + +# Exit code from GNU parallel: +# If fail=1 is used, the exit status will be the exit status of the failing job. +echo "Checking exit code" +if ((exit_code>0)); then + # Note that the ending HERE must be indented with TAB characters (not spaces) + # in order to remove leading indentation + MESSAGE=\$( + cat <<-HERE + ================================================================== + + !!! An error occurred for one of the jobs. + Exit code of the failing job: \$exit_code. + + %s + + ================================================================== + + HERE + ) + printf "\$MESSAGE" "\$(<\$par_joblog)" + exit 1 +else + cat <<-HERE + ================================================================== + + Mapping went fine (exit code '\$exit_code'), zero errors occurred + + ================================================================== + HERE + +fi +VIASHMAIN +bash "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT_R1" ]; then + unset VIASH_TEST_INPUT_R1 + IFS=';' + for var in $VIASH_PAR_INPUT_R1; do + unset IFS + if [ -z "$VIASH_TEST_INPUT_R1" ]; then + VIASH_TEST_INPUT_R1="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT_R1="$VIASH_TEST_INPUT_R1;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT_R1="$VIASH_TEST_INPUT_R1" + fi + if [ ! -z "$VIASH_PAR_INPUT_R2" ]; then + unset VIASH_TEST_INPUT_R2 + IFS=';' + for var in $VIASH_PAR_INPUT_R2; do + unset IFS + if [ -z "$VIASH_TEST_INPUT_R2" ]; then + VIASH_TEST_INPUT_R2="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_INPUT_R2="$VIASH_TEST_INPUT_R2;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_INPUT_R2="$VIASH_TEST_INPUT_R2" + fi + if [ ! -z "$VIASH_PAR_GENOMEDIR" ]; then + VIASH_PAR_GENOMEDIR=$(ViashDockerStripAutomount "$VIASH_PAR_GENOMEDIR") + fi + if [ ! -z "$VIASH_PAR_BARCODESFASTA" ]; then + VIASH_PAR_BARCODESFASTA=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODESFASTA") + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_PAR_JOBLOG" ]; then + VIASH_PAR_JOBLOG=$(ViashDockerStripAutomount "$VIASH_PAR_JOBLOG") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && ! compgen -G "$VIASH_PAR_OUTPUT" > /dev/null; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_JOBLOG" ] && [ ! -e "$VIASH_PAR_JOBLOG" ]; then + ViashError "Output file '$VIASH_PAR_JOBLOG' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/report/create_report/.config.vsh.yaml b/target/executable/report/create_report/.config.vsh.yaml new file mode 100644 index 00000000..190e2cbc --- /dev/null +++ b/target/executable/report/create_report/.config.vsh.yaml @@ -0,0 +1,268 @@ +name: "create_report" +namespace: "report" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "author" + - "maintainer" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--eset" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output_report" + info: null + example: + - "report.html" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "r_script" + path: "template.Rmd" + is_executable: true +- type: "r_script" + path: "plateLayouts.R" + is_executable: true +- type: "file" + path: "OutputSTARsolo.png" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a basic QC report in HTML format based on a number of esets.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pandoc" + interactive: false + - type: "r" + script: + - "install.packages(\"BiocManager\")" + - "BiocManager::install(version = \"3.21\", type = \"source\", checkBuilt = TRUE)" + bioc_force_install: false + warnings_as_errors: true + - type: "r" + cran: + - "ggplot2" + - "knitr" + - "gridExtra" + - "RColorBrewer" + - "processx" + - "whisker" + - "rmarkdown" + - "bookdown" + - "data.table" + - "platetools" + - "htmltools" + - "DT" + - "logger" + - "bit64" + bioc: + - "Biobase" + - "ComplexHeatmap" + script: + - "install.packages(\"oaStyle\", repos = c(rdepot = \"https://repos.openanalytics.eu/repo/public\"\ + , getOption(\"repos\")))" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + packages: + - "testthat" + - "R.utils" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/report/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/report/create_report" + executable: "target/executable/report/create_report/create_report" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/report/create_report/OutputSTARsolo.png b/target/executable/report/create_report/OutputSTARsolo.png new file mode 100644 index 00000000..cb77d8e0 Binary files /dev/null and b/target/executable/report/create_report/OutputSTARsolo.png differ diff --git a/target/executable/report/create_report/_viash.yaml b/target/executable/report/create_report/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/report/create_report/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/report/create_report/create_report b/target/executable/report/create_report/create_report new file mode 100755 index 00000000..05510868 --- /dev/null +++ b/target/executable/report/create_report/create_report @@ -0,0 +1,1191 @@ +#!/usr/bin/env bash + +# create_report updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (maintainer) +# * Marijke Van Moerbeke (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="create_report" +VIASH_META_FUNCTIONALITY_NAME="create_report" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM rocker/r2u:24.04 +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps pandoc && \ + rm -rf /var/lib/apt/lists/* + +RUN Rscript -e 'options(warn = 2); install.packages("BiocManager")' && \ + Rscript -e 'options(warn = 2); BiocManager::install(version = "3.21", type = "source", checkBuilt = TRUE)' + +RUN Rscript -e 'options(warn = 2); if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("Biobase", quietly = TRUE)) BiocManager::install("Biobase")' && \ + Rscript -e 'options(warn = 2); if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) BiocManager::install("ComplexHeatmap")' && \ + Rscript -e 'options(warn = 2); remotes::install_cran(c("ggplot2", "knitr", "gridExtra", "RColorBrewer", "processx", "whisker", "rmarkdown", "bookdown", "data.table", "platetools", "htmltools", "DT", "logger", "bit64"), repos = "https://cran.rstudio.com")' && \ + Rscript -e 'options(warn = 2); install.packages("oaStyle", repos = c(rdepot = "https://repos.openanalytics.eu/repo/public", getOption("repos")))' + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component report create_report" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "create_report updatecraftbox" + echo "" + echo "Create a basic QC report in HTML format based on a number of esets." + echo "" + echo "Arguments:" + echo " --eset" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo "" + echo " --output_report" + echo " type: file, required parameter, output, file must exist" + echo " example: report.html" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "create_report updatecraftbox" + exit + ;; + --eset) + if [ -z "$VIASH_PAR_ESET" ]; then + VIASH_PAR_ESET="$2" + else + VIASH_PAR_ESET="$VIASH_PAR_ESET;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --eset. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --eset=*) + if [ -z "$VIASH_PAR_ESET" ]; then + VIASH_PAR_ESET=$(ViashRemoveFlags "$1") + else + VIASH_PAR_ESET="$VIASH_PAR_ESET;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output_report) + [ -n "$VIASH_PAR_OUTPUT_REPORT" ] && ViashError Bad arguments for option \'--output_report\': \'$VIASH_PAR_OUTPUT_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_REPORT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output_report. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output_report=*) + [ -n "$VIASH_PAR_OUTPUT_REPORT" ] && ViashError Bad arguments for option \'--output_report=*\': \'$VIASH_PAR_OUTPUT_REPORT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT_REPORT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/report/create_report:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_ESET+x} ]; then + ViashError '--eset' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT_REPORT+x} ]; then + ViashError '--output_report' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_ESET" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_ESET; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT_REPORT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT_REPORT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT_REPORT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_ESET" ]; then + VIASH_TEST_ESET=() + IFS=';' + for var in $VIASH_PAR_ESET; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_ESET+=( "$var" ) + done + VIASH_PAR_ESET=$(IFS=';' ; echo "${VIASH_TEST_ESET[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT_REPORT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT_REPORT")" ) + VIASH_PAR_OUTPUT_REPORT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT_REPORT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT_REPORT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-create_report-XXXXXX").R +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "eset" = $( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_ESET" | sed "s#['\\]#\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "output_report" = $( if [ ! -z ${VIASH_PAR_OUTPUT_REPORT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_REPORT" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\]#\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\]#\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END +library(whisker) +library(logger) + +log_info("Setting temporary directory to: {meta\$temp_dir}") +Sys.setenv(TMP = meta\$temp_dir) +temp_folder <- tempdir(check = TRUE) +log_info("Created temporary directory {temp_folder}") + +template <- file.path(meta\$resources_dir, "template.Rmd") + +esets_normalized <- lapply(par\$eset, function(eset_path) { + return(file.path(normalizePath(dirname(eset_path)), basename(eset_path))) +}) + +log_info(paste0( + "Rendering markdown {template} to HTML ", + "{par\$output_report} with esets {paste(esets_normalized, collapse = ', ')}" +)) + +rmarkdown::render( + normalizePath(template), + output_file = basename(par\$output_report), + output_dir = dirname(par\$output_report), + runtime = "static", + intermediates_dir = par\$report_dir, + clean = TRUE, + params = list( + esets = esets_normalized, + outputDir = par\$report_dir + ) +) + +log_info("Done") +VIASHMAIN +Rscript "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_ESET" ]; then + unset VIASH_TEST_ESET + IFS=';' + for var in $VIASH_PAR_ESET; do + unset IFS + if [ -z "$VIASH_TEST_ESET" ]; then + VIASH_TEST_ESET="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_ESET="$VIASH_TEST_ESET;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_ESET="$VIASH_TEST_ESET" + fi + if [ ! -z "$VIASH_PAR_OUTPUT_REPORT" ]; then + VIASH_PAR_OUTPUT_REPORT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT_REPORT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT_REPORT" ] && [ ! -e "$VIASH_PAR_OUTPUT_REPORT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT_REPORT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/report/create_report/nextflow_labels.config b/target/executable/report/create_report/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/report/create_report/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/report/create_report/plateLayouts.R b/target/executable/report/create_report/plateLayouts.R new file mode 100755 index 00000000..a2909d4a --- /dev/null +++ b/target/executable/report/create_report/plateLayouts.R @@ -0,0 +1,447 @@ + +#' Displays the annotation of the wells in a plateLayout +#' @param plateData a data.table object containing the information +#' of the plate. This must contain a "WellID". +#' @param plateName The plate name +#' @param valueVariable The name of the variable in 'plateData' to +#' be visualized in a plate layout. +#' @param textVariable The name of the variable in 'plateData' to be +#' shown in the wells of the plate layout. If NULL, the valueVariable +#' is shown. +#' @param colours A named character vector containing the colours +#' for the different levels of the valuevariable. The names should +#' correspond to the dose levels. if not specified, a scheme of blues +#' will be provided. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows and +#' colums in a plate, e.g. \code{c(16,24)} +#' @param legend.title A title for the legend +#' @param plot.title A title for the plot, will be contracted +#' with the plate name +#' @param ... additional arguments for \code{plateLayout.default} function +#' @import data.table +#' @importFrom platetools fill_plate +#' @export +plateLayout.annotation <- function( + plateData, + plateName = character(), + valueVariable = "Dose", + textVariable = NULL, + breaks = NULL, colours = NULL, + colourWellText = "black", + layout = c(16, 24), + legend.title = "Dose", + plot.title = "Plate Annotation - ", + textFontSize = 9, ... +) { + WellID <- Label <- NULL + + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + #Check WellID Format + checkWellID <- grepl("^[[:upper:]]{1,2}[[:digit:]]{1,2}$", plateData$WellID) + if(!all(checkWellID)){ + stop("WellID does not have the correct format") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = layout[1]*layout[2]) + + plateData$column <- factor( + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + if (!is.null(valueVariable)){ + plateData[, values := as.character(plateData[, ..valueVariable][[1]])] + valueVar <- "values" + }else{ + plateData[, values := "grey"] + valueVar <- "values" + colours <- setNames("grey", "grey") + } + + + if (is.null(colours)) { + + blues <- colorRampPalette(c("#d6e0ff", "#2171B5")) + greens <- colorRampPalette(c("light green", "dark green")) + + numLevels <- sort(as.numeric(as.character(unique(plateData[, values])[ + grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[, values])) + ) + ]))) + otherLevels <- sort(as.character(unique(plateData[, values])[ + !grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[,values])) + ) + ])) + + colours <- c(blues(length(numLevels)), greens(length(otherLevels)), "red") + names(colours) <- c(numLevels, otherLevels, "failed") + } + + if (!is.null(textVariable)) { + plateData[, + Label := do.call(paste, c(.SD, sep = "\n ")), + .SDcols = textVariable + ] + plateData[, Label := gsub("-", "-\n", Label)] + plateData[, Label := gsub("_", "_\n", Label)] + textVar <- "Label" + } else { + textVar <- NULL + } + + + if (is.null(breaks)){ + breaks <- seq_len(length(colours)) + } + + plateLayout( + plateData = plateData, valueVariable = valueVar, + textVariable = textVar, plateName = plateName, + breaks = breaks, colourWellText = colourWellText, + legend.title = legend.title, layout = layout, + colours = colours, plot.title = plot.title, + textFontSize = textFontSize, ... + ) +} + + + +#' Create a heatmap of values in a plateLayout view. The values can be +#' library sizes, number of genes, qcScore (0/1) or a factor. +#' @param plateData A data.table of the values to be visualized with +#' at least the column of interest (specified in 'varOfInterest') +#' and a 'WellID' column indicating the wells in the plate. The WellID +#' is a combination of a letter (row in the plate) and an integer +#' (column in the plate). +#' @param valueVariable The name of the variable in 'plateData' +#' to be visualized in a plate layout +#' @param textVariable The name of the variable in 'plateData' +#' to be shown in the wells of the plate layout. Defaults to the +#' valueVariable and if NULL, no text will be displayed. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colours Colours to be used for levels specified by +#' the breaks. If NULL, a colour scheme of purples is shown. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows +#' and colums in a plate, e.g. \code{c(16,24)} +#' @param makeContourColours Logical, whether or not the plate +#' layout will contain a contour colours for the wells based on the +#' parameters in 'contourColours' and 'categories' +#' @param contourVariable The variable used for the contour colouring +#' @param contourColours Character vector specifying a colour for +#' each range in 'categories' +#' @param labelsCategories Character vector specifying the names +#' (labels) for each range in 'categories' +#' @param categories if contour Variable is not a factor, a numeric +#' vector specifying the categories to divide the 'varOfInterest', +#' including the lower and upper limits. +#' @param plateName The plate name +#' @param plot.title A title for the plot, will be contracted with +#' the plate name +#' @param legend.title A title for the legend +#' @param displayHeatmap Logical, whether to display the plateLayout heatmap +#' @param saveHeatmap Logical, whether to save the plateLayout heatmap +#' @param outputDir The directory where the plateLayout heatmap should be saved +#' @param prefix The prefix to the file name of the saved plateLayout heatmap +#' @param ... additional arguments for \code{ComplexHeatmap::Heatmap} function +#' @importFrom platetools fill_plate +#' @importFrom RColorBrewer brewer.pal +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom circlize colorRamp2 +#' @importFrom grid grid.text grid.rect gpar legendGrob gpar +#' @importFrom grDevices dev.off png +#' @importFrom graphics title +#' @export +plateLayout <- function( + plateData, valueVariable, textVariable = valueVariable, + breaks = NULL, colours = NULL, colourWellText = "white", textFontSize = 6, + layout = c(16, 24), makeContourColours = FALSE, contourVariable = character(), + contourColours = c("red", "orange", "seagreen3"),lwdContours = c(1, 1, 1), + labelsCategories = c('1', '2', '3'), categories = NULL, plateName = character(), + plot.title = character(), legend.title = NULL, legendFontSize = 15, + row_split = rep("A", 16), col_split = rep("A", 24), legendFontSizeTitle = 15, + displayHeatmap = TRUE, saveHeatmap = FALSE, outputDir = ".", prefix = "" +) { + WellID <- NULL + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = 384) + + plateData$column <- factor( + sprintf("%02d", as.numeric( + sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID) + )), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + + plateValues <- plateLayoutFormat( + plateData, + varOfInterest = valueVariable, + rows = layout[1], + cols = layout[2] + ) + if (!is.null(textVariable)) { + plateText <- plateLayoutFormat( + plateData, varOfInterest = textVariable, + rows = layout[1], + cols = layout[2] + ) + } + plot.title <- gsub( + "^([a-z])", "\\U\\1", + gsub("([A-Z])", " \\1", + plot.title, perl = TRUE), perl = TRUE + ) + mainTitle <- paste0(plot.title, plateName) + plateContourColours <- matrix("", nrow = layout[1], ncol = layout[2]) + + if (makeContourColours) { + contourData <- plateData[WellType %in% c("nonEmpty", "Treated Wells"), ] + + if (is.numeric(contourData[, ..contourVariable][[1]])) { + contourData$contours <- cut( + contourData[, ..contourVariable][[1]], + categories, left = TRUE, + right = TRUE, + labels = labelsCategories) + } + else { + contourData$contours <- contourData[, ..contourVariable][[1]] + } + names(contourColours) <- labelsCategories + names(lwdContours) <- labelsCategories + for (i in seq_len(layout[1])) { + for (j in seq_len(layout[2])) { + tryCatch({ + sampleHit <- which( + as.character(contourData$WellID) == paste0( + LETTERS[i], sprintf("%02d", j) + ) + ) + if (length(sampleHit) == 1) { + plateContourColours[i, j] <- as.character( + contourData[sampleHit,'contours'][[1]] + ) + } + }, + error = function(e) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + ) + } + } + } + + plateValues$contours <- plateContourColours + colnames(plateValues$values) <- seq_len(ncol(plateValues$values)) + + if (is.null(breaks)) { + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + } + + if (is.null(colours)) { + colours <- tryCatch({ + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + }, + error = function(cond){ + + message("Recomputed breaks for proper colour mapping") + + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + + }) + } + + ht <- Heatmap( + plateValues$values, + column_title = mainTitle, column_title_side = "top", + rect_gp = gpar(lwd = 0.4), + cluster_rows = FALSE, cluster_columns = FALSE, + col = colours, row_title = NULL, + row_split = row_split, column_split = col_split, + row_names_side = "left", + cluster_row_slices = FALSE, + cluster_column_slices = FALSE, + show_heatmap_legend = TRUE, + heatmap_legend_param = list( + title = ifelse( + is.null(legend.title), + paste0(valueVariable, "\n"), + paste0(legend.title, "\n") + ), + grid_height = unit(9, "mm"), border = "black", + labels_gp = gpar(fontsize = legendFontSize), + title_gp = gpar(fontsize = legendFontSizeTitle) + ), + cell_fun = function(j, i, x, y, width, height, fill) { + if (is.na(plateValues$values[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar(fill = "white", alpha = 0.7, lwd = 0.7, col = "white") + ) + } + else if (!is.null(textVariable)) { + grid.text( + plateText$values[i, j], x, y, + just = "centre", + gp = gpar(fontsize = textFontSize, col = colourWellText) + ) + } + if (makeContourColours) { + if (!is.na(plateValues$contours[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar( + col = contourColours[as.character(plateValues$contours[i, j])], + fill = NA, + lwd = lwdContours[as.character(plateValues$contours[i, j])] + ) + ) + } + } + } + ) + + if (displayHeatmap) { + print(ht) + } + if (saveHeatmap) { + png( + file.path( + outputDir, + paste0(prefix,gsub(" |-", "",plot.title), "_", plateName, ".png") + ), + width = 30, height = 10, units = "cm", res = 1200 + ) + print(ht) + dev.off() + } + + return(ht) +} + + +#' Return numerical matrix with number of reads that corresponds to the +#' plate layout +#' @param data A data.frame of the values to be visualized with at least +#' the columnof interest (specified in 'varOfInterest') and a 'WellID' column +#' indicating the wells in the plate. The WellID is a combination of a +#' letter (row in the plate) and an integer (column in the plate). +#' @param varOfInterest The name of the variable in 'data' to be visualized +#' in a plate layout +#' @param rows number of rows in a plate layout +#' @param cols number of columns in a plate layout +#' @param verbose if \code{TRUE}, samples missing from the plate +#' will be reported +#' @export +plateLayoutFormat <- function( + data, varOfInterest, + rows = 16, cols = 24, + verbose = FALSE +) { + plateValues <- matrix(NA, nrow = rows, ncol = cols) + for (i in seq_len(rows)) { + for (j in seq_len(cols)) { + tryCatch({ + sampleHit <- which( + as.character(data$WellID) == paste0(LETTERS[i], sprintf("%02d", j)) + ) + if(length(sampleHit) == 1){ + plateValues[i, j] <- data[sampleHit, ..varOfInterest][[1]] + } + }, + error = function(e) { + if (verbose == TRUE) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + } + ) + } + } + + row.names(plateValues) <- LETTERS[1:rows] + return(list("values" = plateValues)) +} + + + +#' Helper function to automate break selection for raw count data +#' +#' This function creates an exponentially increasing vector for given number +#' breaks between zero and some element of choice. It is particularly useful for +#' raw counts or raw counts per million. +#' +#' @param nBreaks Number of breaks to be generated +#' @param maxElement Maximum value of data entries +#' @export +computeBreaks <- function(nBreaks, variable) { + + maxElement <- max(variable, na.rm = TRUE) + if (length(unique(variable)) == 1) { + breaks <- c(0, 0.5, ifelse(maxElement < 1, 1, maxElement)) + } else { + coefSystem <- solve( + rbind(c(1, 1), c(1, (nBreaks - 1)))) %*% c(0, log(maxElement) + ) + coefExp <- c(exp(coefSystem[1]), coefSystem[2]) + breaks <- coefExp[1] * exp((1:(nBreaks - 1)) * coefExp[2]) + breaks <- unique(c(0, breaks)) + } + return(breaks) +} diff --git a/target/executable/report/create_report/template.Rmd b/target/executable/report/create_report/template.Rmd new file mode 100755 index 00000000..20f8c190 --- /dev/null +++ b/target/executable/report/create_report/template.Rmd @@ -0,0 +1,977 @@ +--- +title: "Exploratory Data Report" +date: "`r format(Sys.time(), '%d %B, %Y')`" +editor_options: + chunk_output_type: console +output: + oaStyle::html_report +# parameters which are overwritten by the script +params: + outputDir: 'output/' + esets: + - sample1.rds + - sample2.rds +--- + + + + + + + + + +```{r params, eval = TRUE, include = FALSE} +outputDir <- params$outputDir +esets <- params$esets +``` + + +```{r outputDir, echo = FALSE} +## Required: ABSOLUTE outputDir +outputDir <- file.path(outputDir) + +# When working on a windows computer it should be +# "/Users/..." instead of "C:/Users/..." +if (.Platform$OS.type == "windows") { + outputDir <- paste0( + "/", + paste( + unlist(strsplit(outputDir, split = "/"))[-1], collapse = "/" + ), + "/" + ) +} +``` + + + + +```{r optionsChunkDoNotModify, echo = FALSE, message = FALSE, warning=FALSE} + +## Chunk with options for knitr. This chunk should not be modified. +knitr::opts_chunk$set( + eval = TRUE, + echo = FALSE, + message = FALSE, + cache = FALSE, + warning = FALSE, + error = FALSE, + comment = NA, #"#", + tidy = FALSE, + collapse = TRUE, + out.width = "100%", + fig.width = 20, + fig.height = 10, + results = "asis") + +knitr::opts_knit$set(root.dir = getwd()) + +options(warn = 1, width = 200) + +``` + +```{r libraries_and_functions} +source("plateLayouts.R") +library(ComplexHeatmap) +library(data.table) +library(ggplot2) +library(knitr) +library(Biobase) +library(gridExtra) +library(RColorBrewer) +``` + + +```{r dataImport} + +# Create esetList +esetList <- sapply( + esets, simplify = FALSE, + USE.NAMES = TRUE, + function(eset_raw) { + if (!file.exists(eset_raw)) { + stop(paste0("Provided path '", eset_raw, "' is not a file.")) + } + eset <- readRDS(eset_raw) + } +) +pools <- sapply(esetList, function(eset) { + unique(eset$PoolName) +}) +names(esetList) <- unlist(pools) + +# Create qcData +pDataList <- lapply(esetList, function(eset) data.table(pData(eset))) +qcData <- rbindlist(pDataList, fill = TRUE) + +textVars <- "SampleName" +annotationVar <- "PoolName" + +if (!"SampleName" %in% names(qcData)) { + qcData[, SampleName := paste0(PoolName, "_", WellBC)] +} +qcData[, log10LibSize := round(log10(NumberOfInputReads))] +qcData[, (annotationVar) := lapply(.SD, as.factor), .SDcols = annotationVar] + + +colourList <- list() +Design_levels <- sort( + as.character(unique(qcData[, ..annotationVar][[1]])), + decreasing = TRUE +) + +if (length(Design_levels) == 1) { + colours <- c("#d6e0ff", "lightgrey") + names(colours) <- c(Design_levels, "Empty") + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +}else if (length(Design_levels) == 2) { + colours <- c("#d6e0ff", "#FF9999") + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else if (length(Design_levels) <= 20) { + + if (length(Design_levels) > 12) { + colours <- c( + brewer.pal(12, "Set3"), + brewer.pal((length(Design_levels) - 12), + "Pastel2") + ) + } else { + colours <- c(brewer.pal(length(Design_levels), "Set3")) + } + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else { + colours <- c("#d6e0ff") + names(colours) <- c("nonEmpty") + colourList[[annotVar]] <- list( + "colours" = colours, + "annotVar" = annotVar, + "text" = annotVar + ) +} +``` + +# Pool Description + +Per pool within this study, there are several pool layout plots shown, based on the + +* number of STAR input reads (= library size) + +* log10 transformed number of STAR input reads + +* number of detected UMIs + +* number of detected genes + +* number of chromosomal reads + +* percentage of ERCC + +* percentage of mitochondria + + +> The values for the different samples within each pool is expected to be comparable if the content of the different pools is equally diverse. + +```{r plateAnnotation, out.width = "100%",fig.width = 20, fig.height= 10} + +plateVars <- c("NumberOfInputReads", "log10LibSize", "NumberOfMappedReads", + "NumberOfChromReads", "NumberOfUMIs", "NumberOfGenes", + "pctMT", "pctERCC") + +breaksVars <- lapply( + plateVars, + function(var) { + computeBreaks(7, qcData[, ..var]) + } +) +names(breaksVars) <- plateVars + +for (pool in pools){ + cat("\n\n") + cat(paste0("## ", pool, " {.tabset} \n\n")) + poolData <- qcData[PoolName == pool] + lapply(plateVars, function(plateVar) { + cat("\n\n") + cat(sprintf("### %s {.unnumbered}", plateVar)) + cat("\n\n") + plateLayout( + poolData, valueVariable = plateVar, + textFontSize = 10, legendFontSize = 12, + plateName = pool, plot.title = "libSize - ", + legend.title = "libSize", breaks = breaksVars[[plateVar]] + ) + cat("\n\n") + }) + cat("\n\n") +} +``` + +
+ + +# Data Distributions + + +## Reads Distributions {.tabset} + +The 4 box plots below represent the distributions per pool of the different samples based on: + +* the number of STAR input reads + +* the number of STAR mapped reads + +* the percentage of STAR mapped reads + +* the number of detected genes + +> The distributions contribute to the QC metrics mentioned in Par 3. The higher these values, the better. +> The data range for the different plates is expected to be comparable if the content of the different plates is equally diverse. + + +### Number of Input Reads {.tabset .unnumbered} + +```{r settings_1} + +nColPlots = 1 +figHeight = 7 + +``` + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_input_plate, fig.height = figHeight} +ggplot( + qcData, + aes( + x = PoolName, + y = NumberOfInputReads, colour = PoolName + ) +) + geom_boxplot() + ylab("Number of Input Reads") + + ggtitle("Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + + +### Number of Mapped Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_mapped_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfMappedReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +#### pct Mapped Reads {.unnumbered} + +```{r boxplots_pctMapped_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = PctMappedReads, colour = PoolName) +) + + geom_boxplot() + + ylab("pct Mapped Reads") + + ggtitle("pct Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of Chromosomal Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfChromReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Chromosomal Reads") + + ggtitle("Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### pct Chromosomal Reads {.unnumbered} + +```{r boxplots_pctChrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctChrom, colour = PoolName) +) + geom_boxplot() + ylab("pct Chromosomal Reads") + + ggtitle("pct Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of UMIs {.tabset .unnumbered} + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_umi_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfUMIs, colour = PoolName) +) + geom_boxplot() + ylab("Number of UMIs") + + ggtitle('Number of UMIs') + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### Density distribution {.unnumbered} + +```{r density_numberOfUMIs} + +## Pre-filtering data exploration +dt_plot <- melt( + qcData, + id.vars = c("SampleName", "PoolName", "WellID"), + measure.vars = c("NumberOfInputReads", "NumberOfMappedReads", "NumberOfUMIs") +) + +readsDensity_plot <- ggplot(dt_plot, aes(value)) +readsDensity_plot <- readsDensity_plot + + geom_density(aes(fill = variable), alpha=0.8) + + facet_grid(~ PoolName, scales = "free_x", space = "fixed", drop = TRUE) + + geom_vline( + xintercept = 5e5, + linetype = "dashed", + color = "steelblue3", size = 2 + ) + + annotate( + "text", + x = 3.5e5, y = 2e-6, label = "500k", + angle = 90, color = "steelblue3", size = 10 + ) + + geom_vline( + xintercept = 1.5e6, linetype = "dashed", + color = "forestgreen", size = 2 + ) + + annotate( + "text", x = 1.35e6, y = 2e-6, label = "1.5M", + angle = 90, color = "forestgreen", size = 10 + ) + + labs( + title = "Density plot", + subtitle = paste0( + "# Samples with NumberOfMappedReads > 1.5M: ", + length(which(qcData$NumberOfMappedReads > 1.5e6)), + "\n# Samples with NumberOfUMIs > 500k: ", + length(which(qcData$NumberOfUMIs > 5e5)) + ), + caption = paste0("# Total samples (after removing empty): ", nrow(qcData)), + x = "Count", + fill = "Variable" + ) + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 5), + axis.text.x = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + plot.subtitle = element_text(size = 17), + plot.caption = element_text(size = 15), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.y = element_blank() + ) +readsDensity_plot + +``` + +### Number of Genes {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_genes_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = NumberOfGenes, colour = PoolName) +) + + geom_boxplot() + ylab("Number of Genes") + + ggtitle("Number of Genes") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +## {.tabset .toc-ignore .unnumbered} + + +In addition, several plots are shown visualizing the efficiency of the reads-to-genes translation: + +* the number of input reads vs the number of mapped reads + +* the number of chromosomal reads vs the number of mapped reads + +* the number of mapped reads per UMI vs the number of mapped reads + +* the number of UNI vs the number of mapped reads + +* the number of mapped reads vs the number of genes + +* the number of chromosomal reads vs the number of genes + +* the number of mapped reads per UMI vs the number of genes + +### Mapping Efficiency {.tabset .unnumbered} + +#### Number of Input Reads {.unnumbered} + +```{r mapping_efficiency_1_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = NumberOfMappedReads, colour = PoolName) +) + + geom_point() + + xlab("Number of Input Reads") + + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads vs Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + +#### Number of Chromosomal Reads {.unnumbered} + +```{r mapping_efficiency_2_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + xlab("Number of Chromosomal Reads") + ylab("Number of Mapped Reads") + + ggtitle("Number of Chromosomal Reads vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) + +``` + + +#### Number of UMI {.unnumbered} + +```{r mapping_efficiency_4_plate, fig.height = 7} + +ggplot( + qcData, + aes(x =NumberOfUMIs, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + ylab("Number of Mapped Reads") + xlab("Number of UMIs ") + + ggtitle("Number of UMIs vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + +### Counting Efficiency {.tabset .unnumbered} + +#### Number of Mapped Reads {.unnumbered} + +```{r gene_efficiency_1_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfMappedReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Mapped Reads") + + ggtitle("Number of Genes vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +#### Number of Chromosomal Reads {.unnumbered} + +```{r gene_efficiency_2_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Chromosomal Reads") + + ggtitle("Number of Genes vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + + + +## Sequencing Saturation {.tabset} + +The barplots below represent the sequencing saturation per sample as determined by STAR, split per pool. +The HT-RNAseq platform aims for shallow sequencing resulting in relatively low sequencing saturations of 10-20%. +In addition, the sequencing saturation vs the number of input reads is shown. + +### Sequencing Saturation {.unnumbered} + + + +```{r sequencingSaturation, fig.height = figHeight} + +ggplot( + qcData, + aes(x = WellID, y = SequencingSaturation, fill = PoolName) +) + geom_bar(stat = "identity", position = "dodge") + + xlab("Samples") + ggtitle("Sequencing Saturation per Sample") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(1, "lines"), + text = element_text(size = 10), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.text.y = element_text(size = 15), + axis.ticks.x = element_blank() + ) +``` + +### Sequencing Saturation - Input Reads {.unnumbered} + + +```{r sequencingSaturation_inputReads, fig.height = figHeight} + + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Input Reads") + + theme(strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +### Sequencing Saturation - Mapped Reads {.unnumbered} + +```{r sequencingSaturation_mappedReads, fig.height = figHeight} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size=10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size=18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) +``` + +
+ +## Genomic Origin {.tabset} + +The 3 boxplots below represent, per pool, the distributions of the percentage of reads mapping to: + +* chromosomal regions + +* mitochondrial regions + +* ERCC spike-ins + +The 4th plot summarises the above results across samples per pool. + +The 5th plot shows the percentage of reads mapped to the transcriptome (as counted by STAR). This measurement serves as a proxy for the percentage of reads mapped to exons. + +> The percentage ERCC contributes to the QC metrics mentioned in Par 3. This value is ideally as low as possible (but non-zero to ensure the they have been spiked in) and comparable for the different pools. + + + + +### pctChrom {.tabset .unnumbered} + + +```{r genomicOrigin_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, aes(x = PoolName, y = pctChrom, colour = PoolName) +) + + geom_boxplot() + + ggtitle("pctChrom") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### pctMT {.tabset .unnumbered} + +```{r genomicOrigin_mt_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctMT, colour = PoolName) +) + + geom_boxplot() + ggtitle("pctMT") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### pctERCC {.tabset .unnumbered} + + +```{r genomicOrigin_ercc_plate, fig.height = figHeight} +ggplot(qcData, aes(x = PoolName, y = pctERCC, colour = PoolName)) + + geom_boxplot() + + ggtitle("pctERCC") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### Genomic Summary {.tabset .unnumbered} + + + +```{r genomicOrigin_summary_plate} +meanPctChromMTData <- qcData[, .( + "pctChrom" = median(pctChrom), + "pctMT" = median(pctMT), + "pctERCC" = median(pctERCC) +), by = PoolName] +meanPctChromMTDataLong <- melt( + meanPctChromMTData, + id.vars = "PoolName", + measure.vars = c("pctChrom", "pctMT", "pctERCC"), + variable.name = "Origin", value.name = "pct" +) +ggplot( + meanPctChromMTDataLong, + aes(fill = Origin, y = pct, x = PoolName)) + + geom_bar(position = "stack", stat = "identity") + + ggtitle("Genomic Origin") + + theme( + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + + +# Depletion {.tabset} + +
+```{r depletion} + + +for (eset_name in pools) { + cat("\n\n") + cat(paste0("## ", eset_name, " {.unnumbered}")) + cat("\n\n") + + eset <- esetList[[eset_name]] + average_reads <- sort(apply(exprs(eset), 1, mean), decreasing = TRUE) + plotData <- data.table( + ENSGID = names(average_reads), + av_count = average_reads + ) + + gen_descript <- data.table( + ENSGID = eset@featureData@data$gene_id, + Description = eset@featureData@data$GENENAME + ) + order_gen_descript <- gen_descript[ + match(plotData$ENSGID, gen_descript$ENSGID), + ] + + g <- ggplot( + plotData[c(1:100)], + aes(x = reorder(ENSGID, -av_count), y = av_count) + ) + geom_bar(stat = "identity") + + theme( + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 15), + legend.title = element_text(size = 15), + axis.title = element_text(size = 18), + plot.title = element_text(size = 20) + ) + ylab("Average Counts") + xlab("Genes") + + print(g) + + cat("\n\n") + cat("
") + cat("
") + + print(htmltools::tagList((DT::datatable(order_gen_descript[1:100, ])))) +} +``` +
+ + +
+
+
+
+ +# Glossary {.unnumbered} + + +## Read {.unlisted .unnumbered} + +A read is a oligonucleotide (a short RNA fragment) that has been sequenced. It consists of a fixed number of base pairs (bp) and therefore has a specific read length. + + + +## Input Read {.unlisted .unnumbered} + +Each read of the fastq file used as input to the STAR aligner is considered an input read. + + + +## Read With Valid Barcode {.unlisted .unnumbered} + +A read with a valid barcode is a read for which the barcode matches the white list of barcodes under the given restriction of the number of allowed mismatches. The number of reads with a valid barcode is lower or equal to the number of input reads. + + + +## Mapped Read {.unlisted .unnumbered} + +A read that has been aligned against the reference genome and for which one or more suitable matching locations have been found is a mapped read. Depending on the number of allowed mismatches this might or might not be be an exact match. The number of mapped reads is lower or equal to the number of reads with a valid barcode. + + + +## Uniquely Mapped Read {.unlisted .unnumbered} + +A read for which one and only one suitable matching location in the reference genome was found is an uniquely mapped read. The number of uniquely mapped reads is lower or equal to the number of mapped reads. + + + +## Counted Read {.unlisted .unnumbered} + +A mapped read will only be counted if it overlaps (1 nucleotide or more) with one and only one gene. The number of counted reads is lower or equal to the number of (uniquely) mapped reads. + + + +## UMIs {.unlisted .unnumbered} + +Unique molecular identifiers (UMI) are short sequences in order to uniquely tag each molecule in a sample library. Sequencing with UMIs allows bioinformatics software to filter out duplicate reads and PCR errors with a high level of accuracy and report unique reads. + +The reported UMIs is the number of UMIs among the set of reads that map to an unique gene, i.e the number of reads is deduplicated. + + + +## pctERCC {.unlisted .unnumbered} + +The percentage of reads mapping to the ERCC genes among the total number of **mapped** reads. + + + +## pctMT {.unlisted .unnumbered} + +The percentage of reads mapping to the MT genes among the total number of **mapped** reads. + + + +## Sequencing Saturation {.unlisted .unnumbered} + +The sequencing saturation is a measure of the fraction of library complexity. The inverse of one minus the sequencing saturation can be interpreted as the number of additional reads it would take to detect a new transcript. Consequently, a low sequencing saturation indicates a shallow sequencing in which a new transcript could be discovered with a few reads. + +
+
+
+
+ +
+![](OutputSTARsolo.png) +
+ +
+
diff --git a/target/executable/stats/combine_star_logs/.config.vsh.yaml b/target/executable/stats/combine_star_logs/.config.vsh.yaml new file mode 100644 index 00000000..b65090ac --- /dev/null +++ b/target/executable/stats/combine_star_logs/.config.vsh.yaml @@ -0,0 +1,257 @@ +name: "combine_star_logs" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--barcodes" + description: "Barcodes responding to the respective log files.\n" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_logs" + description: "Paths to the STAR log files (most frequently called Log.final.out)\n" + info: null + example: + - "Log.final.out" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_summary_logs" + description: "Paths to the Summary.csv files from the STAR Solo output. Can be\ + \ found in\nthe 'Solo.out/Gene' folder relative to the root of the STAR output\ + \ directory. \n" + info: null + example: + - "Summary.txt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reads_per_gene_logs" + description: "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Tab-delimited file describing for each barcode (as the rows), the\ + \ metrics (as columns)\ngathered from the different input files. \n" + info: null + default: + - "starLogs.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/combine_star_logs/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/stats/combine_star_logs" + executable: "target/executable/stats/combine_star_logs/combine_star_logs" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/stats/combine_star_logs/_viash.yaml b/target/executable/stats/combine_star_logs/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/stats/combine_star_logs/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/stats/combine_star_logs/combine_star_logs b/target/executable/stats/combine_star_logs/combine_star_logs new file mode 100755 index 00000000..c53dddff --- /dev/null +++ b/target/executable/stats/combine_star_logs/combine_star_logs @@ -0,0 +1,1512 @@ +#!/usr/bin/env bash + +# combine_star_logs updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="combine_star_logs" +VIASH_META_FUNCTIONALITY_NAME="combine_star_logs" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pandas" + +LABEL org.opencontainers.image.authors="Dries Schaumont" +LABEL org.opencontainers.image.description="Companion container for running component stats combine_star_logs" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "combine_star_logs updatecraftbox" + echo "" + echo "Arguments:" + echo " --barcodes" + echo " type: string, required parameter, multiple values allowed" + echo " Barcodes responding to the respective log files." + echo "" + echo " --star_logs" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: Log.final.out" + echo " Paths to the STAR log files (most frequently called Log.final.out)" + echo "" + echo " --gene_summary_logs" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " example: Summary.txt" + echo " Paths to the Summary.csv files from the STAR Solo output. Can be found" + echo " in" + echo " the 'Solo.out/Gene' folder relative to the root of the STAR output" + echo " directory." + echo "" + echo " --reads_per_gene_logs" + echo " type: file, required parameter, multiple values allowed, file must exist" + echo " Paths to the 'ReadsPerGene.out.tab' files as output by STAR." + echo "" + echo " --output" + echo " type: file, output, file must exist" + echo " default: starLogs.txt" + echo " Tab-delimited file describing for each barcode (as the rows), the" + echo " metrics (as columns)" + echo " gathered from the different input files." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "combine_star_logs updatecraftbox" + exit + ;; + --barcodes) + if [ -z "$VIASH_PAR_BARCODES" ]; then + VIASH_PAR_BARCODES="$2" + else + VIASH_PAR_BARCODES="$VIASH_PAR_BARCODES;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcodes. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcodes=*) + if [ -z "$VIASH_PAR_BARCODES" ]; then + VIASH_PAR_BARCODES=$(ViashRemoveFlags "$1") + else + VIASH_PAR_BARCODES="$VIASH_PAR_BARCODES;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --star_logs) + if [ -z "$VIASH_PAR_STAR_LOGS" ]; then + VIASH_PAR_STAR_LOGS="$2" + else + VIASH_PAR_STAR_LOGS="$VIASH_PAR_STAR_LOGS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --star_logs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --star_logs=*) + if [ -z "$VIASH_PAR_STAR_LOGS" ]; then + VIASH_PAR_STAR_LOGS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_STAR_LOGS="$VIASH_PAR_STAR_LOGS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --gene_summary_logs) + if [ -z "$VIASH_PAR_GENE_SUMMARY_LOGS" ]; then + VIASH_PAR_GENE_SUMMARY_LOGS="$2" + else + VIASH_PAR_GENE_SUMMARY_LOGS="$VIASH_PAR_GENE_SUMMARY_LOGS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --gene_summary_logs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --gene_summary_logs=*) + if [ -z "$VIASH_PAR_GENE_SUMMARY_LOGS" ]; then + VIASH_PAR_GENE_SUMMARY_LOGS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_GENE_SUMMARY_LOGS="$VIASH_PAR_GENE_SUMMARY_LOGS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --reads_per_gene_logs) + if [ -z "$VIASH_PAR_READS_PER_GENE_LOGS" ]; then + VIASH_PAR_READS_PER_GENE_LOGS="$2" + else + VIASH_PAR_READS_PER_GENE_LOGS="$VIASH_PAR_READS_PER_GENE_LOGS;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --reads_per_gene_logs. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --reads_per_gene_logs=*) + if [ -z "$VIASH_PAR_READS_PER_GENE_LOGS" ]; then + VIASH_PAR_READS_PER_GENE_LOGS=$(ViashRemoveFlags "$1") + else + VIASH_PAR_READS_PER_GENE_LOGS="$VIASH_PAR_READS_PER_GENE_LOGS;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/stats/combine_star_logs:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_BARCODES+x} ]; then + ViashError '--barcodes' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_STAR_LOGS+x} ]; then + ViashError '--star_logs' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_GENE_SUMMARY_LOGS+x} ]; then + ViashError '--gene_summary_logs' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_READS_PER_GENE_LOGS+x} ]; then + ViashError '--reads_per_gene_logs' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + VIASH_PAR_OUTPUT="starLogs.txt" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_STAR_LOGS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_STAR_LOGS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_GENE_SUMMARY_LOGS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_GENE_SUMMARY_LOGS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi +if [ ! -z "$VIASH_PAR_READS_PER_GENE_LOGS" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_READS_PER_GENE_LOGS; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_STAR_LOGS" ]; then + VIASH_TEST_STAR_LOGS=() + IFS=';' + for var in $VIASH_PAR_STAR_LOGS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_STAR_LOGS+=( "$var" ) + done + VIASH_PAR_STAR_LOGS=$(IFS=';' ; echo "${VIASH_TEST_STAR_LOGS[*]}") +fi +if [ ! -z "$VIASH_PAR_GENE_SUMMARY_LOGS" ]; then + VIASH_TEST_GENE_SUMMARY_LOGS=() + IFS=';' + for var in $VIASH_PAR_GENE_SUMMARY_LOGS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_GENE_SUMMARY_LOGS+=( "$var" ) + done + VIASH_PAR_GENE_SUMMARY_LOGS=$(IFS=';' ; echo "${VIASH_TEST_GENE_SUMMARY_LOGS[*]}") +fi +if [ ! -z "$VIASH_PAR_READS_PER_GENE_LOGS" ]; then + VIASH_TEST_READS_PER_GENE_LOGS=() + IFS=';' + for var in $VIASH_PAR_READS_PER_GENE_LOGS; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_READS_PER_GENE_LOGS+=( "$var" ) + done + VIASH_PAR_READS_PER_GENE_LOGS=$(IFS=';' ; echo "${VIASH_TEST_READS_PER_GENE_LOGS[*]}") +fi +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-combine_star_logs-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import logging +import pandas as pd +from itertools import batched, starmap + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'barcodes': $( if [ ! -z ${VIASH_PAR_BARCODES+x} ]; then echo "r'${VIASH_PAR_BARCODES//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'star_logs': $( if [ ! -z ${VIASH_PAR_STAR_LOGS+x} ]; then echo "r'${VIASH_PAR_STAR_LOGS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'gene_summary_logs': $( if [ ! -z ${VIASH_PAR_GENE_SUMMARY_LOGS+x} ]; then echo "r'${VIASH_PAR_GENE_SUMMARY_LOGS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'reads_per_gene_logs': $( if [ ! -z ${VIASH_PAR_READS_PER_GENE_LOGS+x} ]; then echo "r'${VIASH_PAR_READS_PER_GENE_LOGS//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def handle_percentages(column_value): + # TODO: handle this more gracefully + if column_value: + return column_value.strip('%') + return column_value + +def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame: + logger.info("Reading STAR log %s for barcode '%s'", log_path, barcode) + result = pd.read_table(log_path, sep=r"\\|\\t+", converters={"Value": handle_percentages}, + engine="python", header=None, skip_blank_lines=True, + skipinitialspace=True, names=["Category", "Value"], index_col=0, + skiprows=[0, 1, 2]) + logger.info("Read %d row(s) and %d column(s) from STAR logs at %s", + *result.shape, log_path) + return result + + +def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame: + logger.info("Reading summary log %s for barcode %s", summary_path, barcode) + result = pd.read_table(summary_path, sep=",", + header=None, names=["Category", "Value"], + index_col=0, dtype=pd.StringDtype()) + logger.info("Read %d row(s) and %d column(s) from summary file at %s", + *result.shape, summary_path) + return result + + +def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame: + logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode) + result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\\t", + dtype={"geneID": pd.StringDtype(), + "Unstranded": pd.Int64Dtype(), + "posStrand": pd.Int64Dtype(), + "negStrand": pd.Int64Dtype()}, + index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"]) + result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series + df = pd.DataFrame({"Value": result.sum()}) + df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise") + df.index.name = "Category" + logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s", + *df.shape, read_per_gene_path) + return df + +def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + """ + For a single star log (Log.final.out) in dataframe format, filter out the + entries that are not needed and format the labels for some metrics: + - Replace '%' with 'pect' in the labels. + - Remove labels ending with ':' + (mostly the section separators like 'MULTI-MAPPING READS:' and 'UNMAPPED READS:') + - Remove the metrics we do no need based on the following keywords: + Mapping speed, Average, Number of splices, per base, chimeric reads, average + + The dataframe provided as input must have an index with 1 level with the metric names. + """ + # Remove index values ending with ':' (rows like 'MULTI-MAPPING READS:','UNIQUE READS:') + logger.info("Filtering STAR logs for barcode %s. Starting with %d row(s) and %d column(s)", barcode, *df.shape) + to_keep = ~df.index.to_series().str.endswith(":") + # Remove index values where the values contain any of these substrings + regex_columns_to_remove = "Mapping speed|Average|Number of splices|per base|chimeric reads|average" + to_keep = to_keep & ~df.index.to_series().str.contains(regex_columns_to_remove, regex=True) + logger.info("Removed the following log entries for barcode '%s':\\n\\t%s", + barcode, + "\\n\\t".join(to_keep[~to_keep].index.to_list())) + result = df.loc[to_keep] + + # Replace % by pect, remove columns, use camel case and remove spaces + # You might be tempted to use .title() to make everything uppercase, + # but characters which are already uppercase should stay that way. + # (example: NumberOfUMIs and not NumberOfUmis) + result.index = result.index.str.replace("%", "pect")\\ + .str.replace(":", "")\\ + .str.replace(r"(?:^|\\s).", lambda m:m.group(0).upper(), regex=True)\\ + .str.replace(" ", "") + result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads", + "UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise") + logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\\n\\t%s", + barcode, *result.shape, "\\n\\t".join(result.index.to_list())) + return result + + +def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Filtering and formatting summary logs for barcode %s. " + "Starting with %d row(s) and %d column(s)", barcode, *df.shape) + columns_to_remove = ( + "Number of Reads", + "Q30 Bases in RNA read", + "Reads Mapped to Genome: Unique", + "Reads Mapped to Transcriptome: Unique Genes", + "Reads in Cells Mapped to Unique Genes", + "Median UMI per Cell", + "Median Genes per Cell", + "Reads Mapped to Genome: Unique+Multiple", + "Median Reads per Cell", + "Mean UMI per Cell", + "Mean Genes per Cell", + ) + + to_keep = ~df.index.isin(columns_to_remove) + logger.info("Removed the following summary entries for barcode '%s':\\n\\t%s", + barcode, + "\\n\\t".join(df.loc[~to_keep].index.to_list())) + result = df.loc[to_keep] + result.index = result.index.str.replace(r"(?:^|\\s).", lambda m:m.group(0).upper(), + regex=True).str.replace(" ", "") + to_rename = {"UMIsInCells": "NumberOfUMIs", + "TotalGenesDetected": "NumberOfGenes"} + try: + result = result.rename(to_rename, errors="raise") + except KeyError as e: + raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary " + f"log for barcode {barcode}, but an entry was not found in the file. " + "Make sure that you are using the correct version of STAR." + f"Available entries: {", ".join(result.index.to_list())}") from e + logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\\n\\t%s", + barcode, *result.shape, "\\n\\t".join(result.index.to_list())) + return result + + +def join_dfs(df_list, barcodes) -> pd.DataFrame: + # Combine the dataframes together and add the barcodes as a level to the dataframe + # in order to make a 2-level index (first level the barcodes and second level the metrics). + result = pd.concat(dict(zip(barcodes, df_list)), names=["WellBC"]) + # Pivot the table by moving the metrics to the columns. Its added as an extra level, + # so we can just frop the 'Values' level that was already there + result = result.unstack(level="Category").droplevel(0, axis="columns") + return result + +def main(par): + logger.info("Component started.") + # Provide an overview of the parameters in the logs + parameters_str = [f'\\t{param}: {param_val}\\n' for param, param_val in par.items()] + logger.info("Parameters:\\n%s", "".join(parameters_str).rstrip()) + star_logs, gene_summary_logs, reads_per_gene_logs, barcodes = par["star_logs"], \\ + par["gene_summary_logs"], par["reads_per_gene_logs"], par["barcodes"] + number_of_inputs = tuple(len(i) for i in (star_logs, gene_summary_logs, + reads_per_gene_logs, barcodes)) + if len(set(number_of_inputs)) != 1: + raise ValueError("Expected the same number of inputs for 'star_logs' (%d), " + "'gene_summary_logs' (%d), 'reads_per_gene_logs' (%d) " + "and 'barcodes' (%d)." % number_of_inputs) + + logs_to_process = [ + (star_log_to_dataframe, star_log_remove_unwanted_entries_and_adjust_format, star_logs), + (summary_to_dataframe, summary_remove_unwanted_entries_and_adjust_format, gene_summary_logs), + (reads_per_gene_to_dataframe, None, reads_per_gene_logs), + ] + logger.info("Formatting the contents of the log files.") + all_logs_data = [] + for df_generator, formatter, data in logs_to_process: + data_as_df = list(starmap(df_generator, zip(barcodes, data))) + data_formatted = data_as_df + if formatter: + data_formatted = list(starmap(formatter, zip(barcodes, data_as_df))) + data_joined = join_dfs(data_formatted, barcodes) + all_logs_data.append(data_joined) + + logger.info("Joining entries across the different logs together.") + all_stats = pd.concat(all_logs_data, axis=1) + logger.info("Log statistics were gathered for the following barcodes: %s", + ", ".join(all_stats.index.to_list())) + dtypes = { + 'NumberOfInputReads': pd.UInt64Dtype(), + 'NumberOfMappedReads': pd.UInt64Dtype(), + 'PctMappedReads': pd.Float64Dtype(), + 'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToMultipleLoci': pd.Float64Dtype(), + 'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToTooManyLoci': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooManyMismatches': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooShort': pd.Float64Dtype(), + 'NumberOfReadsUnmappedOther': pd.UInt64Dtype(), + 'PectOfReadsUnmappedOther': pd.Float64Dtype(), + 'ReadsWithValidBarcodes': pd.Float64Dtype(), + 'SequencingSaturation': pd.Float64Dtype(), + 'Q30BasesInCB+UMI': pd.Float64Dtype(), + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(), + 'EstimatedNumberOfCells': pd.UInt64Dtype(), + 'FractionOfReadsInCells': pd.Float64Dtype(), + 'MeanReadsPerCell': pd.UInt64Dtype(), + 'NumberOfUMIs': pd.UInt64Dtype(), + 'NumberOfGenes': pd.UInt64Dtype(), + 'NumberOfCountedReads': pd.UInt64Dtype(), + } + all_stats = all_stats.astype(dtypes) + # batched() is used here to print a limited amount of columnns at a time + # to make sure that they are all displayed (pandas might limit the view for readability) + logger.info("Summary of final output:\\n%s\\n", + "\\n".join(repr(all_stats.loc[:,columns].describe()) + for columns in batched(all_stats.columns, 3))) + logger.info("Writing output to %s", par["output"]) + all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\t", header=True, + index=False, float_format='%g') + logger.info("Finished %s.", meta["name"]) + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_STAR_LOGS" ]; then + unset VIASH_TEST_STAR_LOGS + IFS=';' + for var in $VIASH_PAR_STAR_LOGS; do + unset IFS + if [ -z "$VIASH_TEST_STAR_LOGS" ]; then + VIASH_TEST_STAR_LOGS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_STAR_LOGS="$VIASH_TEST_STAR_LOGS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_STAR_LOGS="$VIASH_TEST_STAR_LOGS" + fi + if [ ! -z "$VIASH_PAR_GENE_SUMMARY_LOGS" ]; then + unset VIASH_TEST_GENE_SUMMARY_LOGS + IFS=';' + for var in $VIASH_PAR_GENE_SUMMARY_LOGS; do + unset IFS + if [ -z "$VIASH_TEST_GENE_SUMMARY_LOGS" ]; then + VIASH_TEST_GENE_SUMMARY_LOGS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_GENE_SUMMARY_LOGS="$VIASH_TEST_GENE_SUMMARY_LOGS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_GENE_SUMMARY_LOGS="$VIASH_TEST_GENE_SUMMARY_LOGS" + fi + if [ ! -z "$VIASH_PAR_READS_PER_GENE_LOGS" ]; then + unset VIASH_TEST_READS_PER_GENE_LOGS + IFS=';' + for var in $VIASH_PAR_READS_PER_GENE_LOGS; do + unset IFS + if [ -z "$VIASH_TEST_READS_PER_GENE_LOGS" ]; then + VIASH_TEST_READS_PER_GENE_LOGS="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_READS_PER_GENE_LOGS="$VIASH_TEST_READS_PER_GENE_LOGS;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_READS_PER_GENE_LOGS="$VIASH_TEST_READS_PER_GENE_LOGS" + fi + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/stats/combine_star_logs/nextflow_labels.config b/target/executable/stats/combine_star_logs/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/stats/combine_star_logs/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/stats/generate_pool_statistics/.config.vsh.yaml b/target/executable/stats/generate_pool_statistics/.config.vsh.yaml new file mode 100644 index 00000000..64465e02 --- /dev/null +++ b/target/executable/stats/generate_pool_statistics/.config.vsh.yaml @@ -0,0 +1,241 @@ +name: "generate_pool_statistics" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--nrReadsNrGenesPerChrom" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads\n\ + column) and the number of genes on that chromosome that had at least one\nread\ + \ mapped to it (NumberOfGenes).\n" + info: null + default: + - "processedBamFile_well1.tsv" + - "processedBamfile_well2.tsv" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChromPool" + description: "Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom\ + \ files. Describes\nper chromosome (as columns) the number of reads, as well\ + \ as the total number \nof reads per cell barcode and the percentage of nuclear,\ + \ ERCC and mitochondrial\nreads.\n" + info: null + example: + - "nrReadsNrGenesPerChrom.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/generate_pool_statistics/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/stats/generate_pool_statistics" + executable: "target/executable/stats/generate_pool_statistics/generate_pool_statistics" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/stats/generate_pool_statistics/_viash.yaml b/target/executable/stats/generate_pool_statistics/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/stats/generate_pool_statistics/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/stats/generate_pool_statistics/generate_pool_statistics b/target/executable/stats/generate_pool_statistics/generate_pool_statistics new file mode 100755 index 00000000..1fa7ff73 --- /dev/null +++ b/target/executable/stats/generate_pool_statistics/generate_pool_statistics @@ -0,0 +1,1237 @@ +#!/usr/bin/env bash + +# generate_pool_statistics updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) +# * Marijke Van Moerbeke (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="generate_pool_statistics" +VIASH_META_FUNCTIONALITY_NAME="generate_pool_statistics" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pandas" + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component stats generate_pool_statistics" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "generate_pool_statistics updatecraftbox" + echo "" + echo "Arguments:" + echo " --nrReadsNrGenesPerChrom" + echo " type: file, multiple values allowed, file must exist" + echo " default: processedBamFile_well1.tsv;processedBamfile_well2.tsv" + echo " Path to an output file that contains a .tsv formatted table describing" + echo " per chromosome the number of reads that were mapped to that chromosome" + echo " (NumberOfReads" + echo " column) and the number of genes on that chromosome that had at least one" + echo " read mapped to it (NumberOfGenes)." + echo "" + echo " --nrReadsNrGenesPerChromPool" + echo " type: file, output, file must exist" + echo " example: nrReadsNrGenesPerChrom.txt" + echo " Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom" + echo " files. Describes" + echo " per chromosome (as columns) the number of reads, as well as the total" + echo " number" + echo " of reads per cell barcode and the percentage of nuclear, ERCC and" + echo " mitochondrial" + echo " reads." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "generate_pool_statistics updatecraftbox" + exit + ;; + --nrReadsNrGenesPerChrom) + if [ -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM="$2" + else + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_PAR_NRREADSNRGENESPERCHROM;""$2" + fi + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChrom. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChrom=*) + if [ -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM=$(ViashRemoveFlags "$1") + else + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_PAR_NRREADSNRGENESPERCHROM;"$(ViashRemoveFlags "$1") + fi + shift 1 + ;; + --nrReadsNrGenesPerChromPool) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChromPool\': \'$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChromPool. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChromPool=*) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChromPool=*\': \'$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/stats/generate_pool_statistics:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM="processedBamFile_well1.tsv;processedBamfile_well2.tsv" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + IFS=';' + set -f + for file in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + if [ ! -e "$file" ]; then + ViashError "Input file '$file' does not exist." + exit 1 + fi + done + set +f +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && [ ! -d "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_TEST_NRREADSNRGENESPERCHROM=() + IFS=';' + for var in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$var")" ) + var=$(ViashDockerAutodetectMount "$var") + VIASH_TEST_NRREADSNRGENESPERCHROM+=( "$var" ) + done + VIASH_PAR_NRREADSNRGENESPERCHROM=$(IFS=';' ; echo "${VIASH_TEST_NRREADSNRGENESPERCHROM[*]}") +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL")" ) + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashDockerAutodetectMount "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-generate_pool_statistics-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import pandas as pd +from pathlib import Path +import re + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'nrReadsNrGenesPerChrom': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROM//\'/\'\"\'\"r\'}'.split(';')"; else echo None; fi ), + 'nrReadsNrGenesPerChromPool': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +INDEX_COL = ["WellBC", "WellID"] + +if __name__ == "__main__": + ######### + # nrReadsNrGenesPerChrom file + ######### + nr_reads_nr_genes_wells = [] + par["nrReadsNrGenesPerChrom"] = list(map(Path, par["nrReadsNrGenesPerChrom"])) + for nr_reads_nr_genes_file in par["nrReadsNrGenesPerChrom"]: + nr_reads_nr_gene_well = pd.read_csv(nr_reads_nr_genes_file, + header=0, delimiter="\\t", + dtype={"WellBC": pd.StringDtype(), + "WellID": pd.StringDtype(), + "Chr": pd.StringDtype(), + "NumberOfReads": pd.UInt64Dtype(), + "NumberOfGenes": pd.UInt64Dtype()}) + if nr_reads_nr_gene_well.empty: + raise ValueError(f"{nr_reads_nr_genes_file.name} does not seem to contain any information!") + nr_reads_nr_genes_wells.append(nr_reads_nr_gene_well) + nr_reads_nr_genes_pool = pd.concat(nr_reads_nr_genes_wells, ignore_index=True,) + total_nr_reads_per_chromosome = nr_reads_nr_genes_pool.pivot_table(index=INDEX_COL, columns="Chr", + values=["NumberOfReads"], fill_value=0, + aggfunc="sum").droplevel(0, axis=1) + total_nr_reads_per_chromosome.columns.name = None + # Remove scaffolds/chromosomes with no counts + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.loc[:, (total_nr_reads_per_chromosome != 0).any(axis=0)] + ##### Total number of genes from all chromosomes + total_nr_genes = nr_reads_nr_genes_pool.loc[:, INDEX_COL + ['NumberOfGenes']].groupby(["WellBC", "WellID"]).sum() + + ##### Total counts across (irrespective of chromosome) + total_sum_of_reads = total_nr_reads_per_chromosome.sum(numeric_only=True, axis=1) + + ##### Logic to split up chromosome per type + chromosome_names = total_nr_reads_per_chromosome.columns.to_list() + chr_regex = re.compile(r"^(chr)?\\d+") + matching_chromosomes = [chr_name for chr_name + in chromosome_names + if chr_regex.match(chr_name)] + sex_chromosome_names = ["X", "Y"] + mitochondrial_chr_name = "MT" + # This is logic from the original HT pipeline, + # only when all of the matched chromosomes start with "chr", the mitochonrial, X and Y + # chromosomes should also start with 'chr' + if all(chr_name.startswith("chr") for chr_name in matching_chromosomes): + sex_chromosome_names += ["chrX", "chrY"] + mitochondrial_chr_name = "chrM" + + ###### Counts for mitochondrial reads + try: + mitochondrial_reads = total_nr_reads_per_chromosome.loc[:,mitochondrial_chr_name] + except KeyError: + mitochondrial_reads = 0 + percentage_mitochondrial_reads = round(mitochondrial_reads / total_sum_of_reads * 100, 2) + + ###### Counts for ERCC reads + total_ercc_reads = total_nr_reads_per_chromosome.filter(regex=r"^ERCC").sum(axis=1) + percentage_ercc_reads = round(total_ercc_reads / total_sum_of_reads * 100, 2) + + ###### Counts for nuclear chromosomes + total_chromosomal_reads = total_nr_reads_per_chromosome.loc[:,matching_chromosomes].sum(axis=1) + percentage_chromosomal_reads = round(total_chromosomal_reads / total_sum_of_reads * 100, 2) + + cols_to_add = { + "pctChrom": percentage_chromosomal_reads, + "pctMT": percentage_mitochondrial_reads, + "pctERCC": percentage_ercc_reads, + "SumReads": total_sum_of_reads, + "NumberOfGenes": total_nr_genes, + "NumberOfERCCReads": total_ercc_reads, + "NumberOfChromReads": total_chromosomal_reads, + "NumberOfMTReads": mitochondrial_reads, + } + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.assign( + **cols_to_add + ) + + total_nr_reads_per_chromosome.reset_index(names=INDEX_COL)\\ + .to_csv(par["nrReadsNrGenesPerChromPool"], sep="\\t", + header=True, index=False, float_format="%g", + columns=tuple(INDEX_COL) + tuple(chromosome_names) + tuple(cols_to_add.keys()) + ) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + unset VIASH_TEST_NRREADSNRGENESPERCHROM + IFS=';' + for var in $VIASH_PAR_NRREADSNRGENESPERCHROM; do + unset IFS + if [ -z "$VIASH_TEST_NRREADSNRGENESPERCHROM" ]; then + VIASH_TEST_NRREADSNRGENESPERCHROM="$(ViashDockerStripAutomount "$var")" + else + VIASH_TEST_NRREADSNRGENESPERCHROM="$VIASH_TEST_NRREADSNRGENESPERCHROM;""$(ViashDockerStripAutomount "$var")" + fi + done + VIASH_PAR_NRREADSNRGENESPERCHROM="$VIASH_TEST_NRREADSNRGENESPERCHROM" + fi + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROMPOOL=$(ViashDockerStripAutomount "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ] && [ ! -e "$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL" ]; then + ViashError "Output file '$VIASH_PAR_NRREADSNRGENESPERCHROMPOOL' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/stats/generate_pool_statistics/nextflow_labels.config b/target/executable/stats/generate_pool_statistics/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/stats/generate_pool_statistics/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/stats/generate_well_statistics/.config.vsh.yaml b/target/executable/stats/generate_well_statistics/.config.vsh.yaml new file mode 100644 index 00000000..9a90c0fa --- /dev/null +++ b/target/executable/stats/generate_well_statistics/.config.vsh.yaml @@ -0,0 +1,313 @@ +name: "generate_well_statistics" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "The .bam file as returned by the mapping tool STAR." + info: null + example: + - "input.bam" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--barcode" + description: "The barcode for the well that is being processed. Is only used to\ + \ add a metadata\ncolumn to all output files.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--well_id" + description: "ID of this well. Only used to add a metadata column to the output\ + \ files.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--processedBAMFile" + description: "Path to a .tsv file listing, per read in the BAM file,\nthe value\ + \ for the \"CB\", \"UX\", \"GX\" and \"GN\" tag, together with the\nchromsome\ + \ to which the read was mapped to.\n" + info: null + default: + - "processedBamFile.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads\n\ + column) and the number of genes on that chromosome that had at least one\nread\ + \ mapped to it (NumberOfGenes).\n" + info: null + default: + - "nrReadsNrGenesPerChrom.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrUMIsPerCB" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads).\n" + info: null + default: + - "nrReadsNrUMIsPerCB.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--umiFreqTop" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per UMI (column UB) the frequency at which they occur in the reads (column\n\ + N). Only the top 100 UMIs are included.\n" + info: null + default: + - "umiFreqTop100.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--threads" + description: "Number of threads to use for decompressing BAM files.\n" + info: null + default: + - 1 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Generate summary statistics from BAM files generated by STAR solo." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test.sam" +- type: "file" + path: "empty.sam" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.13-trixie" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pysam" + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/generate_well_statistics/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/stats/generate_well_statistics" + executable: "target/executable/stats/generate_well_statistics/generate_well_statistics" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/stats/generate_well_statistics/_viash.yaml b/target/executable/stats/generate_well_statistics/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/stats/generate_well_statistics/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/stats/generate_well_statistics/generate_well_statistics b/target/executable/stats/generate_well_statistics/generate_well_statistics new file mode 100755 index 00000000..8a9fe773 --- /dev/null +++ b/target/executable/stats/generate_well_statistics/generate_well_statistics @@ -0,0 +1,1372 @@ +#!/usr/bin/env bash + +# generate_well_statistics updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. +# +# Component authors: +# * Dries Schaumont (author, maintainer) +# * Marijke Van Moerbeke (contributor) + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="generate_well_statistics" +VIASH_META_FUNCTIONALITY_NAME="generate_well_statistics" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.13-trixie +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pysam" "pandas" + +LABEL org.opencontainers.image.authors="Dries Schaumont, Marijke Van Moerbeke" +LABEL org.opencontainers.image.description="Companion container for running component stats generate_well_statistics" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "generate_well_statistics updatecraftbox" + echo "" + echo "Generate summary statistics from BAM files generated by STAR solo." + echo "" + echo "Arguments:" + echo " --input" + echo " type: file, file must exist" + echo " example: input.bam" + echo " The .bam file as returned by the mapping tool STAR." + echo "" + echo " --barcode" + echo " type: string, required parameter" + echo " The barcode for the well that is being processed. Is only used to add a" + echo " metadata" + echo " column to all output files." + echo "" + echo " --well_id" + echo " type: string, required parameter" + echo " ID of this well. Only used to add a metadata column to the output files." + echo "" + echo " --processedBAMFile" + echo " type: file, output, file must exist" + echo " default: processedBamFile.txt" + echo " Path to a .tsv file listing, per read in the BAM file," + echo " the value for the \"CB\", \"UX\", \"GX\" and \"GN\" tag, together with the" + echo " chromsome to which the read was mapped to." + echo "" + echo " --nrReadsNrGenesPerChrom" + echo " type: file, output, file must exist" + echo " default: nrReadsNrGenesPerChrom.txt" + echo " Path to an output file that contains a .tsv formatted table describing" + echo " per chromosome the number of reads that were mapped to that chromosome" + echo " (NumberOfReads" + echo " column) and the number of genes on that chromosome that had at least one" + echo " read mapped to it (NumberOfGenes)." + echo "" + echo " --nrReadsNrUMIsPerCB" + echo " type: file, output, file must exist" + echo " default: nrReadsNrUMIsPerCB.txt" + echo " Path to an output file that contains a .tsv formatted table describing" + echo " per barcode the number of UMI's (nrUMIs) and the total number of reads" + echo " (NumberOfReads)." + echo "" + echo " --umiFreqTop" + echo " type: file, output, file must exist" + echo " default: umiFreqTop100.txt" + echo " Path to an output file that contains a .tsv formatted table describing" + echo " per UMI (column UB) the frequency at which they occur in the reads" + echo " (column" + echo " N). Only the top 100 UMIs are included." + echo "" + echo " --threads" + echo " type: integer" + echo " default: 1" + echo " min: 1" + echo " Number of threads to use for decompressing BAM files." + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "generate_well_statistics updatecraftbox" + exit + ;; + --input) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --input. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --input=*) + [ -n "$VIASH_PAR_INPUT" ] && ViashError Bad arguments for option \'--input=*\': \'$VIASH_PAR_INPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_INPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + --barcode) + [ -n "$VIASH_PAR_BARCODE" ] && ViashError Bad arguments for option \'--barcode\': \'$VIASH_PAR_BARCODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --barcode. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --barcode=*) + [ -n "$VIASH_PAR_BARCODE" ] && ViashError Bad arguments for option \'--barcode=*\': \'$VIASH_PAR_BARCODE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_BARCODE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --well_id) + [ -n "$VIASH_PAR_WELL_ID" ] && ViashError Bad arguments for option \'--well_id\': \'$VIASH_PAR_WELL_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WELL_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --well_id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --well_id=*) + [ -n "$VIASH_PAR_WELL_ID" ] && ViashError Bad arguments for option \'--well_id=*\': \'$VIASH_PAR_WELL_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_WELL_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --processedBAMFile) + [ -n "$VIASH_PAR_PROCESSEDBAMFILE" ] && ViashError Bad arguments for option \'--processedBAMFile\': \'$VIASH_PAR_PROCESSEDBAMFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROCESSEDBAMFILE="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --processedBAMFile. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --processedBAMFile=*) + [ -n "$VIASH_PAR_PROCESSEDBAMFILE" ] && ViashError Bad arguments for option \'--processedBAMFile=*\': \'$VIASH_PAR_PROCESSEDBAMFILE\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PROCESSEDBAMFILE=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nrReadsNrGenesPerChrom) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROM" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChrom\': \'$VIASH_PAR_NRREADSNRGENESPERCHROM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROM="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrGenesPerChrom. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrGenesPerChrom=*) + [ -n "$VIASH_PAR_NRREADSNRGENESPERCHROM" ] && ViashError Bad arguments for option \'--nrReadsNrGenesPerChrom=*\': \'$VIASH_PAR_NRREADSNRGENESPERCHROM\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRGENESPERCHROM=$(ViashRemoveFlags "$1") + shift 1 + ;; + --nrReadsNrUMIsPerCB) + [ -n "$VIASH_PAR_NRREADSNRUMISPERCB" ] && ViashError Bad arguments for option \'--nrReadsNrUMIsPerCB\': \'$VIASH_PAR_NRREADSNRUMISPERCB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRUMISPERCB="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --nrReadsNrUMIsPerCB. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --nrReadsNrUMIsPerCB=*) + [ -n "$VIASH_PAR_NRREADSNRUMISPERCB" ] && ViashError Bad arguments for option \'--nrReadsNrUMIsPerCB=*\': \'$VIASH_PAR_NRREADSNRUMISPERCB\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_NRREADSNRUMISPERCB=$(ViashRemoveFlags "$1") + shift 1 + ;; + --umiFreqTop) + [ -n "$VIASH_PAR_UMIFREQTOP" ] && ViashError Bad arguments for option \'--umiFreqTop\': \'$VIASH_PAR_UMIFREQTOP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMIFREQTOP="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --umiFreqTop. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --umiFreqTop=*) + [ -n "$VIASH_PAR_UMIFREQTOP" ] && ViashError Bad arguments for option \'--umiFreqTop=*\': \'$VIASH_PAR_UMIFREQTOP\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_UMIFREQTOP=$(ViashRemoveFlags "$1") + shift 1 + ;; + --threads) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --threads. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --threads=*) + [ -n "$VIASH_PAR_THREADS" ] && ViashError Bad arguments for option \'--threads=*\': \'$VIASH_PAR_THREADS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_THREADS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/stats/generate_well_statistics:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_BARCODE+x} ]; then + ViashError '--barcode' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_WELL_ID+x} ]; then + ViashError '--well_id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# filling in defaults +if [ -z ${VIASH_PAR_PROCESSEDBAMFILE+x} ]; then + VIASH_PAR_PROCESSEDBAMFILE="processedBamFile.txt" +fi +if [ -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM="nrReadsNrGenesPerChrom.txt" +fi +if [ -z ${VIASH_PAR_NRREADSNRUMISPERCB+x} ]; then + VIASH_PAR_NRREADSNRUMISPERCB="nrReadsNrUMIsPerCB.txt" +fi +if [ -z ${VIASH_PAR_UMIFREQTOP+x} ]; then + VIASH_PAR_UMIFREQTOP="umiFreqTop100.txt" +fi +if [ -z ${VIASH_PAR_THREADS+x} ]; then + VIASH_PAR_THREADS="1" +fi + +# check whether required files exist +if [ ! -z "$VIASH_PAR_INPUT" ] && [ ! -e "$VIASH_PAR_INPUT" ]; then + ViashError "Input file '$VIASH_PAR_INPUT' does not exist." + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_PAR_THREADS" ]]; then + if ! [[ "$VIASH_PAR_THREADS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError '--threads' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi + if [[ $VIASH_PAR_THREADS -lt 1 ]]; then + ViashError '--threads' has be more than or equal to 1. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_PROCESSEDBAMFILE" ] && [ ! -d "$(dirname "$VIASH_PAR_PROCESSEDBAMFILE")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_PROCESSEDBAMFILE")" +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ] && [ ! -d "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROM")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_NRREADSNRGENESPERCHROM")" +fi +if [ ! -z "$VIASH_PAR_NRREADSNRUMISPERCB" ] && [ ! -d "$(dirname "$VIASH_PAR_NRREADSNRUMISPERCB")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_NRREADSNRUMISPERCB")" +fi +if [ ! -z "$VIASH_PAR_UMIFREQTOP" ] && [ ! -d "$(dirname "$VIASH_PAR_UMIFREQTOP")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_UMIFREQTOP")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_INPUT")" ) + VIASH_PAR_INPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_INPUT") +fi +if [ ! -z "$VIASH_PAR_PROCESSEDBAMFILE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_PROCESSEDBAMFILE")" ) + VIASH_PAR_PROCESSEDBAMFILE=$(ViashDockerAutodetectMount "$VIASH_PAR_PROCESSEDBAMFILE") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_PROCESSEDBAMFILE" ) +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_NRREADSNRGENESPERCHROM")" ) + VIASH_PAR_NRREADSNRGENESPERCHROM=$(ViashDockerAutodetectMount "$VIASH_PAR_NRREADSNRGENESPERCHROM") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_NRREADSNRGENESPERCHROM" ) +fi +if [ ! -z "$VIASH_PAR_NRREADSNRUMISPERCB" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_NRREADSNRUMISPERCB")" ) + VIASH_PAR_NRREADSNRUMISPERCB=$(ViashDockerAutodetectMount "$VIASH_PAR_NRREADSNRUMISPERCB") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_NRREADSNRUMISPERCB" ) +fi +if [ ! -z "$VIASH_PAR_UMIFREQTOP" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_UMIFREQTOP")" ) + VIASH_PAR_UMIFREQTOP=$(ViashDockerAutodetectMount "$VIASH_PAR_UMIFREQTOP") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_UMIFREQTOP" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-generate_well_statistics-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import pysam +import pandas as pd +import logging + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'barcode': $( if [ ! -z ${VIASH_PAR_BARCODE+x} ]; then echo "r'${VIASH_PAR_BARCODE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'well_id': $( if [ ! -z ${VIASH_PAR_WELL_ID+x} ]; then echo "r'${VIASH_PAR_WELL_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'processedBAMFile': $( if [ ! -z ${VIASH_PAR_PROCESSEDBAMFILE+x} ]; then echo "r'${VIASH_PAR_PROCESSEDBAMFILE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'nrReadsNrGenesPerChrom': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROM//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'nrReadsNrUMIsPerCB': $( if [ ! -z ${VIASH_PAR_NRREADSNRUMISPERCB+x} ]; then echo "r'${VIASH_PAR_NRREADSNRUMISPERCB//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'umiFreqTop': $( if [ ! -z ${VIASH_PAR_UMIFREQTOP+x} ]; then echo "r'${VIASH_PAR_UMIFREQTOP//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'threads': $( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "int(r'${VIASH_PAR_THREADS//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +if __name__ == "__main__": + logger.info("Component started.") + parameters_str = [f'\\t{param}: {param_val}\\n' for param, param_val in par.items()] + logger.info("Parameters:\\n%s", "".join(parameters_str).rstrip()) + logger.info("Opening '%s'", par["input"]) + samfile = pysam.AlignmentFile(par["input"], "rb", threads=par["threads"]) + all_tags = [] + index = [] + tags_selection = ("CB", "UB", "GX", "GN") + for aligned_segment in samfile: + tags = dict(aligned_segment.get_tags()) + all_tags.append(tags) + reference_name = aligned_segment.reference_name + index.append("*" if not reference_name else reference_name) + if not index: + # Workaround for https://github.com/pandas-dev/pandas/issues/58594 + tag_dataframe = pd.DataFrame([], index=[], columns=tags_selection) + else: + tag_dataframe = pd.DataFrame.from_records(all_tags, index=index, + columns=tags_selection) + tag_dataframe_to_write = tag_dataframe.copy() + logger.info("Done reading BAM file. Found %i entries", tag_dataframe.shape[0]) + tag_dataframe.assign(WellBC=par["barcode"], WellID=par["well_id"])\\ + .reset_index(names="Chr")\\ + .to_csv(par["processedBAMFile"], sep="\\t", na_rep="", + header=True, index=False, + columns=("WellBC", "WellID", "Chr") + tags_selection) + logger.info("Constructing of dataframe done.") + # Number of genes that had a read mapped to them per chromosome, + # and the number of reads mapped to those genes per chromosome. + nr_reads_nr_genes = tag_dataframe.dropna(subset=["GX"]).groupby(level=0).agg( + NumberOfReads=pd.NamedAgg("GX", aggfunc="size"), + NumberOfGenes=pd.NamedAgg(column="GX", aggfunc="nunique") + ) + nr_reads_nr_genes = nr_reads_nr_genes.reindex(samfile.header.references, fill_value=0) + logger.info("Done calculating number of reads per gene and per chromesome. Writing to %s", + par['nrReadsNrGenesPerChrom']) + nr_reads_nr_genes.reset_index(names="Chr").assign(WellBC=par["barcode"], WellID=par["well_id"])\\ + .to_csv(par["nrReadsNrGenesPerChrom"], sep="\\t", + header=True, index=False, + columns=("WellBC", "WellID", "Chr", "NumberOfReads", "NumberOfGenes")) + + # Number of reads mapped to the reference, grouped by UMI + nr_read_per_umi = tag_dataframe.groupby('UB').size()\\ + .drop("", errors="ignore").sort_values(ascending=False).head(100) + nr_read_per_umi_df = nr_read_per_umi.to_frame(name="N") + logger.info("Done calculating number of mapped reads per UMI, writing to %s", par["umiFreqTop"]) + nr_read_per_umi_df.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="UB")\\ + .to_csv(par["umiFreqTop"], header=True, sep="\\t", + index=False, columns=("WellBC", "WellID", "UB", "N")) + + # Total number of mapped reads and total number of UMIs (not grouped per chromosome) + nr_reads_and_umi_per_barcode = tag_dataframe.groupby(by="CB").agg( + NumberOfReads=pd.NamedAgg("CB", "size"), + nrUMIs=pd.NamedAgg("UB", "nunique") + ) + logger.info("Done calculating number of mapped reads and number of UMIs per Cell Barcode, writing to %s", + par["nrReadsNrUMIsPerCB"]) + nr_reads_and_umi_per_barcode.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="CB")\\ + .to_csv(par["nrReadsNrUMIsPerCB"], sep="\\t", header=True, + index=False, columns=("WellBC", "WellID", "CB", "NumberOfReads", "nrUMIs")) + logger.info("Finished!") +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_INPUT" ]; then + VIASH_PAR_INPUT=$(ViashDockerStripAutomount "$VIASH_PAR_INPUT") + fi + if [ ! -z "$VIASH_PAR_PROCESSEDBAMFILE" ]; then + VIASH_PAR_PROCESSEDBAMFILE=$(ViashDockerStripAutomount "$VIASH_PAR_PROCESSEDBAMFILE") + fi + if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + VIASH_PAR_NRREADSNRGENESPERCHROM=$(ViashDockerStripAutomount "$VIASH_PAR_NRREADSNRGENESPERCHROM") + fi + if [ ! -z "$VIASH_PAR_NRREADSNRUMISPERCB" ]; then + VIASH_PAR_NRREADSNRUMISPERCB=$(ViashDockerStripAutomount "$VIASH_PAR_NRREADSNRUMISPERCB") + fi + if [ ! -z "$VIASH_PAR_UMIFREQTOP" ]; then + VIASH_PAR_UMIFREQTOP=$(ViashDockerStripAutomount "$VIASH_PAR_UMIFREQTOP") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_PROCESSEDBAMFILE" ] && [ ! -e "$VIASH_PAR_PROCESSEDBAMFILE" ]; then + ViashError "Output file '$VIASH_PAR_PROCESSEDBAMFILE' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_NRREADSNRGENESPERCHROM" ] && [ ! -e "$VIASH_PAR_NRREADSNRGENESPERCHROM" ]; then + ViashError "Output file '$VIASH_PAR_NRREADSNRGENESPERCHROM' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_NRREADSNRUMISPERCB" ] && [ ! -e "$VIASH_PAR_NRREADSNRUMISPERCB" ]; then + ViashError "Output file '$VIASH_PAR_NRREADSNRUMISPERCB' does not exist." + exit 1 +fi +if [ ! -z "$VIASH_PAR_UMIFREQTOP" ] && [ ! -e "$VIASH_PAR_UMIFREQTOP" ]; then + ViashError "Output file '$VIASH_PAR_UMIFREQTOP' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/executable/stats/generate_well_statistics/nextflow_labels.config b/target/executable/stats/generate_well_statistics/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/stats/generate_well_statistics/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/utils/save_params/.config.vsh.yaml b/target/executable/utils/save_params/.config.vsh.yaml new file mode 100644 index 00000000..4fc40747 --- /dev/null +++ b/target/executable/utils/save_params/.config.vsh.yaml @@ -0,0 +1,204 @@ +name: "save_params" +namespace: "utils" +version: "updatecraftbox" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "The id of the job\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--params_yaml" + description: "base64 encoded yaml containing the state\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output YAML file\n" + info: null + example: + - "output.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Save parameters to a YAML file\n" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/utils/save_params/config.vsh.yaml" + runner: "executable" + engine: "docker|native" + output: "target/executable/utils/save_params" + executable: "target/executable/utils/save_params/save_params" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/executable/utils/save_params/_viash.yaml b/target/executable/utils/save_params/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/executable/utils/save_params/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/executable/utils/save_params/nextflow_labels.config b/target/executable/utils/save_params/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/executable/utils/save_params/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/executable/utils/save_params/save_params b/target/executable/utils/save_params/save_params new file mode 100755 index 00000000..7377b03d --- /dev/null +++ b/target/executable/utils/save_params/save_params @@ -0,0 +1,1137 @@ +#!/usr/bin/env bash + +# save_params updatecraftbox +# +# This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +# work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +# Intuitive. +# +# The component may contain files which fall under a different license. The +# authors of this component should specify the license in the header of such +# files, or include a separate license file detailing the licenses of all included +# files. + +set -e + +if [ -z "$VIASH_TEMP" ]; then + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$VIASH_TMP} + VIASH_TEMP=${VIASH_TEMP:-$TMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TMP} + VIASH_TEMP=${VIASH_TEMP:-$TEMPDIR} + VIASH_TEMP=${VIASH_TEMP:-$TEMP} + VIASH_TEMP=${VIASH_TEMP:-/tmp} +fi + +# define helper functions +# ViashQuote: put quotes around non flag values +# $1 : unquoted string +# return : possibly quoted string +# examples: +# ViashQuote --foo # returns --foo +# ViashQuote bar # returns 'bar' +# Viashquote --foo=bar # returns --foo='bar' +function ViashQuote { + if [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+=.+$ ]]; then + echo "$1" | sed "s#=\(.*\)#='\1'#" + elif [[ "$1" =~ ^-+[a-zA-Z0-9_\-]+$ ]]; then + echo "$1" + else + echo "'$1'" + fi +} +# ViashRemoveFlags: Remove leading flag +# $1 : string with a possible leading flag +# return : string without possible leading flag +# examples: +# ViashRemoveFlags --foo=bar # returns bar +function ViashRemoveFlags { + echo "$1" | sed 's/^--*[a-zA-Z0-9_\-]*=//' +} +# ViashSourceDir: return the path of a bash file, following symlinks +# usage : ViashSourceDir ${BASH_SOURCE[0]} +# $1 : Should always be set to ${BASH_SOURCE[0]} +# returns : The absolute path of the bash file +function ViashSourceDir { + local source="$1" + while [ -h "$source" ]; do + local dir="$( cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd )" + source="$(readlink "$source")" + [[ $source != /* ]] && source="$dir/$source" + done + cd -P "$( dirname "$source" )" >/dev/null 2>&1 && pwd +} +# ViashFindTargetDir: return the path of the '.build.yaml' file, following symlinks +# usage : ViashFindTargetDir 'ScriptPath' +# $1 : The location from where to start the upward search +# returns : The absolute path of the '.build.yaml' file +function ViashFindTargetDir { + local source="$1" + while [[ "$source" != "" && ! -e "$source/.build.yaml" ]]; do + source=${source%/*} + done + echo $source +} +# see https://en.wikipedia.org/wiki/Syslog#Severity_level +VIASH_LOGCODE_EMERGENCY=0 +VIASH_LOGCODE_ALERT=1 +VIASH_LOGCODE_CRITICAL=2 +VIASH_LOGCODE_ERROR=3 +VIASH_LOGCODE_WARNING=4 +VIASH_LOGCODE_NOTICE=5 +VIASH_LOGCODE_INFO=6 +VIASH_LOGCODE_DEBUG=7 +VIASH_VERBOSITY=$VIASH_LOGCODE_NOTICE + +# ViashLog: Log events depending on the verbosity level +# usage: ViashLog 1 alert Oh no something went wrong! +# $1: required verbosity level +# $2: display tag +# $3+: messages to display +# stdout: Your input, prepended by '[$2] '. +function ViashLog { + local required_level="$1" + local display_tag="$2" + shift 2 + if [ $VIASH_VERBOSITY -ge $required_level ]; then + >&2 echo "[$display_tag]" "$@" + fi +} + +# ViashEmergency: log events when the system is unstable +# usage: ViashEmergency Oh no something went wrong. +# stdout: Your input, prepended by '[emergency] '. +function ViashEmergency { + ViashLog $VIASH_LOGCODE_EMERGENCY emergency "$@" +} + +# ViashAlert: log events when actions must be taken immediately (e.g. corrupted system database) +# usage: ViashAlert Oh no something went wrong. +# stdout: Your input, prepended by '[alert] '. +function ViashAlert { + ViashLog $VIASH_LOGCODE_ALERT alert "$@" +} + +# ViashCritical: log events when a critical condition occurs +# usage: ViashCritical Oh no something went wrong. +# stdout: Your input, prepended by '[critical] '. +function ViashCritical { + ViashLog $VIASH_LOGCODE_CRITICAL critical "$@" +} + +# ViashError: log events when an error condition occurs +# usage: ViashError Oh no something went wrong. +# stdout: Your input, prepended by '[error] '. +function ViashError { + ViashLog $VIASH_LOGCODE_ERROR error "$@" +} + +# ViashWarning: log potentially abnormal events +# usage: ViashWarning Something may have gone wrong. +# stdout: Your input, prepended by '[warning] '. +function ViashWarning { + ViashLog $VIASH_LOGCODE_WARNING warning "$@" +} + +# ViashNotice: log significant but normal events +# usage: ViashNotice This just happened. +# stdout: Your input, prepended by '[notice] '. +function ViashNotice { + ViashLog $VIASH_LOGCODE_NOTICE notice "$@" +} + +# ViashInfo: log normal events +# usage: ViashInfo This just happened. +# stdout: Your input, prepended by '[info] '. +function ViashInfo { + ViashLog $VIASH_LOGCODE_INFO info "$@" +} + +# ViashDebug: log all events, for debugging purposes +# usage: ViashDebug This just happened. +# stdout: Your input, prepended by '[debug] '. +function ViashDebug { + ViashLog $VIASH_LOGCODE_DEBUG debug "$@" +} + +# find source folder of this component +VIASH_META_RESOURCES_DIR=`ViashSourceDir ${BASH_SOURCE[0]}` + +# find the root of the built components & dependencies +VIASH_TARGET_DIR=`ViashFindTargetDir $VIASH_META_RESOURCES_DIR` + +# define meta fields +VIASH_META_NAME="save_params" +VIASH_META_FUNCTIONALITY_NAME="save_params" +VIASH_META_EXECUTABLE="$VIASH_META_RESOURCES_DIR/$VIASH_META_NAME" +VIASH_META_CONFIG="$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" +VIASH_META_TEMP_DIR="$VIASH_TEMP" + + + +# initialise variables +VIASH_MODE='run' +VIASH_ENGINE_ID='docker' + +######## Helper functions for setting up Docker images for viash ######## +# expects: ViashDockerBuild + +# ViashDockerInstallationCheck: check whether Docker is installed correctly +# +# examples: +# ViashDockerInstallationCheck +function ViashDockerInstallationCheck { + ViashDebug "Checking whether Docker is installed" + if [ ! command -v docker &> /dev/null ]; then + ViashCritical "Docker doesn't seem to be installed. See 'https://docs.docker.com/get-docker/' for instructions." + exit 1 + fi + + ViashDebug "Checking whether the Docker daemon is running" + local save=$-; set +e + local docker_version=$(docker version --format '{{.Client.APIVersion}}' 2> /dev/null) + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashCritical "Docker daemon does not seem to be running. Try one of the following:" + ViashCritical "- Try running 'dockerd' in the command line" + ViashCritical "- See https://docs.docker.com/config/daemon/" + exit 1 + fi +} + +# ViashDockerRemoteTagCheck: check whether a Docker image is available +# on a remote. Assumes `docker login` has been performed, if relevant. +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerRemoteTagCheck python:latest +# echo $? # returns '0' +# ViashDockerRemoteTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerRemoteTagCheck { + docker manifest inspect $1 > /dev/null 2> /dev/null +} + +# ViashDockerLocalTagCheck: check whether a Docker image is available locally +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# docker pull python:latest +# ViashDockerLocalTagCheck python:latest +# echo $? # returns '0' +# ViashDockerLocalTagCheck sdaizudceahifu +# echo $? # returns '1' +function ViashDockerLocalTagCheck { + [ -n "$(docker images -q $1)" ] +} + +# ViashDockerPull: pull a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPull python:latest +# echo $? # returns '0' +# ViashDockerPull sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPull { + ViashNotice "Checking if Docker image is available at '$1'" + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker pull $1 && return 0 || return 1 + else + local save=$-; set +e + docker pull $1 2> /dev/null > /dev/null + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashWarning "Could not pull from '$1'. Docker image doesn't exist or is not accessible." + fi + return $out + fi +} + +# ViashDockerPush: push a Docker image +# +# $1 : image identifier with format `[registry/]image[:tag]` +# exit code $? : whether or not the image was found +# examples: +# ViashDockerPush python:latest +# echo $? # returns '0' +# ViashDockerPush sdaizudceahifu +# echo $? # returns '1' +function ViashDockerPush { + ViashNotice "Pushing image to '$1'" + local save=$-; set +e + local out + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + docker push $1 + out=$? + else + docker push $1 2> /dev/null > /dev/null + out=$? + fi + [[ $save =~ e ]] && set -e + if [ $out -eq 0 ]; then + ViashNotice "Container '$1' push succeeded." + else + ViashError "Container '$1' push errored. You might not be logged in or have the necessary permissions." + fi + return $out +} + +# ViashDockerPullElseBuild: pull a Docker image, else build it +# +# $1 : image identifier with format `[registry/]image[:tag]` +# ViashDockerBuild : a Bash function which builds a docker image, takes image identifier as argument. +# examples: +# ViashDockerPullElseBuild mynewcomponent +function ViashDockerPullElseBuild { + local save=$-; set +e + ViashDockerPull $1 + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashDockerBuild $@ + fi +} + +# ViashDockerSetup: create a Docker image, according to specified docker setup strategy +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $2 : docker setup strategy, see DockerSetupStrategy.scala +# examples: +# ViashDockerSetup mynewcomponent alwaysbuild +function ViashDockerSetup { + local image_id="$1" + local setup_strategy="$2" + if [ "$setup_strategy" == "alwaysbuild" -o "$setup_strategy" == "build" -o "$setup_strategy" == "b" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspull" -o "$setup_strategy" == "pull" -o "$setup_strategy" == "p" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "alwayspullelsebuild" -o "$setup_strategy" == "pullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayspullelsecachedbuild" -o "$setup_strategy" == "pullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "alwayscachedbuild" -o "$setup_strategy" == "cachedbuild" -o "$setup_strategy" == "cb" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [[ "$setup_strategy" =~ ^ifneedbe ]]; then + local save=$-; set +e + ViashDockerLocalTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashInfo "Image $image_id already exists" + elif [ "$setup_strategy" == "ifneedbebuild" ]; then + ViashDockerBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbecachedbuild" ]; then + ViashDockerBuild $image_id $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepull" ]; then + ViashDockerPull $image_id + elif [ "$setup_strategy" == "ifneedbepullelsebuild" ]; then + ViashDockerPullElseBuild $image_id --no-cache $(ViashDockerBuildArgs "$engine_id") + elif [ "$setup_strategy" == "ifneedbepullelsecachedbuild" ]; then + ViashDockerPullElseBuild $image_id $(ViashDockerBuildArgs "$engine_id") + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi + elif [ "$setup_strategy" == "push" -o "$setup_strategy" == "forcepush" -o "$setup_strategy" == "alwayspush" ]; then + ViashDockerPush "$image_id" + elif [ "$setup_strategy" == "pushifnotpresent" -o "$setup_strategy" == "gentlepush" -o "$setup_strategy" == "maybepush" ]; then + local save=$-; set +e + ViashDockerRemoteTagCheck $image_id + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -eq 0 ]; then + ViashNotice "Container '$image_id' exists, doing nothing." + else + ViashNotice "Container '$image_id' does not yet exist." + ViashDockerPush "$image_id" + fi + elif [ "$setup_strategy" == "donothing" -o "$setup_strategy" == "meh" ]; then + ViashNotice "Skipping setup." + else + ViashError "Unrecognised Docker strategy: $setup_strategy" + exit 1 + fi +} + +# ViashDockerCheckCommands: Check whether a docker container has the required commands +# +# $1 : image identifier with format `[registry/]image[:tag]` +# $@ : commands to verify being present +# examples: +# ViashDockerCheckCommands bash:4.0 bash ps foo +function ViashDockerCheckCommands { + local image_id="$1" + shift 1 + local commands="$@" + local save=$-; set +e + local missing # mark 'missing' as local in advance, otherwise the exit code of the command will be missing and always be '0' + missing=$(docker run --rm --entrypoint=sh "$image_id" -c "for command in $commands; do command -v \$command >/dev/null 2>&1; if [ \$? -ne 0 ]; then echo \$command; exit 1; fi; done") + local outCheck=$? + [[ $save =~ e ]] && set -e + if [ $outCheck -ne 0 ]; then + ViashError "Docker container '$image_id' does not contain command '$missing'." + exit 1 + fi +} + +# ViashDockerBuild: build a docker image +# $1 : image identifier with format `[registry/]image[:tag]` +# $... : additional arguments to pass to docker build +# $VIASH_META_TEMP_DIR : temporary directory to store dockerfile & optional resources in +# $VIASH_META_NAME : name of the component +# $VIASH_META_RESOURCES_DIR : directory containing the resources +# $VIASH_VERBOSITY : verbosity level +# exit code $? : whether or not the image was built successfully +function ViashDockerBuild { + local image_id="$1" + shift 1 + + # create temporary directory to store dockerfile & optional resources in + local tmpdir=$(mktemp -d "$VIASH_META_TEMP_DIR/dockerbuild-$VIASH_META_NAME-XXXXXX") + local dockerfile="$tmpdir/Dockerfile" + function clean_up { + rm -rf "$tmpdir" + } + trap clean_up EXIT + + # store dockerfile and resources + ViashDockerfile "$VIASH_ENGINE_ID" > "$dockerfile" + + # generate the build command + local docker_build_cmd="docker build -t '$image_id' $@ '$VIASH_META_RESOURCES_DIR' -f '$dockerfile'" + + # build the container + ViashNotice "Building container '$image_id' with Dockerfile" + ViashInfo "$docker_build_cmd" + local save=$-; set +e + if [ $VIASH_VERBOSITY -ge $VIASH_LOGCODE_INFO ]; then + eval $docker_build_cmd + else + eval $docker_build_cmd &> "$tmpdir/docker_build.log" + fi + + # check exit code + local out=$? + [[ $save =~ e ]] && set -e + if [ $out -ne 0 ]; then + ViashError "Error occurred while building container '$image_id'" + if [ $VIASH_VERBOSITY -lt $VIASH_LOGCODE_INFO ]; then + ViashError "Transcript: --------------------------------" + cat "$tmpdir/docker_build.log" + ViashError "End of transcript --------------------------" + fi + exit 1 + fi +} + +######## End of helper functions for setting up Docker images for viash ######## + +# ViashDockerFile: print the dockerfile to stdout +# $1 : engine identifier +# return : dockerfile required to run this component +# examples: +# ViashDockerFile +function ViashDockerfile { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + cat << 'VIASHDOCKER' +FROM python:3.12-slim +ENTRYPOINT [] +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y procps && \ + rm -rf /var/lib/apt/lists/* + +RUN pip install --upgrade pip && \ + pip install --upgrade --no-cache-dir "pyyaml" + +LABEL org.opencontainers.image.description="Companion container for running component utils save_params" +LABEL org.opencontainers.image.created="2025-09-02T13:33:10Z" +LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq" +LABEL org.opencontainers.image.revision="e6da525fc57aaec74f348eb974b68faa647bf800" +LABEL org.opencontainers.image.version="updatecraftbox" + +VIASHDOCKER + fi +} + +# ViashDockerBuildArgs: return the arguments to pass to docker build +# $1 : engine identifier +# return : arguments to pass to docker build +function ViashDockerBuildArgs { + local engine_id="$1" + + if [[ "$engine_id" == "docker" ]]; then + echo "" + fi +} + +# ViashAbsolutePath: generate absolute path from relative path +# borrowed from https://stackoverflow.com/a/21951256 +# $1 : relative filename +# return : absolute path +# examples: +# ViashAbsolutePath some_file.txt # returns /path/to/some_file.txt +# ViashAbsolutePath /foo/bar/.. # returns /foo +function ViashAbsolutePath { + local thePath + local parr + local outp + local len + if [[ ! "$1" =~ ^/ ]]; then + thePath="$PWD/$1" + else + thePath="$1" + fi + echo "$thePath" | ( + IFS=/ + read -a parr + declare -a outp + for i in "${parr[@]}"; do + case "$i" in + ''|.) continue ;; + ..) + len=${#outp[@]} + if ((len==0)); then + continue + else + unset outp[$((len-1))] + fi + ;; + *) + len=${#outp[@]} + outp[$len]="$i" + ;; + esac + done + echo /"${outp[*]}" + ) +} +# ViashDockerAutodetectMount: auto configuring docker mounts from parameters +# $1 : The parameter value +# returns : New parameter +# $VIASH_DIRECTORY_MOUNTS : Added another parameter to be passed to docker +# $VIASH_DOCKER_AUTOMOUNT_PREFIX : The prefix to be used for the automounts +# examples: +# ViashDockerAutodetectMount /path/to/bar # returns '/viash_automount/path/to/bar' +# ViashDockerAutodetectMountArg /path/to/bar # returns '--volume="/path/to:/viash_automount/path/to"' +function ViashDockerAutodetectMount { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + if [ -z "$base_name" ]; then + echo "$mount_target" + else + echo "$mount_target/$base_name" + fi +} +function ViashDockerAutodetectMountArg { + local abs_path=$(ViashAbsolutePath "$1") + local mount_source + local base_name + if [ -d "$abs_path" ]; then + mount_source="$abs_path" + base_name="" + else + mount_source=`dirname "$abs_path"` + base_name=`basename "$abs_path"` + fi + local mount_target="$VIASH_DOCKER_AUTOMOUNT_PREFIX$mount_source" + ViashDebug "ViashDockerAutodetectMountArg $1 -> $mount_source -> $mount_target" + echo "--volume=\"$mount_source:$mount_target\"" +} +function ViashDockerStripAutomount { + local abs_path=$(ViashAbsolutePath "$1") + echo "${abs_path#$VIASH_DOCKER_AUTOMOUNT_PREFIX}" +} +# initialise variables +VIASH_DIRECTORY_MOUNTS=() + +# configure default docker automount prefix if it is unset +if [ -z "${VIASH_DOCKER_AUTOMOUNT_PREFIX+x}" ]; then + VIASH_DOCKER_AUTOMOUNT_PREFIX="/viash_automount" +fi + +# initialise docker variables +VIASH_DOCKER_RUN_ARGS=(-i --rm) + + +# ViashHelp: Display helpful explanation about this executable +function ViashHelp { + echo "save_params updatecraftbox" + echo "" + echo "Save parameters to a YAML file" + echo "" + echo "Inputs:" + echo " --id" + echo " type: string, required parameter" + echo " The id of the job" + echo "" + echo " --params_yaml" + echo " type: string, required parameter" + echo " base64 encoded yaml containing the state" + echo "" + echo "Outputs:" + echo " --output" + echo " type: file, required parameter, output, file must exist" + echo " example: output.yaml" + echo " The output YAML file" + echo "" + echo "Viash built in Computational Requirements:" + echo " ---cpus=INT" + echo " Number of CPUs to use" + echo " ---memory=STRING" + echo " Amount of memory to use. Examples: 4GB, 3MiB." + echo "" + echo "Viash built in Docker:" + echo " ---setup=STRATEGY" + echo " Setup the docker container. Options are: alwaysbuild, alwayscachedbuild, ifneedbebuild, ifneedbecachedbuild, alwayspull, alwayspullelsebuild, alwayspullelsecachedbuild, ifneedbepull, ifneedbepullelsebuild, ifneedbepullelsecachedbuild, push, pushifnotpresent, donothing." + echo " Default: ifneedbepullelsecachedbuild" + echo " ---dockerfile" + echo " Print the dockerfile to stdout." + echo " ---docker_run_args=ARG" + echo " Provide runtime arguments to Docker. See the documentation on \`docker run\` for more information." + echo " ---docker_image_id" + echo " Print the docker image id to stdout." + echo " ---debug" + echo " Enter the docker container for debugging purposes." + echo "" + echo "Viash built in Engines:" + echo " ---engine=ENGINE_ID" + echo " Specify the engine to use. Options are: docker, native." + echo " Default: docker" +} + +# initialise array +VIASH_POSITIONAL_ARGS='' + +while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + ViashHelp + exit + ;; + ---v|---verbose) + let "VIASH_VERBOSITY=VIASH_VERBOSITY+1" + shift 1 + ;; + ---verbosity) + VIASH_VERBOSITY="$2" + shift 2 + ;; + ---verbosity=*) + VIASH_VERBOSITY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + --version) + echo "save_params updatecraftbox" + exit + ;; + --id) + [ -n "$VIASH_PAR_ID" ] && ViashError Bad arguments for option \'--id\': \'$VIASH_PAR_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --id. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --id=*) + [ -n "$VIASH_PAR_ID" ] && ViashError Bad arguments for option \'--id=*\': \'$VIASH_PAR_ID\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_ID=$(ViashRemoveFlags "$1") + shift 1 + ;; + --params_yaml) + [ -n "$VIASH_PAR_PARAMS_YAML" ] && ViashError Bad arguments for option \'--params_yaml\': \'$VIASH_PAR_PARAMS_YAML\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PARAMS_YAML="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --params_yaml. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --params_yaml=*) + [ -n "$VIASH_PAR_PARAMS_YAML" ] && ViashError Bad arguments for option \'--params_yaml=*\': \'$VIASH_PAR_PARAMS_YAML\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_PARAMS_YAML=$(ViashRemoveFlags "$1") + shift 1 + ;; + --output) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to --output. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + --output=*) + [ -n "$VIASH_PAR_OUTPUT" ] && ViashError Bad arguments for option \'--output=*\': \'$VIASH_PAR_OUTPUT\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_PAR_OUTPUT=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---engine) + VIASH_ENGINE_ID="$2" + shift 2 + ;; + ---engine=*) + VIASH_ENGINE_ID="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---setup) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$2" + shift 2 + ;; + ---setup=*) + VIASH_MODE='setup' + VIASH_SETUP_STRATEGY="$(ViashRemoveFlags "$1")" + shift 1 + ;; + ---dockerfile) + VIASH_MODE='dockerfile' + shift 1 + ;; + ---docker_run_args) + VIASH_DOCKER_RUN_ARGS+=("$2") + shift 2 + ;; + ---docker_run_args=*) + VIASH_DOCKER_RUN_ARGS+=("$(ViashRemoveFlags "$1")") + shift 1 + ;; + ---docker_image_id) + VIASH_MODE='docker_image_id' + shift 1 + ;; + ---debug) + VIASH_MODE='debug' + shift 1 + ;; + ---cpus) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---cpus. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---cpus=*) + [ -n "$VIASH_META_CPUS" ] && ViashError Bad arguments for option \'---cpus=*\': \'$VIASH_META_CPUS\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_CPUS=$(ViashRemoveFlags "$1") + shift 1 + ;; + ---memory) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY="$2" + [ $# -lt 2 ] && ViashError Not enough arguments passed to ---memory. Use "--help" to get more information on the parameters. && exit 1 + shift 2 + ;; + ---memory=*) + [ -n "$VIASH_META_MEMORY" ] && ViashError Bad arguments for option \'---memory=*\': \'$VIASH_META_MEMORY\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1 + VIASH_META_MEMORY=$(ViashRemoveFlags "$1") + shift 1 + ;; + *) # positional arg or unknown option + # since the positional args will be eval'd, can we always quote, instead of using ViashQuote + VIASH_POSITIONAL_ARGS="$VIASH_POSITIONAL_ARGS '$1'" + [[ $1 == -* ]] && ViashWarning $1 looks like a parameter but is not a defined parameter and will instead be treated as a positional argument. Use "--help" to get more information on the parameters. + shift # past argument + ;; + esac +done + +# parse positional parameters +eval set -- $VIASH_POSITIONAL_ARGS + + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + VIASH_ENGINE_TYPE='native' +elif [ "$VIASH_ENGINE_ID" == "docker" ] ; then + VIASH_ENGINE_TYPE='docker' +else + ViashError "Engine '$VIASH_ENGINE_ID' is not recognized. Options are: docker, native." + exit 1 +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # check if docker is installed properly + ViashDockerInstallationCheck + + # determine docker image id + if [[ "$VIASH_ENGINE_ID" == 'docker' ]]; then + VIASH_DOCKER_IMAGE_ID='images.viash-hub.com/vsh/htrnaseq/utils/save_params:updatecraftbox' + fi + + # print dockerfile + if [ "$VIASH_MODE" == "dockerfile" ]; then + ViashDockerfile "$VIASH_ENGINE_ID" + exit 0 + + elif [ "$VIASH_MODE" == "docker_image_id" ]; then + echo "$VIASH_DOCKER_IMAGE_ID" + exit 0 + + # enter docker container + elif [[ "$VIASH_MODE" == "debug" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} -v '$(pwd)':/pwd --workdir /pwd -t $VIASH_DOCKER_IMAGE_ID" + ViashNotice "+ $VIASH_CMD" + eval $VIASH_CMD + exit + + # build docker image + elif [ "$VIASH_MODE" == "setup" ]; then + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" "$VIASH_SETUP_STRATEGY" + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' + exit 0 + fi + + # check if docker image exists + ViashDockerSetup "$VIASH_DOCKER_IMAGE_ID" ifneedbepullelsecachedbuild + ViashDockerCheckCommands "$VIASH_DOCKER_IMAGE_ID" 'ps' 'bash' +fi + +# setting computational defaults + +# helper function for parsing memory strings +function ViashMemoryAsBytes { + local memory=`echo "$1" | tr '[:upper:]' '[:lower:]' | tr -d '[:space:]'` + local memory_regex='^([0-9]+)([kmgtp]i?b?|b)$' + if [[ $memory =~ $memory_regex ]]; then + local number=${memory/[^0-9]*/} + local symbol=${memory/*[0-9]/} + + case $symbol in + b) memory_b=$number ;; + kb|k) memory_b=$(( $number * 1000 )) ;; + mb|m) memory_b=$(( $number * 1000 * 1000 )) ;; + gb|g) memory_b=$(( $number * 1000 * 1000 * 1000 )) ;; + tb|t) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 )) ;; + pb|p) memory_b=$(( $number * 1000 * 1000 * 1000 * 1000 * 1000 )) ;; + kib|ki) memory_b=$(( $number * 1024 )) ;; + mib|mi) memory_b=$(( $number * 1024 * 1024 )) ;; + gib|gi) memory_b=$(( $number * 1024 * 1024 * 1024 )) ;; + tib|ti) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 )) ;; + pib|pi) memory_b=$(( $number * 1024 * 1024 * 1024 * 1024 * 1024 )) ;; + esac + echo "$memory_b" + fi +} +# compute memory in different units +if [ ! -z ${VIASH_META_MEMORY+x} ]; then + VIASH_META_MEMORY_B=`ViashMemoryAsBytes $VIASH_META_MEMORY` + # do not define other variables if memory_b is an empty string + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_META_MEMORY_KB=$(( ($VIASH_META_MEMORY_B+999) / 1000 )) + VIASH_META_MEMORY_MB=$(( ($VIASH_META_MEMORY_KB+999) / 1000 )) + VIASH_META_MEMORY_GB=$(( ($VIASH_META_MEMORY_MB+999) / 1000 )) + VIASH_META_MEMORY_TB=$(( ($VIASH_META_MEMORY_GB+999) / 1000 )) + VIASH_META_MEMORY_PB=$(( ($VIASH_META_MEMORY_TB+999) / 1000 )) + VIASH_META_MEMORY_KIB=$(( ($VIASH_META_MEMORY_B+1023) / 1024 )) + VIASH_META_MEMORY_MIB=$(( ($VIASH_META_MEMORY_KIB+1023) / 1024 )) + VIASH_META_MEMORY_GIB=$(( ($VIASH_META_MEMORY_MIB+1023) / 1024 )) + VIASH_META_MEMORY_TIB=$(( ($VIASH_META_MEMORY_GIB+1023) / 1024 )) + VIASH_META_MEMORY_PIB=$(( ($VIASH_META_MEMORY_TIB+1023) / 1024 )) + else + # unset memory if string is empty + unset $VIASH_META_MEMORY_B + fi +fi +# unset nproc if string is empty +if [ -z "$VIASH_META_CPUS" ]; then + unset $VIASH_META_CPUS +fi + + +# check whether required parameters exist +if [ -z ${VIASH_PAR_ID+x} ]; then + ViashError '--id' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_PARAMS_YAML+x} ]; then + ViashError '--params_yaml' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_PAR_OUTPUT+x} ]; then + ViashError '--output' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_NAME+x} ]; then + ViashError 'name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then + ViashError 'functionality_name' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_RESOURCES_DIR+x} ]; then + ViashError 'resources_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_EXECUTABLE+x} ]; then + ViashError 'executable' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_CONFIG+x} ]; then + ViashError 'config' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi +if [ -z ${VIASH_META_TEMP_DIR+x} ]; then + ViashError 'temp_dir' is a required argument. Use "--help" to get more information on the parameters. + exit 1 +fi + +# check whether parameters values are of the right type +if [[ -n "$VIASH_META_CPUS" ]]; then + if ! [[ "$VIASH_META_CPUS" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'cpus' has to be an integer. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_B" ]]; then + if ! [[ "$VIASH_META_MEMORY_B" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_b' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pb' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_KIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_KIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_kib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_MIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_MIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_mib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_GIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_GIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_gib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_TIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_TIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_tib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi +if [[ -n "$VIASH_META_MEMORY_PIB" ]]; then + if ! [[ "$VIASH_META_MEMORY_PIB" =~ ^[-+]?[0-9]+$ ]]; then + ViashError 'memory_pib' has to be a long. Use "--help" to get more information on the parameters. + exit 1 + fi +fi + +# create parent directories of output files, if so desired +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -d "$(dirname "$VIASH_PAR_OUTPUT")" ]; then + mkdir -p "$(dirname "$VIASH_PAR_OUTPUT")" +fi + +if [ "$VIASH_ENGINE_ID" == "native" ] ; then + if [ "$VIASH_MODE" == "run" ]; then + VIASH_CMD="bash" + else + ViashError "Engine '$VIASH_ENGINE_ID' does not support mode '$VIASH_MODE'." + exit 1 + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # detect volumes from file arguments + VIASH_CHOWN_VARS=() +if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_OUTPUT")" ) + VIASH_PAR_OUTPUT=$(ViashDockerAutodetectMount "$VIASH_PAR_OUTPUT") + VIASH_CHOWN_VARS+=( "$VIASH_PAR_OUTPUT" ) +fi +if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_RESOURCES_DIR")" ) + VIASH_META_RESOURCES_DIR=$(ViashDockerAutodetectMount "$VIASH_META_RESOURCES_DIR") +fi +if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_EXECUTABLE")" ) + VIASH_META_EXECUTABLE=$(ViashDockerAutodetectMount "$VIASH_META_EXECUTABLE") +fi +if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_CONFIG")" ) + VIASH_META_CONFIG=$(ViashDockerAutodetectMount "$VIASH_META_CONFIG") +fi +if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_META_TEMP_DIR")" ) + VIASH_META_TEMP_DIR=$(ViashDockerAutodetectMount "$VIASH_META_TEMP_DIR") +fi + + # get unique mounts + VIASH_UNIQUE_MOUNTS=($(for val in "${VIASH_DIRECTORY_MOUNTS[@]}"; do echo "$val"; done | sort -u)) +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # change file ownership + function ViashPerformChown { + if (( ${#VIASH_CHOWN_VARS[@]} )); then + set +e + VIASH_CMD="docker run --entrypoint=bash --rm ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID -c 'chown $(id -u):$(id -g) --silent --recursive ${VIASH_CHOWN_VARS[@]}'" + ViashDebug "+ $VIASH_CMD" + eval $VIASH_CMD + set -e + fi + } + trap ViashPerformChown EXIT +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # helper function for filling in extra docker args + if [ ! -z "$VIASH_META_MEMORY_B" ]; then + VIASH_DOCKER_RUN_ARGS+=("--memory=${VIASH_META_MEMORY_B}") + fi + if [ ! -z "$VIASH_META_CPUS" ]; then + VIASH_DOCKER_RUN_ARGS+=("--cpus=${VIASH_META_CPUS}") + fi +fi + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + VIASH_CMD="docker run --entrypoint=bash ${VIASH_DOCKER_RUN_ARGS[@]} ${VIASH_UNIQUE_MOUNTS[@]} $VIASH_DOCKER_IMAGE_ID" +fi + + +# set dependency paths + + +ViashDebug "Running command: $(echo $VIASH_CMD)" +cat << VIASHEOF | eval $VIASH_CMD +set -e +tempscript=\$(mktemp "$VIASH_META_TEMP_DIR/viash-run-save_params-XXXXXX").py +function clean_up { + rm "\$tempscript" +} +function interrupt { + echo -e "\nCTRL-C Pressed..." + exit 1 +} +trap clean_up EXIT +trap interrupt INT SIGINT +cat > "\$tempscript" << 'VIASHMAIN' +import re +import yaml +import base64 + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'id': $( if [ ! -z ${VIASH_PAR_ID+x} ]; then echo "r'${VIASH_PAR_ID//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'params_yaml': $( if [ ! -z ${VIASH_PAR_PARAMS_YAML+x} ]; then echo "r'${VIASH_PAR_PARAMS_YAML//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\'/\'\"\'\"r\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\'/\'\"\'\"r\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\'/\'\"\'\"r\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\'/\'\"\'\"r\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +class Dumper(yaml.Dumper): + def increase_indent(self, flow=False, indentless=False): + return super(Dumper, self).increase_indent(flow, False) + +def decode_params_yaml(encoded_yaml): + yaml_bytes = base64.b64decode(encoded_yaml) + yaml_string = yaml_bytes.decode('utf-8') + yaml_data = yaml.safe_load(yaml_string) + + return yaml_data + +params = decode_params_yaml(par['params_yaml']) + +with open(par["output"], 'w') as f: + yaml.dump(params, f, default_flow_style=False, Dumper=Dumper) +VIASHMAIN +python -B "\$tempscript" & +wait "\$!" + +VIASHEOF + + +if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then + # strip viash automount from file paths + + if [ ! -z "$VIASH_PAR_OUTPUT" ]; then + VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT") + fi + if [ ! -z "$VIASH_META_RESOURCES_DIR" ]; then + VIASH_META_RESOURCES_DIR=$(ViashDockerStripAutomount "$VIASH_META_RESOURCES_DIR") + fi + if [ ! -z "$VIASH_META_EXECUTABLE" ]; then + VIASH_META_EXECUTABLE=$(ViashDockerStripAutomount "$VIASH_META_EXECUTABLE") + fi + if [ ! -z "$VIASH_META_CONFIG" ]; then + VIASH_META_CONFIG=$(ViashDockerStripAutomount "$VIASH_META_CONFIG") + fi + if [ ! -z "$VIASH_META_TEMP_DIR" ]; then + VIASH_META_TEMP_DIR=$(ViashDockerStripAutomount "$VIASH_META_TEMP_DIR") + fi +fi + + +# check whether required files exist +if [ ! -z "$VIASH_PAR_OUTPUT" ] && [ ! -e "$VIASH_PAR_OUTPUT" ]; then + ViashError "Output file '$VIASH_PAR_OUTPUT' does not exist." + exit 1 +fi + + +exit 0 diff --git a/target/nextflow/eset/create_eset/.config.vsh.yaml b/target/nextflow/eset/create_eset/.config.vsh.yaml new file mode 100644 index 00000000..fc2fc06c --- /dev/null +++ b/target/nextflow/eset/create_eset/.config.vsh.yaml @@ -0,0 +1,259 @@ +name: "create_eset" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "author" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--pDataFile" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--fDataFile" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--mappingDir" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--poolName" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + info: null + default: + - "eset.$id.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "pData.tsv" +- type: "file" + path: "fData.tsv" +- type: "file" + path: "mapping_dir" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "data.table" + - "nlcv" + bioc: + - "Seurat" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + cran: + - "testthat" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_eset/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/eset/create_eset" + executable: "target/nextflow/eset/create_eset/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/eset/create_eset/_viash.yaml b/target/nextflow/eset/create_eset/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/eset/create_eset/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/eset/create_eset/main.nf b/target/nextflow/eset/create_eset/main.nf new file mode 100644 index 00000000..96a50262 --- /dev/null +++ b/target/nextflow/eset/create_eset/main.nf @@ -0,0 +1,4329 @@ +// create_eset updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Marijke Van Moerbeke (author) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "create_eset", + "namespace" : "eset", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "author" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--pDataFile", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--fDataFile", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--mappingDir", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--poolName", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "default" : [ + "eset.$id.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "test_resources" : [ + { + "type" : "r_script", + "path" : "test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "test_data/pData.tsv" + }, + { + "type" : "file", + "path" : "test_data/fData.tsv" + }, + { + "type" : "file", + "path" : "test_data/mapping_dir" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:24.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "r", + "cran" : [ + "data.table", + "nlcv" + ], + "bioc" : [ + "Seurat" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ], + "test_setup" : [ + { + "type" : "r", + "cran" : [ + "testthat" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/eset/create_eset/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/eset/create_eset", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +library(Biobase) +library(data.table) +library(nlcv) +library(Matrix) +library(Seurat) + +### VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "pDataFile" = $( if [ ! -z ${VIASH_PAR_PDATAFILE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_PDATAFILE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "fDataFile" = $( if [ ! -z ${VIASH_PAR_FDATAFILE+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_FDATAFILE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "mappingDir" = $( if [ ! -z ${VIASH_PAR_MAPPINGDIR+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_MAPPINGDIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "poolName" = $( if [ ! -z ${VIASH_PAR_POOLNAME+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_POOLNAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "output" = $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +### VIASH END + + +Read10X <- function(data_dir = NULL, gene_column = 2, unique_features = TRUE) { + full.data <- list() + for (i in seq_along(along.with = data_dir)) { + run <- data_dir[i] + if (!dir.exists(paths = run)) { + stop("Directory provided does not exist") + } + barcode.loc <- file.path(run, "barcodes.tsv") + gene.loc <- file.path(run, "features.tsv") + features.loc <- file.path(run, "features.tsv.gz") + matrix.loc <- file.path(run, "matrix.mtx") + pre_ver_3 <- file.exists(gene.loc) + if (!pre_ver_3) { + addgz <- function(s) { + return(paste0(s, ".gz")) + } + barcode.loc <- addgz(s = barcode.loc) + matrix.loc <- addgz(s = matrix.loc) + } + if (!file.exists(barcode.loc)) { + stop("Barcode file missing") + } + if (!pre_ver_3 && !file.exists(features.loc)) { + stop("Gene name or features file missing") + } + if (!file.exists(matrix.loc)) { + stop("Expression matrix file missing") + } + data <- readMM(file = matrix.loc) + cell.names <- readLines(barcode.loc) + if (all(grepl(pattern = "\\\\\\\\-1\\$", x = cell.names))) { + cell.names <- as.vector(x = as.character(x = sapply(X = cell.names, + FUN = ExtractField, field = 1, delim = "-"))) + } + if (is.null(x = names(x = data_dir))) { + if (i < 2) { + colnames(x = data) <- cell.names + } + else { + colnames(x = data) <- paste0(i, "_", cell.names) + } + } + else { + colnames(x = data) <- paste0(names(x = data_dir)[i], + "_", cell.names) + } + feature.names <- read.delim(file = ifelse(test = pre_ver_3, + yes = gene.loc, no = features.loc), header = FALSE, + stringsAsFactors = FALSE) + if (any(is.na(x = feature.names[, gene_column]))) { + warning("Some features names are NA. Replacing NA names with ID from the opposite column requested", + call. = FALSE, immediate. = TRUE) + na.features <- which(x = is.na(x = feature.names[, + gene_column])) + replacement.column <- ifelse(test = gene_column == + 2, yes = 1, no = 2) + feature.names[na.features, gene_column] <- feature.names[na.features, + replacement.column] + } + if (unique_features) { + fcols = ncol(x = feature.names) + if (fcols < gene_column) { + stop(paste0("gene_column was set to ", gene_column, + " but feature.tsv.gz (or genes.tsv) only has ", + fcols, " columns.", " Try setting the gene_column ", + "argument to a value <= to ", + fcols, ".")) + } + rownames(x = data) <- make.unique(names = feature.names[, + gene_column]) + } + if (ncol(x = feature.names) > 2) { + data_types <- factor(x = feature.names\\$V3) + lvls <- levels(x = data_types) + if (length(x = lvls) > 1 && length(x = full.data) == 0) { + message(paste0("10X data contains more than one type and is ", + "being returned as a list containing matrices ", + "of each type.")) + } + expr_name <- "Gene Expression" + if (expr_name %in% lvls) { + lvls <- c(expr_name, lvls[-which(x = lvls == + expr_name)]) + } + data <- lapply(X = lvls, FUN = function(l) { + return(data[data_types == l, , drop = FALSE]) + }) + names(x = data) <- lvls + } else { + data <- list(data) + } + full.data[[length(x = full.data) + 1]] <- data + } + list_of_data <- list() + for (j in 1:length(x = full.data[[1]])) { + list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, + FUN = \\`[[\\`, j)) + list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix") + } + names(x = list_of_data) <- names(x = full.data[[1]]) + if (length(x = list_of_data) == 1) { + return(list_of_data[[1]]) + } else { + return(list_of_data) + } +} + +match_features <- function(exprs_matrix, fdata) { + + identical_features <- all(rownames(exprs_matrix) == rownames(fdata)) + + if (nrow(exprs_matrix) != nrow(fdata) || !identical_features) { + message(paste0("Features in 'fData' and expression matrix differ. ", + "Only matching features are returned.")) + } + + features <- intersect(rownames(exprs_matrix), rownames(fdata)) + exprs_matrix <- exprs_matrix[which(rownames(exprs_matrix) %in% features), ] + fdata <- fdata[which(rownames(fdata) %in% features), ] + + fdata[, seq_len(ncol(fdata))] <- lapply(fdata[, seq_len(ncol(fdata)), drop = FALSE], as.character) + # order features in exprs mat according to fdata + exprs_matrix <- exprs_matrix[match(rownames(fdata), rownames(exprs_matrix)), ] + + list(exprs_matrix = exprs_matrix, fdata = fdata) + +} + + +create_pdata <- function(sample_file, pool_name, barcodes) { + cols_to_remove <- c("SampleFileName", "Output", "Measure", "Strandedness") + pData <- sample_file[, !colnames(sample_file) %in% cols_to_remove, + drop = FALSE] + rownames(pData) <- lapply(sample_file\\$WellBC, + \\\\(x) paste(pool_name, x, sep = "_")) + # pData[, ] <- lapply(pData, as.factor) + pData\\$PoolName <- pool_name + pData <- pData[match(barcodes, pData\\$WellBC), ] + return(pData) +} + +check_sample_file <- function(mapping_dir, sample_file){ + + message("Checking sample annotation:") + + requireNamespace("tools") + mapping_dir <- unlist(lapply(mapping_dir, function(x) { + if (!dir.exists(x)) { + stop(sprintf(paste0("Could not find directory ", + "provided in 'mappingDir' argument (%s)."), x)) + } + tools::file_path_as_absolute(x) + })) + + + # additional check for STARsolo + check_STARsolo_output <- function(x) { + files <- c("barcodes.tsv", "features.tsv", "matrix.mtx") + test <- list.files(x) %in% c(files, paste0(files, ".gz")) + length(test) != 0 && all(test) + } + + + if (!"WellBC" %in% colnames(sample_file)) { + stop(paste0("STARsolo output is used. The sample annotation must ", + "contain 'WellBC' column providing cell barcodes.")) + } + + mapping_dir <- unique(mapping_dir) + all_STARsolo_files_present <- all( + unlist( + lapply(mapping_dir, function(x) { + check_STARsolo_output(x) + }) + ) + ) + if (!all_STARsolo_files_present) { + stop(paste0("Could not find files: 'barcodes', 'features' and 'matrix'", + " for STARsolo output. Please check 'mappingDir' argument.")) + } + + message("- 'SampleFileName' column - OK") + + + + list(sample_expression_files = mapping_dir) +} + +create_exprs_matrix <- function(exprs_matrix_path, exprs_file_paths, + output, measure, col_names, cell_barcodes) { + + read_matrix <- Read10X(data_dir = exprs_file_paths, gene_column = 1) + # keep index of feature names containing "_" because Seurat + #changes them to "-" and they no longer match with fdata[, "gene_id"] + idx <- grep("_", rownames(read_matrix)) + + requireNamespace("Seurat") + seurat_object <- Seurat::CreateSeuratObject(counts = read_matrix) + + exprs_matrix <- as.matrix(seurat_object[['RNA']]\\$counts) + # replace "-" with "_" for features with "_" + # before converting to Seurat object + rownames(exprs_matrix)[idx] <- gsub("-", "_", rownames(exprs_matrix)[idx]) + requireNamespace("stringr") + exprs_matrix <- exprs_matrix[, stringr::str_detect(colnames(exprs_matrix), + paste(cell_barcodes, collapse = "|"))] + + + # check if rownames are ENSEMBL and remove version suffix + isENSEMBL <- all(grepl("ENS", rownames(exprs_matrix))) + if (isENSEMBL) { + # do not use gsub("(.+)[.]\\\\\\\\d+", "\\\\\\\\1", rownames(exprs_matrix)), + # so that ENS000000.1_PAR_Y can be kept + rownames(exprs_matrix) <- gsub("\\\\\\\\.\\\\\\\\d+\\$", "", rownames(exprs_matrix)) + } + + + colnames(exprs_matrix) <- col_names + + exprs_matrix +} + +create_eset <- function(feature_annotation_path, + sample_annotation_path, + mapping_dir, + barcodes, + output_path, + pool_name, + exprs_matrix_path = NULL, + path = NULL, + add_eset_annotation = NULL) { + if (!file.exists(feature_annotation_path)) { + stop("Could not find feature annotation at '", feature_annotation_path, "'") + } + + if (!file.exists(sample_annotation_path)) { + stop("Could not find sample annotation at '", sample_annotation_path, "'") + } + + if(!is.null(exprs_matrix_path)) { + if(!file.exists(exprs_matrix_path)) { + stop("Could not find expression matrix at '", exprs_matrix_path, "'") + } + } + + if(!is.null(path)) { + if(!dir.exists(path)) { + stop("Provided 'path': '", path, "' does not exist.") + } + } + + ##### Import annotation files ##### + message("Importing feature annotation") + fdata_file <- read.table(feature_annotation_path, header = TRUE, + sep = "\\\\t", quote = "\\\\"", + comment.char = "", stringsAsFactors = FALSE) + + # for backwards compatibility + if("ENSEMBL" %in% colnames(fdata_file) && !all(grepl("ENS", fdata_file[, "ENSEMBL"])) & !"gene_id" %in% colnames(fdata_file)) { + colnames(fdata_file)[which(colnames(fdata_file) == "ENSEMBL")] <- "gene_id" + } + + # Check gene annotation + if(!"gene_id" %in% colnames(fdata_file)) + stop("'gene_id' column with unique feature identifiers must be present in 'feature_annotation_path'.") + + # check if duplicated ids are present + if(any(duplicated(fdata_file\\$gene_id))) + stop("Duplicated features ids are not allowed. Please check the 'gene_id' column in 'feature_annotation_path'.") + + message("Importing sample annotation") + sample_file <- read.table(sample_annotation_path, header = TRUE, + sep = "\\\\t", quote = "\\\\"", + comment.char = "", stringsAsFactors = FALSE) + # Check sample annotation + check_sample_file_list <- check_sample_file(mapping_dir = mapping_dir, + sample_file = sample_file) + output <- "STARsolo" + measure <- "counts" + sample_expression_files <- check_sample_file_list\\$sample_expression_files + + ##### Create phenodata ##### + pdata_eset <- create_pdata(sample_file = sample_file, pool_name = pool_name, + barcodes = barcodes) + + ##### Create expression matrix ##### + message("Creating expression matrix") + + exprs_matrix_eset <- create_exprs_matrix( + exprs_matrix_path = exprs_matrix_path, + exprs_file_paths = sample_expression_files, + output = output, + measure = measure, + col_names = rownames(pdata_eset), + cell_barcodes = barcodes + ) + + + ##### Create featuredata ##### + message("Creating feature data") + + fdata_eset <- fdata_file + rownames(fdata_eset) <- fdata_eset[, "gene_id"] + + # intersect features between exprs matrix and fdata + feature_files <- match_features(exprs_matrix = exprs_matrix_eset, + fdata = fdata_eset) + + fdata_eset <- feature_files\\$fdata + exprs_matrix_eset <- feature_files\\$exprs_matrix + + ##### Create eSet ##### + message("Creating eset") + + if (nrow(pdata_eset) != ncol(exprs_matrix_eset)) { + stop("nrow(pData) and ncol(exprsMatrix) differ") + } + + if (nrow(fdata_eset) != nrow(exprs_matrix_eset)) { + stop("nrow(fData) and nrow(exprsMatrix) differ") + } + + if (!all(rownames(pdata_eset) == colnames(exprs_matrix_eset))) { + stop("rownames(pData) and colnames(exprsMatrix) differ") + } + + if (!all(rownames(fdata_eset) == rownames(exprs_matrix_eset))) { + stop("rownames(fData) and rownames(exprsMatrix) differ") + } + + if (!inherits(exprs_matrix_eset, "matrix")) { + stop("exprsMatrix must be of class 'matrix'") + } + + + + additional_info <- paste0("Additional information about eSet \\\\n", + " Expression matrix created from ", + output, " output. \\\\n", + " Expression matrix contains non-transformed ", + ifelse(output %in% c("STAR", "STARsolo"), + "counts", + ifelse(measure == "expected_count", + "counts", measure)), ".") + + + if (isTRUE(!is.null(add_eset_annotation) & + is.character(add_eset_annotation))) { + additional_info <- paste0(additional_info, "\\\\n", " ", add_eset_annotation) + } + + fdata_eset <- new("AnnotatedDataFrame", data = fdata_eset) + pdata_eset <- new("AnnotatedDataFrame", data = pdata_eset) + + requireNamespace("Biobase") + eset <- Biobase::ExpressionSet(assayData = exprs_matrix_eset, + phenoData = pdata_eset, + featureData = fdata_eset, + annotation = additional_info) + + eset <- eset[, colSums(exprs(eset)) != 0] + saveRDS(eset, file = output_path) + + message(paste0("eset created succesfully for ", ncol(eset), + " samples and ", nrow(eset), + " genes and saved at ", output_path, ".")) + + eset +} + + +p_data_file <- par\\$pDataFile +f_data_file <- par\\$fDataFile +pool_name <- par\\$poolName +mapping_dir <- lapply(par\\$mappingDir, + \\\\(x) file.path(x, "Solo.out", "Gene", "raw")) + +get_barcode_from_mapping_dir <- function(raw_dir) { + barcodes_file <- file.path(raw_dir, "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + raw_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", raw_dir)) + } + return(barcodes) +} + +barcodes <- lapply(mapping_dir, get_barcode_from_mapping_dir) + +print(paste0("mappingDir: ", mapping_dir)) +print(paste0("pDataFile: ", p_data_file)) +print(paste0("fDataFile: ", f_data_file)) +print(paste0("poolName: ", pool_name)) +print(paste0("barcodes: ", barcodes)) + + + +# CREATE ESET WITH RAW UMI COUNTS + +eset <- create_eset(feature_annotation_path = f_data_file, + sample_annotation_path = p_data_file, + mapping_dir = mapping_dir, + barcodes = barcodes, + output_path = par\\$output, + pool_name = pool_name, + path = NULL, + exprs_matrix_path = NULL) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/eset/create_eset", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/eset/create_eset/nextflow.config b/target/nextflow/eset/create_eset/nextflow.config new file mode 100644 index 00000000..97809f5c --- /dev/null +++ b/target/nextflow/eset/create_eset/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'eset/create_eset' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/eset/create_eset/nextflow_labels.config b/target/nextflow/eset/create_eset/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/eset/create_eset/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/eset/create_eset/nextflow_schema.json b/target/nextflow/eset/create_eset/nextflow_schema.json new file mode 100644 index 00000000..c0bc8910 --- /dev/null +++ b/target/nextflow/eset/create_eset/nextflow_schema.json @@ -0,0 +1,71 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "create_eset", + "description": "No description", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "pDataFile": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "fDataFile": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "mappingDir": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "poolName": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "output": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"eset.$id.rds\"`, direction: `output`. ", + "default": "eset.$id.rds" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/eset/create_fdata/.config.vsh.yaml b/target/nextflow/eset/create_fdata/.config.vsh.yaml new file mode 100644 index 00000000..3be26007 --- /dev/null +++ b/target/nextflow/eset/create_fdata/.config.vsh.yaml @@ -0,0 +1,236 @@ +name: "create_fdata" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--gtf" + description: "Genome annotation file in GTF format." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Tab-delimited text file containing information about the 'gene'\ + \ or 'transcript'\nentries from the input GTF file. The 'transcript' entries\ + \ are used in case the source\nof the GTF was 'refGene' or 'ncbiRefSeq'. \n" + info: null + default: + - "fData.$id.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "create_fdata.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a fdata file\n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test_annotation.gtf" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_fdata/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/eset/create_fdata" + executable: "target/nextflow/eset/create_fdata/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/eset/create_fdata/_viash.yaml b/target/nextflow/eset/create_fdata/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/eset/create_fdata/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/eset/create_fdata/main.nf b/target/nextflow/eset/create_fdata/main.nf new file mode 100644 index 00000000..c9fea6e9 --- /dev/null +++ b/target/nextflow/eset/create_fdata/main.nf @@ -0,0 +1,3994 @@ +// create_fdata updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Marijke Van Moerbeke (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "create_fdata", + "namespace" : "eset", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "contributor" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--gtf", + "description" : "Genome annotation file in GTF format.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Tab-delimited text file containing information about the 'gene' or 'transcript'\nentries from the input GTF file. The 'transcript' entries are used in case the source\nof the GTF was 'refGene' or 'ncbiRefSeq'. \n", + "default" : [ + "fData.$id.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "create_fdata.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Create a fdata file\n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "test_annotation.gtf" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pandas" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/eset/create_fdata/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/eset/create_fdata", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import logging +import pandas as pd +import numpy as np +from textwrap import fill + + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'gtf': $( if [ ! -z ${VIASH_PAR_GTF+x} ]; then echo "r'${VIASH_PAR_GTF//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def read_gtf(gtf_path: str) -> pd.DataFrame: + logger.info("Reading %s", gtf_path) + result = pd.read_csv(gtf_path, sep="\\\\t", + header=None, names=("seqname", "source", + "feature", "start", "end", + "score", "strand", "frame", + "attribute"), + dtype={ + "seqname": pd.StringDtype(), + "source": pd.StringDtype(), + "feature": pd.StringDtype(), + "start": pd.Int64Dtype(), + "end": pd.Int64Dtype(), + "score": pd.StringDtype(), + "strand": pd.CategoricalDtype(categories=["+", "-"], + ordered=False), + "frame": pd.StringDtype(), + "attribute": pd.StringDtype(), + }, + comment='#' + ) + logger.info("Done reading %s. Found %d GTF entries ", par["gtf"], result.shape[0]) + logger.info("GTF file is providing information for the following chromosomes: \\\\n%s", + fill(", ".join(result['seqname'].unique()), width=100)) + logger.info("The following sources were specified in the GTF file:\\\\n%s", + ", ".join(result["source"].unique())) + return result + + +def parse_attributes(attributes_series: pd.Series): + attribute_dict = dict() + attributes_list = [attr.strip().split(" ") + for attr in attributes_series["attribute"].strip(";").split(";")] + for (attr_name, attr_value) in attributes_list: + attribute_dict.setdefault(attr_name, []).append(attr_value.strip('"')) + attribute_dict = {attr_name: "|".join(attr_value) + for attr_name, attr_value in attribute_dict.items()} + return pd.Series(attribute_dict) + + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\\\\t{param}: {param_val}\\\\n' for param, param_val in par.items()] + logger.info("Parameters:\\\\n%s", "".join(parameters_str).rstrip()) + gtf_file = read_gtf(par["gtf"]) + sources = set(source for source in gtf_file["source"].unique() if source != "ERCC") + specific_gtf = False + feature = "gene" + if len(sources) == 1 and (source := sources[0]) \\\\ + and (source == "refGene" or source == "ncbiRefSeq"): + feature = "transcript" + specific_gtf = True + logger.info("Found specific GTF from %s, forcing filtering on feature type %s", source, feature) + logger.info("Filtering GTF entries for feature type '%s'.", feature) + gtf_file = gtf_file[gtf_file["feature"] == feature] + logger.info("After filtering %d entries are left.", gtf_file.shape[0]) + logger.info("Parsing the GTF attributes") + annotation = gtf_file[["attribute"]].apply(parse_attributes, result_type="expand", axis=1) + logger.info("Found the following attributes in the GTF:\\\\n%s", ", ".join(annotation.columns)) + annotation = pd.concat([gtf_file.drop(["attribute"], axis=1), annotation], axis=1) + if specific_gtf: + logger.info("Because the source of the GTF is either 'ncbiRefSeq' or 'refGene', which" + "caused forced filtering based on %s, the duplicate genes still need to be dropped.", + feature) + annotation = annotation.drop_duplicates(subset=("gene_id", "gene_name"), keep=False) + logger.info("After dropping duplicates, %d entries are left", annotation.shape[0]) + + # detect ensembl ids + # some GTF files contain version in ENSEMBL, e.g. ENS00000000046319.1 + # we remove the version, because the annotation packages don't contain the version + if "gene_id" in annotation.columns: + logger.info("'gene_id' column was detected in attributes. Performing extra parsing of ENSEMBL ids.") + annotation["ENSEMBL_with_version"] = annotation["gene_id"].where(annotation["gene_id"].str.startswith("ENS")) + annotation["ENSEMBL"] = annotation["ENSEMBL_with_version"].str.replace(r"\\\\.\\\\d+\\$", "", regex=True) + annotation["gene_id"] = annotation["gene_id"].str.replace(r"\\\\.\\\\d+\\$", "", regex=True) + + possible_name_columns = ("Name", "name", "gene_name") + found_columns = list(filter(lambda col_name: col_name in annotation, possible_name_columns)) + # The following code allows to select a value for the SYMBOL column based on the first non-na column + if found_columns: + logger.info("Found one the following columns: %s; which can be used to populate the SYMBOL column", + ", ".join(possible_name_columns)) + # For each row (gtf entry), get the name of the first column that actually holds a value. + column_to_get = annotation.loc[:,found_columns].apply(pd.Series.first_valid_index, axis=1) + counts_per_column = column_to_get.value_counts(dropna=False).to_dict() + counts_per_column_str = [f'\\\\t{col}: {counts}\\\\n' for col, counts in counts_per_column.items()] + logger.info("Frequencies of the origin for the entries in the SYMBOL column:\\\\n%s", + "".join(counts_per_column_str).rstrip()) + # If all columns hold NA for a certain row, first_valid_index will return None. + # Just use the name of the first column. + column_to_get = column_to_get.fillna(found_columns[0]) + # We now have a list one column name per row, use it so select the values + # Loc cannot be used here because 1 value per row is required, + # and loc will select for each row all the columns in columns_to_get + idx, cols = pd.factorize(column_to_get) + symbol_values = annotation.reindex(cols, axis=1).to_numpy()[np.arange(len(annotation)), idx] + annotation["SYMBOL"] = symbol_values + logger.info("Dropping unused columns") + annotation = annotation.drop(["score", "source", "frame", "feature"], axis=1) + logger.info("Looking for duplicate rows and removing them. Starting with %i entries", annotation.shape[0]) + annotation = annotation.drop_duplicates(keep="first", ignore_index=True) + logger.info("After removing duplicates: %i entries", annotation.shape[0]) + logger.info("Writing to %s", par["output"]) + annotation.to_csv(par["output"], sep="\\\\t", header=True, index=False, na_rep="NA") + # Do these checks *after* writing the csv in order to be able to check the data + logger.info("Checking for unique gene IDs") + if not annotation["gene_id"].is_unique: + raise ValueError("Values from the 'gene_id' column are not unique after processing!") + logger.info("%s finished", meta['name']) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/eset/create_fdata", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/eset/create_fdata/nextflow.config b/target/nextflow/eset/create_fdata/nextflow.config new file mode 100644 index 00000000..b4f32454 --- /dev/null +++ b/target/nextflow/eset/create_fdata/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'eset/create_fdata' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Create a fdata file\n' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/eset/create_fdata/nextflow_labels.config b/target/nextflow/eset/create_fdata/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/eset/create_fdata/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/eset/create_fdata/nextflow_schema.json b/target/nextflow/eset/create_fdata/nextflow_schema.json new file mode 100644 index 00000000..ccc97ab7 --- /dev/null +++ b/target/nextflow/eset/create_fdata/nextflow_schema.json @@ -0,0 +1,49 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "create_fdata", + "description": "Create a fdata file\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "gtf": { + "type": "string", + "format": "path", + "exists": true, + "description": "Genome annotation file in GTF format.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Tab-delimited text file containing information about the 'gene' or 'transcript'\nentries from the input GTF file", + "help_text": "Type: `file`, multiple: `False`, default: `\"fData.$id.txt\"`, direction: `output`. ", + "default": "fData.$id.txt" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/eset/create_pdata/.config.vsh.yaml b/target/nextflow/eset/create_pdata/.config.vsh.yaml new file mode 100644 index 00000000..b3dfc694 --- /dev/null +++ b/target/nextflow/eset/create_pdata/.config.vsh.yaml @@ -0,0 +1,250 @@ +name: "create_pdata" +namespace: "eset" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--star_stats_file" + description: "Tab-delimited text file containing statistics (per column) that\ + \ were generated\nfrom the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).\n\ + Each entry (row) in the file describes the values for one well (barcode).\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChromPool" + description: "Pivot table in tsv format of the combined nrReadsNrGenesPerChrom\ + \ files from STAR. \nDescribes per chromosome (as columns) the number of reads,\ + \ as well as the total number \nof reads per cell barcode and the percentage\ + \ of nuclear, ERCC and mitochondrial\nreads.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output" + info: null + default: + - "pData.$id.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "create_pdata.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a pdata file by combining the mapping statistics \n" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "nrReadsNrGenesPerChromPool.txt" +- type: "file" + path: "starLogs.txt" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/eset/create_pdata/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/eset/create_pdata" + executable: "target/nextflow/eset/create_pdata/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/eset/create_pdata/_viash.yaml b/target/nextflow/eset/create_pdata/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/eset/create_pdata/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/eset/create_pdata/main.nf b/target/nextflow/eset/create_pdata/main.nf new file mode 100644 index 00000000..3051e7b3 --- /dev/null +++ b/target/nextflow/eset/create_pdata/main.nf @@ -0,0 +1,3934 @@ +// create_pdata updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Marijke Van Moerbeke (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "create_pdata", + "namespace" : "eset", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "contributor" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--star_stats_file", + "description" : "Tab-delimited text file containing statistics (per column) that were generated\nfrom the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).\nEach entry (row) in the file describes the values for one well (barcode).\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChromPool", + "description" : "Pivot table in tsv format of the combined nrReadsNrGenesPerChrom files from STAR. \nDescribes per chromosome (as columns) the number of reads, as well as the total number \nof reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial\nreads.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "default" : [ + "pData.$id.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "create_pdata.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Create a pdata file by combining the mapping statistics \n", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "nrReadsNrGenesPerChromPool.txt" + }, + { + "type" : "file", + "path" : "starLogs.txt" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pandas" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/eset/create_pdata/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/eset/create_pdata", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +from itertools import batched +import pandas as pd +import logging + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'star_stats_file': $( if [ ! -z ${VIASH_PAR_STAR_STATS_FILE+x} ]; then echo "r'${VIASH_PAR_STAR_STATS_FILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'nrReadsNrGenesPerChromPool': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +def main(par): + logger.info(f"{meta['name']} started.") + parameters_str = [f'\\\\t{param}: {param_val}\\\\n' for param, param_val in par.items()] + logger.info("Parameters:\\\\n%s", "".join(parameters_str).rstrip()) + logger.info("Reading %s", par["star_stats_file"]) + star_log_stats = pd.read_csv(par["star_stats_file"], sep="\\\\t", index_col=0) + logger.info("STAR log statics file contains information for the following barcodes: %s", + ", ".join(star_log_stats.index)) + logger.info("Reading %s", par["nrReadsNrGenesPerChromPool"]) + reads_and_genes_per_chr_stats = pd.read_csv(par["nrReadsNrGenesPerChromPool"], sep="\\\\t", index_col=0) + logger.info("Reads per gene and chromosome table contains information for the following barcodes: %s", + ", ".join(reads_and_genes_per_chr_stats.index)) + logger.info("Filtering mapping statistics file columns.") + cols_to_keep = ("WellID", "NumberOfMTReads", "pctMT", "NumberOfERCCReads", + "pctERCC", "NumberOfChromReads", "pctChrom") + try: + reads_and_genes_per_chr_stats = reads_and_genes_per_chr_stats.loc[:,cols_to_keep] + except KeyError as e: + raise KeyError("When trying to subset the reads per genes and chromosomes file, " + "a column was missing. Available columns in the file: " + f"{', '.join(reads_and_genes_per_chr_stats.columns)}.") from e + # Each barcode should be present. An alternative approach could be to just + # do the concatenation and check for NA values that are filled for non-overlapping + # index values, but there are already NA values present in the dataframes + if not star_log_stats.index.sort_values().equals(reads_and_genes_per_chr_stats.index.sort_values()): + raise ValueError("Error while combining two log files. It seems that the entries (barcodes) " + f"do not fully overlap. Barcodes in '{par['star_stats_file']}: " + f"{', '.join(reads_and_genes_per_chr_stats.index)}. Barcodes in " + f"'{par['nrReadsNrGenesPerChromPool']}': " + f"{', '.join(star_log_stats.index)}") + combined_stats = pd.concat([reads_and_genes_per_chr_stats, star_log_stats], axis=1) + logger.info("Summary of final output:\\\\n%s\\\\n", + "\\\\n".join(repr(combined_stats.loc[:,columns].describe()) + for columns in batched(combined_stats.columns, 3))) + logger.info("Writing to %s", par["output"]) + combined_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True, index=False) + logger.info("Finished %s.", meta["name"]) + + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/eset/create_pdata", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/eset/create_pdata/nextflow.config b/target/nextflow/eset/create_pdata/nextflow.config new file mode 100644 index 00000000..94dfb883 --- /dev/null +++ b/target/nextflow/eset/create_pdata/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'eset/create_pdata' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Create a pdata file by combining the mapping statistics \n' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/eset/create_pdata/nextflow_labels.config b/target/nextflow/eset/create_pdata/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/eset/create_pdata/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/eset/create_pdata/nextflow_schema.json b/target/nextflow/eset/create_pdata/nextflow_schema.json new file mode 100644 index 00000000..e28629e6 --- /dev/null +++ b/target/nextflow/eset/create_pdata/nextflow_schema.json @@ -0,0 +1,56 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "create_pdata", + "description": "Create a pdata file by combining the mapping statistics \n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "star_stats_file": { + "type": "string", + "format": "path", + "exists": true, + "description": "Tab-delimited text file containing statistics (per column) that were generated\nfrom the STAR log files (Log.final.out, Summary.csv, ReadsPerGene.out.tab).\nEach entry (row) in the file describes the values for one well (barcode).\n", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "nrReadsNrGenesPerChromPool": { + "type": "string", + "format": "path", + "exists": true, + "description": "Pivot table in tsv format of the combined nrReadsNrGenesPerChrom files from STAR", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"pData.$id.txt\"`, direction: `output`. ", + "default": "pData.$id.txt" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml b/target/nextflow/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml new file mode 100644 index 00000000..97b490d8 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/.config.vsh.yaml @@ -0,0 +1,208 @@ +name: "check_eset" +namespace: "integration_test_components/htrnaseq" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--eset" + description: "Path to an ExpressionSet object." + info: null + example: + - "eset.rds" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_output" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "This component test the ExpressionSet object as output by the main pipeline." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "bioconductor/bioconductor_docker:3.19" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "r" + cran: + - "bit64" + bioc: + - "Biobase" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integration_test_components/htrnaseq/check_eset" + executable: "target/nextflow/integration_test_components/htrnaseq/check_eset/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/_viash.yaml b/target/nextflow/integration_test_components/htrnaseq/check_eset/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/main.nf b/target/nextflow/integration_test_components/htrnaseq/check_eset/main.nf new file mode 100644 index 00000000..a352a8d4 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/main.nf @@ -0,0 +1,4028 @@ +// check_eset updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "check_eset", + "namespace" : "integration_test_components/htrnaseq", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--eset", + "description" : "Path to an ExpressionSet object.", + "example" : [ + "eset.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_output", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "This component test the ExpressionSet object as output by the main pipeline.", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "bioconductor/bioconductor_docker:3.19", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "r", + "cran" : [ + "bit64" + ], + "bioc" : [ + "Biobase" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integration_test_components/htrnaseq/check_eset/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/integration_test_components/htrnaseq/check_eset", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "eset" = $( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_ESET" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "star_output" = $( if [ ! -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_STAR_OUTPUT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END +library(Biobase) +library(testthat) +library(Matrix) + +sample_1_result <- readRDS(par\\$eset) +expected_sample_names <- c( + "sample_one_AACAAGGTAC", "sample_one_AACAATCAGG", "sample_one_AACACCTAGT", + "sample_one_AACAGGCAAT", "sample_one_AACATGGAGA", "sample_one_AACATTACCG", + "sample_one_AACCAGCCAG", "sample_one_AACCAGTTGA", "sample_one_AACCGCGACT", + "sample_one_AACCGGAAGG", "sample_one_AACCGGCGTA", "sample_one_AACCTAGTCC", + "sample_one_AACCTCATAG", "sample_one_AACGTAAGCT", "sample_one_AACTCTACAC", + "sample_one_AACTGTGTCA", "sample_one_AAGACGGATT", "sample_one_AAGATCGGCG", + "sample_one_AAGATGTCCA", "sample_one_AAGCATATGG", "sample_one_AAGCGATGTT", + "sample_one_AAGCGTTCAG", "sample_one_AAGCTCACCT", "sample_one_AAGGCATGCG", + "sample_one_AAGGTCTGGA", "sample_one_AAGTTAGCGC", "sample_one_AAGTTCCTTG", + "sample_one_AATACCGGTA", "sample_one_AATAGCCACA", "sample_one_AATCACGCGA", + "sample_one_AATCCATCTG", "sample_one_AATCCGCTCC", "sample_one_AATCCTACCA", + "sample_one_AATCGTCCGC", "sample_one_AATGAACACG", "sample_one_AATGACCTTC", + "sample_one_AATGAGAGCA", "sample_one_AATGTCAGTG", "sample_one_AATTAGGCCG", + "sample_one_AATTGCGATG", "sample_one_ACAACAGTCG", "sample_one_ACAACCATAC", + "sample_one_ACAACGGAGC", "sample_one_ACAAGCGCGA", "sample_one_ACACAATCTC", + "sample_one_ACACAGTGAA", "sample_one_ACACCGAATT", "sample_one_ACACGCAGTA", + "sample_one_ACACGGTCCT", "sample_one_ACACTTGCTG", "sample_one_ACAGTGCCAA", + "sample_one_ACATGTGTGC", "sample_one_ACCAGGACCA", "sample_one_ACCATAACAC", + "sample_one_ACCGAACCGT", "sample_one_ACCGAGAGTC", "sample_one_ACCGGTACAG", + "sample_one_ACCGTACTTC", "sample_one_ACCTCCGACA", "sample_one_ACCTCTCTCC", + "sample_one_ACCTGTCCGA", "sample_one_ACCTTATGTG", "sample_one_ACGAATGACA", + "sample_one_ACGCCTCAAC", "sample_one_ACGCCTTCGT", "sample_one_ACGCTGGATA", + "sample_one_ACGGTCCGTT", "sample_one_ACGTAGGCAC", "sample_one_ACGTGCTGAT", + "sample_one_ACTCCAAGCC", "sample_one_ACTGGCGCAT", "sample_one_ACTGGCTTCC", + "sample_one_ACTTAACTGC", "sample_one_ACTTCATCAC", "sample_one_ACTTCGTTGA", + "sample_one_ACTTCTCCTG", "sample_one_ACTTGAGGAA", "sample_one_ACTTGTAAGG", + "sample_one_AGAACCACGG", "sample_one_AGAAGCAATC", "sample_one_AGACCGTTAT", + "sample_one_AGACTAGCAT", "sample_one_AGAGATGCAG", "sample_one_AGAGCTTACA", + "sample_one_AGAGTGTAAC", "sample_one_AGAGTTCTGC", "sample_one_AGATAGTGCT", + "sample_one_AGCAATGCGC", "sample_one_AGCATGTCAT", "sample_one_AGCCACTAGC", + "sample_one_AGCCAGAATA", "sample_one_AGCCAGCTCT", "sample_one_AGCGATAACG", + "sample_one_AGCGTACAAT", "sample_one_AGCTATTCCA", "sample_one_AGCTCCTCAG", + "sample_one_AGGAGGCATA", "sample_one_AGGCGTCTGT", "sample_one_AGTAACTCAC", + "sample_one_AGTAAGCGTT", "sample_one_AGTCTGTACG", "sample_one_AGTGCAATGT", + "sample_one_ATAAGGTGCA", "sample_one_ATACACGACA", "sample_one_ATAGGCCATT", + "sample_one_ATATCCGCAT", "sample_one_ATCAGCACTT", "sample_one_ATCAGCGAGG", + "sample_one_ATCCAATACG", "sample_one_ATCCGCTGTG", "sample_one_ATCCGTCCAT", + "sample_one_ATCGACGGCT", "sample_one_ATCGCGATTA", "sample_one_ATCGGTAGGC", + "sample_one_ATCTAAGGAG", "sample_one_ATGACGGTAA", "sample_one_ATGACTCAGT", + "sample_one_ATGCACCGGA", "sample_one_ATGCGGACTG", "sample_one_ATGCTTCCTA", + "sample_one_ATGGACCAAC", "sample_one_ATGGTCTTAG", "sample_one_ATGGTGAGCG", + "sample_one_ATGTGGAAGC", "sample_one_ATTATCGGAC", "sample_one_ATTCGGAACA", + "sample_one_CAACAATCCA", "sample_one_CAAGAAGCAT", "sample_one_CAAGATGAGG", + "sample_one_CAAGCCAACG", "sample_one_CAAGTGGATC", "sample_one_CACAGTTCAT", + "sample_one_CACGAGTCTG", "sample_one_CACGCTCCAA", "sample_one_CACTGAGCAC", + "sample_one_CAGATCAATG", "sample_one_CAGTGCTCTT", "sample_one_CAGTTAAGCA", + "sample_one_CATAGCTATC", "sample_one_CATCACCACC", "sample_one_CATGTACGCC", + "sample_one_CATTACACTG", "sample_one_CATTCGACGA", "sample_one_CCAACTATGG", + "sample_one_CCAAGGAGTT", "sample_one_CCAATTGTTC", "sample_one_CCACAAGTGC", + "sample_one_CCAGCTTAGT", "sample_one_CCATAACTTG", "sample_one_CCATACTGAC", + "sample_one_CCATAGATCA", "sample_one_CCATGTGCTT", "sample_one_CCATTCAGCG", + "sample_one_CCGAACAAGC", "sample_one_CCGAACCTAA", "sample_one_CCGAAGACCT", + "sample_one_CCGAATAGTG", "sample_one_CCGACTTCTC", "sample_one_CCGATCCACT", + "sample_one_CCGATGATAC", "sample_one_CCGCGTTATG", "sample_one_CCGCTAGCTT", + "sample_one_CCGGAGTATC", "sample_one_CCGGCCAATT", "sample_one_CCGGTCTCTA", + "sample_one_CCGTACGATG", "sample_one_CCGTCAGAAC", "sample_one_CCTAGACACG", + "sample_one_CCTAGTTGAG", "sample_one_CCTATTCTGT", "sample_one_CCTCAACCGA", + "sample_one_CCTCCATAAG", "sample_one_CCTGATGCCA", "sample_one_CCTGCAATAC", + "sample_one_CCTTGTATTC", "sample_one_CGAGATCTCT", "sample_one_CGAGGAACAA", + "sample_one_CGATAACCGC", "sample_one_CGATCCTGTG", "sample_one_CGCCAACCAT", + "sample_one_CGCCAGTGTT", "sample_one_CGCCTTGTAC", "sample_one_CGCGGATTCA", + "sample_one_CGCTTAAGGC", "sample_one_CGCTTACTAA", "sample_one_CGCTTCTTGG", + "sample_one_CGGAAGCTGT", "sample_one_CGGAATACAC", "sample_one_CGGAGATTGG", + "sample_one_CGGAGCTCAA", "sample_one_CGGATCGGTA", "sample_one_CGGATTCTAG", + "sample_one_CGGCAACTTA", "sample_one_CGGCTCATCA", "sample_one_CGGTCGTATT", + "sample_one_CGGTGACATC", "sample_one_CGTAACGGAT", "sample_one_CGTAAGATTC", + "sample_one_CGTACTGTAA", "sample_one_CGTAGAAGAC", "sample_one_CGTCCTAGGA", + "sample_one_CGTCGGCAAT", "sample_one_CGTGAGTTAT", "sample_one_CGTGTCAAGC", + "sample_one_CTAACTTCAG", "sample_one_CTAATAGCGT", "sample_one_CTACACCAGG", + "sample_one_CTAGCACAAT", "sample_one_CTATGAACGG", "sample_one_CTCAAGGACC", + "sample_one_CTCACCTGTC", "sample_one_CTCCTATTGT", "sample_one_CTCGCAACGT", + "sample_one_CTCGTGCCTA", "sample_one_CTGGATTGAC", "sample_one_CTGTAGTCAG", + "sample_one_CTGTCGCTTC", "sample_one_CTGTCTGTGT", "sample_one_CTTCATATCG", + "sample_one_CTTGCTGACG", "sample_one_GAAGGATTAG", "sample_one_GAATCGAGCC", + "sample_one_GACCATCTAA", "sample_one_GACGACCACA", "sample_one_GAGACATCTT", + "sample_one_GAGCGAGTCA", "sample_one_GAGTAGACCA", "sample_one_GATACGCTTA", + "sample_one_GATAGACTGT", "sample_one_GATAGAGGCG", "sample_one_GATAGGTCAA", + "sample_one_GATATCAGGA", "sample_one_GATCTCATTC", "sample_one_GATCTGGTCG", + "sample_one_GATGAGTGAC", "sample_one_GATGGATACA", "sample_one_GATGTGACAG", + "sample_one_GATTAAGTCC", "sample_one_GATTGCACGC", "sample_one_GCAAGCGAAT", + "sample_one_GCAATGTAAG", "sample_one_GCACACTATA", "sample_one_GCACTCGGAA", + "sample_one_GCACTGCGTT", "sample_one_GCACTTAATC", "sample_one_GCAGGAGATG", + "sample_one_GCAGTACTGG", "sample_one_GCATATGAGT", "sample_one_GCATCCGATC", + "sample_one_GCCAAGTACA", "sample_one_GCCACGATTC", "sample_one_GCCATAGGTT", + "sample_one_GCCATATCGA", "sample_one_GCCGTCAATA", "sample_one_GCCTGGACAT", + "sample_one_GCGTAATTAC", "sample_one_GCTATTATCC", "sample_one_GCTCAGTAAT", + "sample_one_GCTGCTTATA", "sample_one_GGAATAAGCA", "sample_one_GGACGATGCT", + "sample_one_GGCATCGTGA", "sample_one_GGCATTATTG", "sample_one_GGCCGAGATT", + "sample_one_GGCGCTATAA", "sample_one_GGCGTTAAGT", "sample_one_GGCTATTGAT", + "sample_one_GGCTGCTACT", "sample_one_GGTAATGTGT", "sample_one_GGTGGTTGGA", + "sample_one_GGTGTTCACC", "sample_one_GGTTAGATCT", "sample_one_GGTTATGGCG", + "sample_one_GGTTCACTGG", "sample_one_GGTTGTGCAA", "sample_one_GTAACCAGTA", + "sample_one_GTAACCTTGG", "sample_one_GTAAGAACCT", "sample_one_GTAAGGCTCC", + "sample_one_GTAATCCACG", "sample_one_GTATTGTGGA", "sample_one_GTCCGCATCA", + "sample_one_GTCCTTCGGT", "sample_one_GTCGCTCTCT", "sample_one_GTCGGTGACA", + "sample_one_GTCTCGAGTG", "sample_one_GTCTCTTAAG", "sample_one_GTCTTCCGAG", + "sample_one_GTGACTATAC", "sample_one_GTGGTTAATG", "sample_one_GTGTGCCTGT", + "sample_one_GTGTGTGTCC", "sample_one_GTTCATTGCC", "sample_one_GTTCCGGTGA", + "sample_one_GTTCGTCGAA", "sample_one_GTTGAATTGG", "sample_one_GTTGATCCGC", + "sample_one_GTTGTATGCT", "sample_one_TAACCGTAGC", "sample_one_TAACGTCGAT", + "sample_one_TAAGGTACGG", "sample_one_TACGGACATA", "sample_one_TACTACCGCC", + "sample_one_TACTGTCAAG", "sample_one_TAGCGAACGC", "sample_one_TAGCGCCAAC", + "sample_one_TAGGACGCCT", "sample_one_TAGGTTGCAA", "sample_one_TAGTAGTCTC", + "sample_one_TAGTCCGCTG", "sample_one_TAGTGGAACT", "sample_one_TATCATGCAG", + "sample_one_TATCGTTACG", "sample_one_TCAAGTGCAG", "sample_one_TCACAGATAC", + "sample_one_TCACCGCCTA", "sample_one_TCACGCCACT", "sample_one_TCACGTTGGC", + "sample_one_TCATTGTCCA", "sample_one_TCCACACTAG", "sample_one_TCCACGGTCA", + "sample_one_TCCACTCGCT", "sample_one_TCCGACTAAC", "sample_one_TCCGTTATCT", + "sample_one_TCCTAAGAGA", "sample_one_TCCTCTAGTA", "sample_one_TCGAAGCATT", + "sample_one_TCGAGAGAGC", "sample_one_TCGCACTTGA", "sample_one_TCGCCTACTG", + "sample_one_TCGCGTAGCA", "sample_one_TCGGCGTTAA", "sample_one_TCTACATCCG", + "sample_one_TCTCCACATT", "sample_one_TCTCTCCTAT", "sample_one_TCTTGCTCGG", + "sample_one_TGAACTAACC", "sample_one_TGAAGAAGGT", "sample_one_TGAGCGTTCC", + "sample_one_TGAGTACGTA", "sample_one_TGGAATGGAG", "sample_one_TGTCATTCGC", + "sample_one_TGTGCTTCAG", "sample_one_TGTTCAGGAT", "sample_one_TTACACACGT", + "sample_one_TTACTGTGAC", "sample_one_TTATAGGAGG", "sample_one_TTATCGCGTT", + "sample_one_TTATGCCGCG", "sample_one_TTCACGGAAG", "sample_one_TTCAGGAGTA", + "sample_one_TTCCATCGAG", "sample_one_TTCGAGTGAT", "sample_one_TTCTGTACCT", + "sample_one_TTGGCAATTC", "sample_one_TTGGCTCCAC", "sample_one_TTGGTAACAG", + "sample_one_TTGGTCAGTA", "sample_one_TTGTCGGCCA", "sample_one_TTGTGTTCGA" +) +stopifnot(identical(sampleNames(sample_1_result), expected_sample_names)) + +expected_var_labels <- c( + "WellBC", + "WellID", + "NumberOfMTReads", + "pctMT", + "NumberOfERCCReads", + "pctERCC", + "NumberOfChromReads", + "pctChrom", + "NumberOfInputReads", + "NumberOfMappedReads", + "PctMappedReads", + "NumberOfReadsMappedToMultipleLoci", + "PectOfReadsMappedToMultipleLoci", + "NumberOfReadsMappedToTooManyLoci", + "PectOfReadsMappedToTooManyLoci", + "NumberOfReadsUnmappedTooManyMismatches", + "PectOfReadsUnmappedTooManyMismatches", + "NumberOfReadsUnmappedTooShort", + "PectOfReadsUnmappedTooShort", + "NumberOfReadsUnmappedOther", + "PectOfReadsUnmappedOther", + "ReadsWithValidBarcodes", + "SequencingSaturation", + "Q30BasesInCB.UMI", + "ReadsMappedToTranscriptome.Unique.MultipeGenes", + "EstimatedNumberOfCells", + "FractionOfReadsInCells", + "MeanReadsPerCell", + "NumberOfUMIs", + "NumberOfGenes", + "NumberOfCountedReads", + "PoolName" +) +stopifnot(identical(varLabels(sample_1_result), expected_var_labels)) + +read_mm <- function(mapping_dir) { + market_matrix_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "matrix.mtx") + result <- readMM(market_matrix_file) + feature_file <- file.path(mapping_dir, "Solo.out", + "Gene", "raw", "features.tsv") + features <- read.table(feature_file, sep = "\\\\t", header = FALSE, + col.names = c("ID", "Name", "Type"))\\$ID + rownames(result) <- gsub("\\\\\\\\.\\\\\\\\d+\\$", "", features) + barcodes_file <- file.path(mapping_dir, + "Solo.out", "Gene", "raw", "barcodes.tsv") + if (!file.exists(barcodes_file)) { + stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ", + mapping_dir, " to contain a 'barcodes.tsv' file.")) + } + barcodes <- readLines(barcodes_file) + if (length(barcodes) != 1) { + stop(paste0("A single STAR Solo folder should only have ", + "mapped one (1) barcode, but found '", + length(barcodes), "'for mapping directory ", mapping_dir)) + } + colnames(result) <- paste0("sample_one_", barcodes) + return(result) +} +expected_matrices <- lapply(par\\$star_output, read_mm) +expected_matrix <- as.matrix(do.call(cbind, expected_matrices)) +result_counts <- exprs(sample_1_result) +stopifnot(length(setdiff(colnames(expected_matrix), + colnames(exprs(sample_1_result)))) == 0) +stopifnot(length(setdiff(rownames(expected_matrix), + rownames(exprs(sample_1_result)))) == 0) +expected_matrix_sorted <- expected_matrix[, colnames(exprs(sample_1_result))] +stopifnot(identical(exprs(sample_1_result), expected_matrix_sorted)) +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/integration_test_components/htrnaseq/check_eset", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow.config b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow.config new file mode 100644 index 00000000..ed87d643 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integration_test_components/htrnaseq/check_eset' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'This component test the ExpressionSet object as output by the main pipeline.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_labels.config b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_schema.json b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_schema.json new file mode 100644 index 00000000..976505a1 --- /dev/null +++ b/target/nextflow/integration_test_components/htrnaseq/check_eset/nextflow_schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "check_eset", + "description": "This component test the ExpressionSet object as output by the main pipeline.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "eset": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to an ExpressionSet object.", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"eset.rds\"`. " + }, + "star_output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml new file mode 100644 index 00000000..085463ff --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/.config.vsh.yaml @@ -0,0 +1,217 @@ +name: "check_cutadapt_output" +namespace: "integration_test_components/well_demultiplexing" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Inputs" + arguments: + - type: "file" + name: "--fastq_r1" + description: "Path to the forward reads to test." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--fastq_r2" + description: "Path to the reverse reads to test." + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--ids" + description: "Well IDs for the corresponding fastq input" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "This component test the cutadapt output from the well_demultiplex subworkflow." +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "dnaio" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output" + executable: "target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/main.nf b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/main.nf new file mode 100644 index 00000000..8bd45994 --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/main.nf @@ -0,0 +1,3908 @@ +// check_cutadapt_output updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "check_cutadapt_output", + "namespace" : "integration_test_components/well_demultiplexing", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "file", + "name" : "--fastq_r1", + "description" : "Path to the forward reads to test.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--fastq_r2", + "description" : "Path to the reverse reads to test.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--ids", + "description" : "Well IDs for the corresponding fastq input", + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "This component test the cutadapt output from the well_demultiplex subworkflow.", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "dnaio" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/integration_test_components/well_demultiplexing/check_cutatapt_output/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import dnaio +from operator import itemgetter +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'fastq_r1': $( if [ ! -z ${VIASH_PAR_FASTQ_R1+x} ]; then echo "r'${VIASH_PAR_FASTQ_R1//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'fastq_r2': $( if [ ! -z ${VIASH_PAR_FASTQ_R2+x} ]; then echo "r'${VIASH_PAR_FASTQ_R2//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'ids': $( if [ ! -z ${VIASH_PAR_IDS+x} ]; then echo "r'${VIASH_PAR_IDS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +def assert_number_of_reads(reads): + expected_number_of_reads = { + "SRR14730301__A1": 165, + "SRR14730301__B1": 194, + "SRR14730302__A1": 141, + "SRR14730302__B1": 213, + "SRR14730302__unknown": 99646, + "SRR14730301__unknown": 99641, + } + for input_id, expected_reads in expected_number_of_reads.items(): + num_reads = len(reads[input_id]) + assert num_reads == expected_reads, \\\\ + f"Expected number of ouput reads for {input_id} to be {expected_reads}, was {num_reads}." + + +def string_difference(string1, string2): + result = 0 + for char1, char2 in zip(string1, string2, strict=True): + if char1.lower() != char2.lower(): + result += 1 + return result + + +def assert_barcodes_not_removed(reads): + barcodes = { + "SRR14730301__A1": "ACACCGAATT", + "SRR14730302__A1": "ACACCGAATT", + "SRR14730301__B1": "GGCTATTGAT", + "SRR14730302__B1": "GGCTATTGAT" + } + for sample_id, barcode in barcodes.items(): + sample_reads = reads[sample_id] + forward_reads = map(itemgetter(0), sample_reads) + for i, forward_read in enumerate(forward_reads): + read_sequence = forward_read.sequence + read_barcode_start = read_sequence[: len(barcode)] + # A 10% difference is allowed. + assert string_difference(read_barcode_start, barcode) <= (0.1 * len(barcode)), \\\\ + (f"Expected barcode {barcode} to be present for sample {sample_id} " + f"in read {i}. Found {read_barcode_start}") + +def create_input_mapping(sample_ids, inputs_r1, inputs_r2): + return {sample_id: [input_r1, input_r2] + for sample_id, input_r1, input_r2 + in zip(sample_ids, inputs_r1, inputs_r2, strict=True)} + +def read_input_files(input_mapping): + expected_keys = {"SRR14730301__A1", "SRR14730301__B1", + "SRR14730302__A1", "SRR14730302__B1", + "SRR14730301__unknown", "SRR14730302__unknown"} + difference = set(input_mapping.keys()) - expected_keys + assert not difference, f"Found unexpected output id(s): {difference}" + result = {} + for input_id, input_files in input_mapping.items(): + input_r1, input_r2 = input_files + # This reads the files into memory, + # but they are reasonably small + with dnaio.open(input_r1) as r1_reads, dnaio.open(input_r2) as r2_reads: + for r1_read, r2_read in zip(r1_reads, r2_reads, strict=True): + result.setdefault(input_id, []).append((r1_read, r2_read)) + return result + + +def main(par): + inputs = create_input_mapping(par["ids"], par["fastq_r1"], par["fastq_r2"]) + reads = read_input_files(inputs) + assert_number_of_reads(reads) + assert_barcodes_not_removed(reads) + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/integration_test_components/well_demultiplexing/check_cutadapt_output", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow.config b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow.config new file mode 100644 index 00000000..acf1a084 --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'integration_test_components/well_demultiplexing/check_cutadapt_output' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'This component test the cutadapt output from the well_demultiplex subworkflow.' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_schema.json b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_schema.json new file mode 100644 index 00000000..80233fb5 --- /dev/null +++ b/target/nextflow/integration_test_components/well_demultiplexing/check_cutadapt_output/nextflow_schema.json @@ -0,0 +1,63 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "check_cutadapt_output", + "description": "This component test the cutadapt output from the well_demultiplex subworkflow.", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "fastq_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Path to the forward reads to test.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "fastq_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Path to the reverse reads to test.", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "ids": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Well IDs for the corresponding fastq input", + "help_text": "Type: `string`, multiple: `True`, required. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/io/publish_fastqs/.config.vsh.yaml b/target/nextflow/io/publish_fastqs/.config.vsh.yaml new file mode 100644 index 00000000..dc270d70 --- /dev/null +++ b/target/nextflow/io/publish_fastqs/.config.vsh.yaml @@ -0,0 +1,192 @@ +name: "publish_fastqs" +namespace: "io" +version: "updatecraftbox" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input" + description: "Directory to write fastq data to" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output" + info: null + default: + - "$id" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "code.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Publish the fastq files per well" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/io/publish_fastqs/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/io/publish_fastqs" + executable: "target/nextflow/io/publish_fastqs/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/io/publish_fastqs/_viash.yaml b/target/nextflow/io/publish_fastqs/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/io/publish_fastqs/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/io/publish_fastqs/main.nf b/target/nextflow/io/publish_fastqs/main.nf new file mode 100644 index 00000000..5737aba0 --- /dev/null +++ b/target/nextflow/io/publish_fastqs/main.nf @@ -0,0 +1,3804 @@ +// publish_fastqs updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "publish_fastqs", + "namespace" : "io", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Directory to write fastq data to", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "default" : [ + "$id" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "./code.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Publish the fastq files per well", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "debian:stable-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/io/publish_fastqs/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/io/publish_fastqs", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "${VIASH_PAR_INPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input='&'#" ; else echo "# par_input="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +echo "Publishing \\$par_input -> \\$par_output" + +echo +echo "Creating directory if it does not exist:" +mkdir -p "\\$par_output" && echo "\\$par_output created" + +echo +echo "Copying files..." +IFS=";" read -ra input <<<\\$par_input + +for i in "\\${input[@]}"; do + cp -rL "\\$i" "\\$par_output/" +done +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/io/publish_fastqs", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/io/publish_fastqs/nextflow.config b/target/nextflow/io/publish_fastqs/nextflow.config new file mode 100644 index 00000000..52903e1d --- /dev/null +++ b/target/nextflow/io/publish_fastqs/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'io/publish_fastqs' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Publish the fastq files per well' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/io/publish_fastqs/nextflow_labels.config b/target/nextflow/io/publish_fastqs/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/io/publish_fastqs/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/io/publish_fastqs/nextflow_schema.json b/target/nextflow/io/publish_fastqs/nextflow_schema.json new file mode 100644 index 00000000..0150c949 --- /dev/null +++ b/target/nextflow/io/publish_fastqs/nextflow_schema.json @@ -0,0 +1,62 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "publish_fastqs", + "description": "Publish the fastq files per well", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Directory to write fastq data to", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id\"`, direction: `output`. ", + "default": "$id" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/io/publish_results/.config.vsh.yaml b/target/nextflow/io/publish_results/.config.vsh.yaml new file mode 100644 index 00000000..fcd78a47 --- /dev/null +++ b/target/nextflow/io/publish_results/.config.vsh.yaml @@ -0,0 +1,332 @@ +name: "publish_results" +namespace: "io" +version: "updatecraftbox" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--star_output" + description: "Output from mapping with STAR" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--eset" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--f_data" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--p_data" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--html_report" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--run_params" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output directory" + description: "Determines the name of output directories\n" + arguments: + - type: "file" + name: "--star_output_dir" + info: null + default: + - "star_output" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom_dir" + info: null + default: + - "nrReadsNrGenesPerChrom" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics_dir" + info: null + default: + - "starLogs" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--eset_dir" + info: null + default: + - "esets" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--f_data_dir" + info: null + default: + - "fData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--p_data_dir" + info: null + default: + - "pData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Output file arguments" + description: "Determines the name of output files" + arguments: + - type: "file" + name: "--run_params_output" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--html_report_output" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "code.sh" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Publish the results" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/io/publish_results/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/io/publish_results" + executable: "target/nextflow/io/publish_results/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/io/publish_results/_viash.yaml b/target/nextflow/io/publish_results/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/io/publish_results/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/io/publish_results/main.nf b/target/nextflow/io/publish_results/main.nf new file mode 100644 index 00000000..f541a4e5 --- /dev/null +++ b/target/nextflow/io/publish_results/main.nf @@ -0,0 +1,4058 @@ +// publish_results updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "publish_results", + "namespace" : "io", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--star_output", + "description" : "Output from mapping with STAR", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_qc_metrics", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--eset", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--f_data", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--p_data", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--html_report", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--run_params", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output directory", + "description" : "Determines the name of output directories\n", + "arguments" : [ + { + "type" : "file", + "name" : "--star_output_dir", + "default" : [ + "star_output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom_dir", + "default" : [ + "nrReadsNrGenesPerChrom" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_qc_metrics_dir", + "default" : [ + "starLogs" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--eset_dir", + "default" : [ + "esets" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--f_data_dir", + "default" : [ + "fData" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--p_data_dir", + "default" : [ + "pData" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output file arguments", + "description" : "Determines the name of output files", + "arguments" : [ + { + "type" : "file", + "name" : "--run_params_output", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--html_report_output", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "./code.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Publish the results", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "debian:stable-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/io/publish_results/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/io/publish_results", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_STAR_OUTPUT+x} ]; then echo "${VIASH_PAR_STAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_star_output='&'#" ; else echo "# par_star_output="; fi ) +$( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "${VIASH_PAR_NRREADSNRGENESPERCHROM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_nrReadsNrGenesPerChrom='&'#" ; else echo "# par_nrReadsNrGenesPerChrom="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_QC_METRICS+x} ]; then echo "${VIASH_PAR_STAR_QC_METRICS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_star_qc_metrics='&'#" ; else echo "# par_star_qc_metrics="; fi ) +$( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo "${VIASH_PAR_ESET}" | sed "s#'#'\\"'\\"'#g;s#.*#par_eset='&'#" ; else echo "# par_eset="; fi ) +$( if [ ! -z ${VIASH_PAR_F_DATA+x} ]; then echo "${VIASH_PAR_F_DATA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_f_data='&'#" ; else echo "# par_f_data="; fi ) +$( if [ ! -z ${VIASH_PAR_P_DATA+x} ]; then echo "${VIASH_PAR_P_DATA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_p_data='&'#" ; else echo "# par_p_data="; fi ) +$( if [ ! -z ${VIASH_PAR_HTML_REPORT+x} ]; then echo "${VIASH_PAR_HTML_REPORT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_html_report='&'#" ; else echo "# par_html_report="; fi ) +$( if [ ! -z ${VIASH_PAR_RUN_PARAMS+x} ]; then echo "${VIASH_PAR_RUN_PARAMS}" | sed "s#'#'\\"'\\"'#g;s#.*#par_run_params='&'#" ; else echo "# par_run_params="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_OUTPUT_DIR+x} ]; then echo "${VIASH_PAR_STAR_OUTPUT_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_star_output_dir='&'#" ; else echo "# par_star_output_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM_DIR+x} ]; then echo "${VIASH_PAR_NRREADSNRGENESPERCHROM_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_nrReadsNrGenesPerChrom_dir='&'#" ; else echo "# par_nrReadsNrGenesPerChrom_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_STAR_QC_METRICS_DIR+x} ]; then echo "${VIASH_PAR_STAR_QC_METRICS_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_star_qc_metrics_dir='&'#" ; else echo "# par_star_qc_metrics_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_ESET_DIR+x} ]; then echo "${VIASH_PAR_ESET_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_eset_dir='&'#" ; else echo "# par_eset_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_F_DATA_DIR+x} ]; then echo "${VIASH_PAR_F_DATA_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_f_data_dir='&'#" ; else echo "# par_f_data_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_P_DATA_DIR+x} ]; then echo "${VIASH_PAR_P_DATA_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_p_data_dir='&'#" ; else echo "# par_p_data_dir="; fi ) +$( if [ ! -z ${VIASH_PAR_RUN_PARAMS_OUTPUT+x} ]; then echo "${VIASH_PAR_RUN_PARAMS_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_run_params_output='&'#" ; else echo "# par_run_params_output="; fi ) +$( if [ ! -z ${VIASH_PAR_HTML_REPORT_OUTPUT+x} ]; then echo "${VIASH_PAR_HTML_REPORT_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_html_report_output='&'#" ; else echo "# par_html_report_output="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END +#!/bin/bash + +set -eo pipefail + +echo "Publishing results to multiple output directories" + +# Create output directories for multiple files +echo "Creating output directories..." + +path_pars=( + par_star_output_dir + par_nrReadsNrGenesPerChrom_dir + par_star_qc_metrics_dir + par_eset_dir + par_f_data_dir + par_p_data_dir + par_html_report_output + par_run_params_output +) + +for par in \\${path_pars[@]}; do + curr_val="\\${!par}" + new_value=\\$(realpath --canonicalize-missing "\\$curr_val") + declare -g "\\$par=\\$new_value" +done + +mkdir -p "\\$par_star_output_dir" && echo "\\$par_star_output_dir created" +mkdir -p "\\$par_nrReadsNrGenesPerChrom_dir" && echo "\\$par_nrReadsNrGenesPerChrom_dir created" +mkdir -p "\\$par_star_qc_metrics_dir" && echo "\\$par_star_qc_metrics_dir created" +mkdir -p "\\$par_eset_dir" && echo "\\$par_eset_dir created" +mkdir -p "\\$par_f_data_dir" && echo "\\$par_f_data_dir created" +mkdir -p "\\$par_p_data_dir" && echo "\\$par_p_data_dir created" + +echo +echo "Copying STAR output files..." +IFS=";" read -ra star_output <<<\\$par_star_output +for i in "\\${star_output[@]}"; do + echo "Copying \\$i to \\$par_star_output_dir/" + cp -rL "\\$i" "\\$par_star_output_dir/" +done + +echo +echo "Copying nrReadsNrGenesPerChrom files..." +IFS=";" read -ra nrReadsNrGenesPerChrom <<<\\$par_nrReadsNrGenesPerChrom +for i in "\\${nrReadsNrGenesPerChrom[@]}"; do + echo "Copying \\$i to \\$par_nrReadsNrGenesPerChrom_dir/" + cp -rL "\\$i" "\\$par_nrReadsNrGenesPerChrom_dir/" +done + +echo +echo "Copying STAR QC metrics files..." +IFS=";" read -ra star_qc_metrics <<<\\$par_star_qc_metrics +for i in "\\${star_qc_metrics[@]}"; do + echo "Copying \\$i to \\$par_star_qc_metrics_dir/" + cp -rL "\\$i" "\\$par_star_qc_metrics_dir/" +done + +echo +echo "Copying eset files..." +IFS=";" read -ra eset <<<\\$par_eset +for i in "\\${eset[@]}"; do + echo "Copying \\$i to \\$par_eset_dir/" + cp -rL "\\$i" "\\$par_eset_dir/" +done + +echo +echo "Copying f_data files..." +IFS=";" read -ra f_data <<<\\$par_f_data +for i in "\\${f_data[@]}"; do + echo "Copying \\$i to \\$par_f_data_dir/" + cp -rL "\\$i" "\\$par_f_data_dir/" +done + +echo +echo "Copying p_data files..." +IFS=";" read -ra p_data <<<\\$par_p_data +for i in "\\${p_data[@]}"; do + echo "Copying \\$i to \\$par_p_data_dir/" + cp -rL "\\$i" "\\$par_p_data_dir/" +done + +echo +echo "Copying single files directly..." +mkdir -p \\$(dirname "\\$par_html_report_output") +echo "Copying \\$par_html_report to \\$par_html_report_output" +cp -L "\\$par_html_report" "\\$par_html_report_output" + +echo "Copying \\$par_run_params to \\$par_run_params_output" +mkdir -p \\$(dirname "\\$par_run_params_output") +cp -L "\\$par_run_params" "\\$par_run_params_output" + +echo +echo "Publishing completed successfully!" +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/io/publish_results", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/io/publish_results/nextflow.config b/target/nextflow/io/publish_results/nextflow.config new file mode 100644 index 00000000..082a4fbd --- /dev/null +++ b/target/nextflow/io/publish_results/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'io/publish_results' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Publish the results' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/io/publish_results/nextflow_labels.config b/target/nextflow/io/publish_results/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/io/publish_results/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/io/publish_results/nextflow_schema.json b/target/nextflow/io/publish_results/nextflow_schema.json new file mode 100644 index 00000000..b83eb8e6 --- /dev/null +++ b/target/nextflow/io/publish_results/nextflow_schema.json @@ -0,0 +1,185 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "publish_results", + "description": "Publish the results", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "star_output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Output from mapping with STAR", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "nrReadsNrGenesPerChrom": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "star_qc_metrics": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "eset": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "f_data": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "p_data": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "html_report": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "run_params": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "output directory": { + "title": "Output directory", + "type": "object", + "description": "Determines the name of output directories\n", + "properties": { + "star_output_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"star_output\"`, direction: `output`. ", + "default": "star_output" + }, + "nrReadsNrGenesPerChrom_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"nrReadsNrGenesPerChrom\"`, direction: `output`. ", + "default": "nrReadsNrGenesPerChrom" + }, + "star_qc_metrics_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"starLogs\"`, direction: `output`. ", + "default": "starLogs" + }, + "eset_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"esets\"`, direction: `output`. ", + "default": "esets" + }, + "f_data_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"fData\"`, direction: `output`. ", + "default": "fData" + }, + "p_data_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"pData\"`, direction: `output`. ", + "default": "pData" + } + } + }, + "output file arguments": { + "title": "Output file arguments", + "type": "object", + "description": "Determines the name of output files", + "properties": { + "run_params_output": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.run_params_output\"`, direction: `output`. ", + "default": "$id.$key.run_params_output" + }, + "html_report_output": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.html_report_output\"`, direction: `output`. ", + "default": "$id.$key.html_report_output" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output directory" + }, + { + "$ref": "#/$defs/output file arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/parallel_map/.config.vsh.yaml b/target/nextflow/parallel_map/.config.vsh.yaml new file mode 100644 index 00000000..2f56646b --- /dev/null +++ b/target/nextflow/parallel_map/.config.vsh.yaml @@ -0,0 +1,338 @@ +name: "parallel_map" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Toni Verbeiren" + roles: + - "author" + - "maintainer" + info: + role: "Core Team Member" + links: + github: "tverbeiren" + linkedin: "verbeiren" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist and CEO" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input_r1" + description: "Input FASTQ files for the forward reads. All FASTQ file names must\ + \ start with the prefix '{well_id}_R1', where\n'well_id' can be found as the\ + \ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\ + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided\ + \ to the 'input_r2' argument,\nmeaning that their 'well_id' prefix must match.\ + \ The number of items provided for 'input_r1' must be equal\nto the number of\ + \ items for 'input_r2'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Input FASTQ files for the reverse reads. All FASTQ file names must\ + \ start with the prefix '{well_id}_R2', where\n'well_id' can be found as the\ + \ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\ + For each FASTQ file, a matching FASTQ file for the reverse reads must be provided\ + \ to the 'input_r1' argument,\nmeaning that their 'well_id' prefix must match.\ + \ The number of items provided for 'input_r1' must be equal\nto the number of\ + \ items for 'input_r2'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--genomeDir" + description: "Reference genome to match to. Can be generated from genomic FASTA\ + \ sequences and a genome annotation\nby using STAR with '--runMode genomeGenerate'.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + description: "FASTA file where each entry specifies a unique barcode sequence\ + \ present at the start of the forward input reads\n(input_r1). The IDs of each\ + \ barcode (the start of the FASTA headers up until the first whitespace character)\ + \ must\nmatch with the start of one input FASTQ pair.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Barcode arguments" + arguments: + - type: "integer" + name: "--umiLength" + description: "Length of the Unique Molecular Identifiers (UMI). The UMI are expected\ + \ to be located after the barcodes in the\nforwards reads.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--limitBAMsortRAM" + info: null + default: + - "10000000000" + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Runtime arguments" + arguments: + - type: "integer" + name: "--runThreadN" + description: "Number of threads to use for a single STAR execution." + info: null + default: + - 1 + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output" + description: "A list of output folders which are the result of using STAR to map\ + \ each input FASTQ pair STAR to the reference genome.\nThe order of the items\ + \ DO NOT match with the order of the entries in the barcodes FASTA file or the\ + \ input FASTQ pairs. \n" + info: null + default: + - "./*" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--joblog" + description: "Where to store the log file listing all the jobs." + info: null + default: + - "execution_log.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "bash_script" + path: "script.sh" + is_executable: true +- type: "file" + path: "STAR" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference\ + \ (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n" +test_resources: +- type: "bash_script" + path: "test.sh" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "debian:stable-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "wget" + - "automake" + - "make" + - "gcc" + - "g++" + - "zlib1g-dev" + - "parallel" + - "file" + - "seqkit" + interactive: false + - type: "docker" + copy: + - "STAR /usr/local/bin/$STAR_BINARY" + build_args: + - "STAR_V=2.7.6a" + env: + - "STAR_SOURCE=\"https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz\"" + - "STAR_TARGET=\"/app/star-$STAR_V.tar.gz\"" + - "STAR_INSTALL_DIR=\"/app/STAR-$STAR_V\"" + - "STAR_BINARY=STAR" + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/parallel_map/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/parallel_map" + executable: "target/nextflow/parallel_map/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/parallel_map/STAR b/target/nextflow/parallel_map/STAR new file mode 100755 index 00000000..331a838d Binary files /dev/null and b/target/nextflow/parallel_map/STAR differ diff --git a/target/nextflow/parallel_map/_viash.yaml b/target/nextflow/parallel_map/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/parallel_map/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/parallel_map/main.nf b/target/nextflow/parallel_map/main.nf new file mode 100644 index 00000000..601a3c12 --- /dev/null +++ b/target/nextflow/parallel_map/main.nf @@ -0,0 +1,4293 @@ +// parallel_map updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Toni Verbeiren (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "parallel_map", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Toni Verbeiren", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "role" : "Core Team Member", + "links" : { + "github" : "tverbeiren", + "linkedin" : "verbeiren" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist and CEO" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_r1", + "description" : "Input FASTQ files for the forward reads. All FASTQ file names must start with the prefix '{well_id}_R1', where\n'well_id' can be found as the sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\nFor each FASTQ file, a matching FASTQ file for the reverse reads must be provided to the 'input_r2' argument,\nmeaning that their 'well_id' prefix must match. The number of items provided for 'input_r1' must be equal\nto the number of items for 'input_r2'.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "Input FASTQ files for the reverse reads. All FASTQ file names must start with the prefix '{well_id}_R2', where\n'well_id' can be found as the sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\nFor each FASTQ file, a matching FASTQ file for the reverse reads must be provided to the 'input_r1' argument,\nmeaning that their 'well_id' prefix must match. The number of items provided for 'input_r1' must be equal\nto the number of items for 'input_r2'.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genomeDir", + "description" : "Reference genome to match to. Can be generated from genomic FASTA sequences and a genome annotation\nby using STAR with '--runMode genomeGenerate'.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodesFasta", + "description" : "FASTA file where each entry specifies a unique barcode sequence present at the start of the forward input reads\n(input_r1). The IDs of each barcode (the start of the FASTA headers up until the first whitespace character) must\nmatch with the start of one input FASTQ pair.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Barcode arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--umiLength", + "description" : "Length of the Unique Molecular Identifiers (UMI). The UMI are expected to be located after the barcodes in the\nforwards reads.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--limitBAMsortRAM", + "default" : [ + "10000000000" + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Runtime arguments", + "arguments" : [ + { + "type" : "integer", + "name" : "--runThreadN", + "description" : "Number of threads to use for a single STAR execution.", + "default" : [ + 1 + ], + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "A list of output folders which are the result of using STAR to map each input FASTQ pair STAR to the reference genome.\nThe order of the items DO NOT match with the order of the entries in the barcodes FASTA file or the input FASTQ pairs. \n", + "default" : [ + "./*" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--joblog", + "description" : "Where to store the log file listing all the jobs.", + "default" : [ + "execution_log.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "bash_script", + "path" : "script.sh", + "is_executable" : true + }, + { + "type" : "file", + "path" : "STAR" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n", + "test_resources" : [ + { + "type" : "bash_script", + "path" : "test.sh", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "debian:stable-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "wget", + "automake", + "make", + "gcc", + "g++", + "zlib1g-dev", + "parallel", + "file", + "seqkit" + ], + "interactive" : false + }, + { + "type" : "docker", + "copy" : [ + "STAR /usr/local/bin/$STAR_BINARY" + ], + "build_args" : [ + "STAR_V=2.7.6a" + ], + "env" : [ + "STAR_SOURCE=\\"https://github.com/alexdobin/STAR/archive/refs/tags/$STAR_V.tar.gz\\"", + "STAR_TARGET=\\"/app/star-$STAR_V.tar.gz\\"", + "STAR_INSTALL_DIR=\\"/app/STAR-$STAR_V\\"", + "STAR_BINARY=STAR" + ] + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/parallel_map/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/parallel_map", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.sh" +cat > "$tempscript" << VIASHMAIN +#!/bin/bash + +## VIASH START +# The following code has been auto-generated by Viash. +$( if [ ! -z ${VIASH_PAR_INPUT_R1+x} ]; then echo "${VIASH_PAR_INPUT_R1}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input_r1='&'#" ; else echo "# par_input_r1="; fi ) +$( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "${VIASH_PAR_INPUT_R2}" | sed "s#'#'\\"'\\"'#g;s#.*#par_input_r2='&'#" ; else echo "# par_input_r2="; fi ) +$( if [ ! -z ${VIASH_PAR_GENOMEDIR+x} ]; then echo "${VIASH_PAR_GENOMEDIR}" | sed "s#'#'\\"'\\"'#g;s#.*#par_genomeDir='&'#" ; else echo "# par_genomeDir="; fi ) +$( if [ ! -z ${VIASH_PAR_BARCODESFASTA+x} ]; then echo "${VIASH_PAR_BARCODESFASTA}" | sed "s#'#'\\"'\\"'#g;s#.*#par_barcodesFasta='&'#" ; else echo "# par_barcodesFasta="; fi ) +$( if [ ! -z ${VIASH_PAR_UMILENGTH+x} ]; then echo "${VIASH_PAR_UMILENGTH}" | sed "s#'#'\\"'\\"'#g;s#.*#par_umiLength='&'#" ; else echo "# par_umiLength="; fi ) +$( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "${VIASH_PAR_LIMITBAMSORTRAM}" | sed "s#'#'\\"'\\"'#g;s#.*#par_limitBAMsortRAM='&'#" ; else echo "# par_limitBAMsortRAM="; fi ) +$( if [ ! -z ${VIASH_PAR_RUNTHREADN+x} ]; then echo "${VIASH_PAR_RUNTHREADN}" | sed "s#'#'\\"'\\"'#g;s#.*#par_runThreadN='&'#" ; else echo "# par_runThreadN="; fi ) +$( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "${VIASH_PAR_OUTPUT}" | sed "s#'#'\\"'\\"'#g;s#.*#par_output='&'#" ; else echo "# par_output="; fi ) +$( if [ ! -z ${VIASH_PAR_JOBLOG+x} ]; then echo "${VIASH_PAR_JOBLOG}" | sed "s#'#'\\"'\\"'#g;s#.*#par_joblog='&'#" ; else echo "# par_joblog="; fi ) +$( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "${VIASH_META_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_name='&'#" ; else echo "# meta_name="; fi ) +$( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "${VIASH_META_FUNCTIONALITY_NAME}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_functionality_name='&'#" ; else echo "# meta_functionality_name="; fi ) +$( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "${VIASH_META_RESOURCES_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_resources_dir='&'#" ; else echo "# meta_resources_dir="; fi ) +$( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "${VIASH_META_EXECUTABLE}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_executable='&'#" ; else echo "# meta_executable="; fi ) +$( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "${VIASH_META_CONFIG}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_config='&'#" ; else echo "# meta_config="; fi ) +$( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "${VIASH_META_TEMP_DIR}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_temp_dir='&'#" ; else echo "# meta_temp_dir="; fi ) +$( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "${VIASH_META_CPUS}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_cpus='&'#" ; else echo "# meta_cpus="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "${VIASH_META_MEMORY_B}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_b='&'#" ; else echo "# meta_memory_b="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "${VIASH_META_MEMORY_KB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kb='&'#" ; else echo "# meta_memory_kb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "${VIASH_META_MEMORY_MB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mb='&'#" ; else echo "# meta_memory_mb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "${VIASH_META_MEMORY_GB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gb='&'#" ; else echo "# meta_memory_gb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "${VIASH_META_MEMORY_TB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tb='&'#" ; else echo "# meta_memory_tb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "${VIASH_META_MEMORY_PB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pb='&'#" ; else echo "# meta_memory_pb="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "${VIASH_META_MEMORY_KIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_kib='&'#" ; else echo "# meta_memory_kib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "${VIASH_META_MEMORY_MIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_mib='&'#" ; else echo "# meta_memory_mib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "${VIASH_META_MEMORY_GIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_gib='&'#" ; else echo "# meta_memory_gib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "${VIASH_META_MEMORY_TIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_tib='&'#" ; else echo "# meta_memory_tib="; fi ) +$( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "${VIASH_META_MEMORY_PIB}" | sed "s#'#'\\"'\\"'#g;s#.*#meta_memory_pib='&'#" ; else echo "# meta_memory_pib="; fi ) + +## VIASH END + +set -eo pipefail + +# Check if wildcard character is present in output folder template +printf "Checking if output folder template (\\$par_output) contains a single wildcard character '*'. " +output_glob_character="\\${par_output//[^\\\\*]}" +if [[ "\\${#output_glob_character}" -ne "1" ]]; then + echo "The value for --output must contain exactly one '*' character. Exiting..." + exit 1 +else + echo "Done, wildcard character found!" +fi + +# Split the delimited strings into arrays +IFS=';' read -r -a input_r1 <<< "\\$par_input_r1" +IFS=';' read -r -a input_r2 <<< "\\$par_input_r2" + +# Read barcodes FASTQ +# seqkit will make sure to take the leading non-whitespace as sequence identifier (ID) +# Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header. +readarray -t well_ids < <(seqkit seq --name "\\$par_barcodesFasta" ) +readarray -t barcodes < <(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "\\$par_barcodesFasta") + +# Function to test for unique values in array +function arrayContainsUniqueValues { + # Pass the argument by reference + local -n arr=\\$1 + # Create a temporary associative array + # in order to use its uniqueness of keys + # 'declare' in a function is automatically local + declare -A uniq_tmp + for item in "\\${arr[@]}"; do + uniq_tmp[\\$item]=0 # assigning a placeholder + done + local unique_array_values=(\\${!uniq_tmp[@]}) + if [ "\\${#unique_array_values[@]}" -eq "\\${#arr[@]}" ]; then + return + fi + false +} +arrayContainsUniqueValues barcodes +is_array_unique_exit_code=\\$? +if ! (exit \\$is_array_unique_exit_code); then + echo "The provided barcodes should be unique!" + echo "Values: \\$par_barcodes" + exit 1 +fi + +# Check that the number of values provided for the fastq files are the same. +num_r1_inputs="\\${#input_r1[@]}" +num_r2_inputs="\\${#input_r2[@]}" + +if [ ! "\\$num_r1_inputs" -eq "\\$num_r2_inputs" ]; then + echo "The number of values for arguments "\\\\ + "'input_r1' (\\$num_r1_inputs) and 'input_r2' (\\$num_r2_inputs) "\\\\ + "should be the same." + exit 1 +else + echo "Checked if the same as the number of R1 FASTQ (\\$num_r1_inputs) and R2 FASTQ files "\\\\ + "(\\$num_r2_inputs) were provided. Seems OK!" +fi + +# Loop over the well IDs and match them to the input FASTQ files +# The FASTQ file names should have the format {well_id}_R(1|2).fastq, +# which is the output format that the cutadapt component uses for demultiplexing. +# sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order +# of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file). +declare -a sorted_input_r1=() +declare -a sorted_input_r2=() +for barcode_index in "\\${!barcodes[@]}"; do + barcode="\\${barcodes[\\$barcode_index]}" + well_id="\\${well_ids[\\$barcode_index]}" + echo "Finding FASTQ files for barcode \\${barcode}, well ID '\\${well_id}'." + # The FASTQ files for a particular barcode must match the following regex: + input_file_regex="^\\${well_id}_R[1-2]" + + for r1_index in "\\${!input_r1[@]}"; do + r1_file_path=\\${input_r1[\\$r1_index]} + r2_file_path=\\${input_r2[\\$r1_index]} + # Get the file names from the full path + r1_file_name=\\$(basename -- "\\$r1_file_path") + r2_file_name=\\$(basename -- "\\$r2_file_path") + + # Check if the file names match the regex + if [[ \\$r1_file_name =~ \\$input_file_regex ]]; then + echo "Matched with \\$r1_file_name and \\$r2_file_name." + # If the R1 FASTQ file matched the regex, + # the R2 file must have also been matched + if ! [[ \\$r2_file_name =~ \\$input_file_regex ]]; then + echo "File \\${r1_file_name} matched with regex \\${input_file_regex} "\\\\ + "but \\${r2_file_name} did not! Make sure that the order of "\\\\ + "the R1 and R2 input files match." + exit 1 + fi + # Add the + sorted_input_r1+=("\\$r1_file_path") + sorted_input_r2+=("\\$r2_file_path") + # Do not continue looking for more files for this barcode + # '2' to affect the *outer* loop (which indeed loops barcodes)! + continue 2 + fi + done + echo "Did not find FASTQ files files for well \\${well_id}! "\\\\ + "Make sure that the input files have the correct file name format."\\\\ + "Input files: \\${input_r1[@]}" + exit 1 +done + + +# Define the function that will be used to run a single job +function _run() { + local par_UMIlength="\\$1" + local par_output="\\$2" + local par_genomeDir="\\$3" + local par_limitBAMsortRAM="\\$4" + local par_runThreadN="\\$5" + local barcode="\\$6" + local input_R1="\\$7" + local input_R2="\\$8" + + local barcode_length="\\${#barcode}" + local umi_start="\\$((\\$barcode_length + 1))" + + set -eo pipefail + + echo <<-EOF + Processing \\$barcode + For the following inputs (lanes): + "\\$star_readFilesIn + EOF + + echo "Writing barcode '\\$barcode' to \\$barcode.txt and using it as input". + # Note that there is no possible conflict between jobs here + # because the barcodes are unique (and the barcode is part of the name + # of the file). + echo "\\$barcode" > "\\$barcode.txt" + + local dir="\\${par_output//\\\\*/\\$barcode}/" + echo "Setting output for barcode '\\$barcode' to '\\$dir'." + mkdir -p "\\$dir" + + # check if files are compressed + local TMPDIR=\\$(mktemp -d "\\$meta_temp_dir/parallel_map-\\$barcode-XXXXXX") + function clean_up { + [[ -d "\\$TMPDIR" ]] && rm -r "\\$TMPDIR" + } + trap clean_up RETURN + + # Decompress the input files when needed + # NOTE: for some reason, using STAR's --readFilesCommand does not always work + # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html) + # and this requires a filesystem that supports this. Another cause might be that the input files + # are symlinks. When testing this, using '--readFilesCommand "zcat"' + # always produced empty BAM files, but also a succesfull exit code (0) so the problem is not reported. + # However, the logs showed the following error: "gzip -: unexpected end of file". + + function is_gzipped { + printf "Checking if input '\\$1' (barcode '\\$barcode') is gzipped... " + if file "\\$1" | grep -q 'gzip'; then + echo "Done, detected compressed file." + return + fi + echo "Done, file does not need decompression." + false + } + + # Resolve symbolic links to actual file paths + input_R1=\\$(realpath \\$input_R1) + input_R2=\\$(realpath \\$input_R2) + + if is_gzipped \\$input_R1; then + local compressed_file_name_r1="\\$(basename -- \\$input_R1)" + local uncompressed_file_r1="\\$TMPDIR/\\${compressed_file_name_r1%.gz}" + printf "Unpacking input to \\$uncompressed_file_r1... " + zcat "\\$input_R1" > "\\$uncompressed_file_r1" + echo "Decompression done." + else + local uncompressed_file_r1="\\$input_R1" + fi + + if is_gzipped \\$input_R2; then + local compressed_file_name_r2="\\$(basename -- \\$input_R2)" + local uncompressed_file_r2="\\$TMPDIR/\\${compressed_file_name_r2%.gz}" + printf "Unpacking input to \\$uncompressed_file_r2... " + zcat "\\$input_R2" > "\\$uncompressed_file_r2" + echo "Decompression done." + else + local uncompressed_file_r2="\\$input_R2" + fi + + local n_input_lines_r1=\\$(wc -l < "\\$uncompressed_file_r1") + local n_input_lines_r2=\\$(wc -l < "\\$uncompressed_file_r2") + + printf "Checking if length of input file mates match. " + if (( \\$n_input_lines_r1 != n_input_lines_r2 )); then + echo "The length of file \\$input_R1 (\\$n_input_lines_r1) does not match with \\$input_R2 (\\$n_input_lines_r2)" + return 1 + else + echo "Seems OK, \\$n_input_lines_r1 input lines." + fi + echo "Starting STAR for barcode '\\$barcode'" + # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length. + # By default in this mode, STAR will look for the cell barcode and the UMI int the last files specified with --readFilesIn + # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI. + # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires the clipping + # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do. + STAR \\\\ + --readFilesIn "\\$uncompressed_file_r2" "\\$uncompressed_file_r1" \\\\ + --soloType Droplet \\\\ + --quantMode GeneCounts \\\\ + --genomeLoad LoadAndKeep \\\\ + --limitBAMsortRAM "\\$par_limitBAMsortRAM" \\\\ + --runThreadN "\\$par_runThreadN" \\\\ + --outFilterMultimapNmax 1 \\\\ + --outSAMtype BAM SortedByCoordinate \\\\ + --soloCBstart 1 \\\\ + --readFilesType "Fastx" \\\\ + --soloCBlen "\\$barcode_length" \\\\ + --soloUMIstart "\\$umi_start" \\\\ + --soloUMIlen "\\$par_UMIlength" \\\\ + --soloBarcodeReadLength 0 \\\\ + --soloStrand Unstranded \\\\ + --soloFeatures Gene \\\\ + --genomeDir "\\$par_genomeDir" \\\\ + --outReadsUnmapped Fastx \\\\ + --outSAMunmapped Within \\\\ + --outSAMattributes NH HI nM AS CR UR CB UB GX GN \\\\ + --soloCBwhitelist "\\$barcode.txt" \\\\ + --outFileNamePrefix "\\$dir" \\\\ + --outTmpDir "\\$TMPDIR/STARtemp/" + + printf "Done running STAR. " + # Check if the number of processed reads is equal to the number of input reads + local n_input_reads=\\$((\\$n_input_lines_r1 / 4)) + local nr_output_reads=\\$(grep -Po "Number\\\\ of\\\\ input\\\\ reads \\\\\\\\|\\\\W*\\\\K\\\\d+" "\\$dir/Log.final.out") + if (( \\$nr_output_reads != \\$n_input_reads )); then + echo "Not all input reads were processed for barcode \\$barcode." + return 1 + else + echo "Processed \\$nr_output_reads reads for barcode \\$barcode". + fi + + printf "Making sure that the output has the proper permissions." + find "\\$dir" -type d -exec chmod o+x {} \\\\; + chmod -R o+r "\\$dir" + echo "Done" +} + +# Export the function - requires bash +export -f _run + +# Load reference genome +echo "Loading reference genome" +STAR --genomeLoad LoadAndExit --genomeDir "\\$par_genomeDir" + +# Run the concurrent jobs using GNU parallel + +# Make sure that parallel uses the correct shell +export PARALLEL_SHELL="/bin/bash" + +# Some notes: +# --halt now,fail=1: instruct parallel to exit when a job has failed and kill remaining running jobs. +# +# ::: is a special syntax for GNU parallel to delineate inputs +# If multiple ::: are given, each group will be treated as an input source, and all combinations of input +# sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c) +# The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input +# sources will be read. +parallel_cmd=("parallel" "--jobs" "80%" "--verbose" "--memfree" "2G" + "--tmpdir" "\\$meta_temp_dir" + "--retry-failed" "--retries" "4" "--halt" "soon,fail=1" + "--joblog" "\\$par_joblog" "_run" "{}") + +# Arguments for which there is one value, so these will not create extra jobs +parallel_cmd+=(":::" "\\$par_umiLength" ":::" "\\$par_output" ":::" "\\$par_genomeDir" ":::" "\\$par_limitBAMsortRAM" ":::" "\\$par_runThreadN") + +# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected +# Thus, these argument lists should have the same length. +parallel_cmd+=(":::" "\\${barcodes[@]}" ":::+" "\\${sorted_input_r1[@]}" ":::+" "\\${sorted_input_r2[@]}") + +set +eo pipefail +"\\${parallel_cmd[@]}" +exit_code=\\$? +set -eo pipefail + +echo "GNU parallel finished!" + +# Unload reference +printf "Unloading reference genome. " +STAR --genomeLoad Remove --genomeDir "\\$par_genomeDir" +echo "Done!" + +# Exit code from GNU parallel: +# If fail=1 is used, the exit status will be the exit status of the failing job. +echo "Checking exit code" +if ((exit_code>0)); then + # Note that the ending HERE must be indented with TAB characters (not spaces) + # in order to remove leading indentation + MESSAGE=\\$( + cat <<-HERE + ================================================================== + + !!! An error occurred for one of the jobs. + Exit code of the failing job: \\$exit_code. + + %s + + ================================================================== + + HERE + ) + printf "\\$MESSAGE" "\\$(<\\$par_joblog)" + exit 1 +else + cat <<-HERE + ================================================================== + + Mapping went fine (exit code '\\$exit_code'), zero errors occurred + + ================================================================== + HERE + +fi +VIASHMAIN +bash "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/parallel_map", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/parallel_map/nextflow.config b/target/nextflow/parallel_map/nextflow.config new file mode 100644 index 00000000..8089ae61 --- /dev/null +++ b/target/nextflow/parallel_map/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'parallel_map' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n' + author = 'Dries Schaumont, Toni Verbeiren' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/parallel_map/nextflow_labels.config b/target/nextflow/parallel_map/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/parallel_map/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/parallel_map/nextflow_schema.json b/target/nextflow/parallel_map/nextflow_schema.json new file mode 100644 index 00000000..702bf13a --- /dev/null +++ b/target/nextflow/parallel_map/nextflow_schema.json @@ -0,0 +1,135 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "parallel_map", + "description": "Map wells in batch, using STAR\nSpliced Transcripts Alignment to a Reference (C) Alexander Dobin\nhttps://github.com/alexdobin/STAR\n", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Input FASTQ files for the forward reads", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Input FASTQ files for the reverse reads", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "genomeDir": { + "type": "string", + "format": "path", + "exists": true, + "description": "Reference genome to match to", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "barcodesFasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "FASTA file where each entry specifies a unique barcode sequence present at the start of the forward input reads\n(input_r1)", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "barcode arguments": { + "title": "Barcode arguments", + "type": "object", + "description": "No description", + "properties": { + "umiLength": { + "type": "integer", + "description": "Length of the Unique Molecular Identifiers (UMI)", + "help_text": "Type: `integer`, multiple: `False`, required. " + }, + "limitBAMsortRAM": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, default: `\"10000000000\"`. ", + "default": "10000000000" + } + } + }, + "runtime arguments": { + "title": "Runtime arguments", + "type": "object", + "description": "No description", + "properties": { + "runThreadN": { + "type": "integer", + "description": "Number of threads to use for a single STAR execution.", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "A list of output folders which are the result of using STAR to map each input FASTQ pair STAR to the reference genome.\nThe order of the items DO NOT match with the order of the entries in the barcodes FASTA file or the input FASTQ pairs", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"./*\"]`, direction: `output`. ", + "default": [ + "./*" + ] + }, + "joblog": { + "type": "string", + "format": "path", + "description": "Where to store the log file listing all the jobs.", + "help_text": "Type: `file`, multiple: `False`, default: `\"execution_log.txt\"`, direction: `output`. ", + "default": "execution_log.txt" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/barcode arguments" + }, + { + "$ref": "#/$defs/runtime arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/report/create_report/.config.vsh.yaml b/target/nextflow/report/create_report/.config.vsh.yaml new file mode 100644 index 00000000..9ebaf4f3 --- /dev/null +++ b/target/nextflow/report/create_report/.config.vsh.yaml @@ -0,0 +1,268 @@ +name: "create_report" +namespace: "report" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "author" + - "maintainer" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--eset" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output_report" + info: null + example: + - "report.html" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "r_script" + path: "script.R" + is_executable: true +- type: "r_script" + path: "template.Rmd" + is_executable: true +- type: "r_script" + path: "plateLayouts.R" + is_executable: true +- type: "file" + path: "OutputSTARsolo.png" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Create a basic QC report in HTML format based on a number of esets.\n" +test_resources: +- type: "r_script" + path: "test.R" + is_executable: true +- type: "file" + path: "test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "rocker/r2u:24.04" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + - "pandoc" + interactive: false + - type: "r" + script: + - "install.packages(\"BiocManager\")" + - "BiocManager::install(version = \"3.21\", type = \"source\", checkBuilt = TRUE)" + bioc_force_install: false + warnings_as_errors: true + - type: "r" + cran: + - "ggplot2" + - "knitr" + - "gridExtra" + - "RColorBrewer" + - "processx" + - "whisker" + - "rmarkdown" + - "bookdown" + - "data.table" + - "platetools" + - "htmltools" + - "DT" + - "logger" + - "bit64" + bioc: + - "Biobase" + - "ComplexHeatmap" + script: + - "install.packages(\"oaStyle\", repos = c(rdepot = \"https://repos.openanalytics.eu/repo/public\"\ + , getOption(\"repos\")))" + bioc_force_install: false + warnings_as_errors: true + test_setup: + - type: "r" + packages: + - "testthat" + - "R.utils" + bioc_force_install: false + warnings_as_errors: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/report/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/report/create_report" + executable: "target/nextflow/report/create_report/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/report/create_report/OutputSTARsolo.png b/target/nextflow/report/create_report/OutputSTARsolo.png new file mode 100644 index 00000000..cb77d8e0 Binary files /dev/null and b/target/nextflow/report/create_report/OutputSTARsolo.png differ diff --git a/target/nextflow/report/create_report/_viash.yaml b/target/nextflow/report/create_report/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/report/create_report/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/report/create_report/main.nf b/target/nextflow/report/create_report/main.nf new file mode 100644 index 00000000..7b498c39 --- /dev/null +++ b/target/nextflow/report/create_report/main.nf @@ -0,0 +1,3953 @@ +// create_report updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Marijke Van Moerbeke (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "create_report", + "namespace" : "report", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--eset", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_report", + "example" : [ + "report.html" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "r_script", + "path" : "script.R", + "is_executable" : true + }, + { + "type" : "r_script", + "path" : "template.Rmd", + "is_executable" : true + }, + { + "type" : "r_script", + "path" : "plateLayouts.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "OutputSTARsolo.png" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Create a basic QC report in HTML format based on a number of esets.\n", + "test_resources" : [ + { + "type" : "r_script", + "path" : "test.R", + "is_executable" : true + }, + { + "type" : "file", + "path" : "./test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "rocker/r2u:24.04", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps", + "pandoc" + ], + "interactive" : false + }, + { + "type" : "r", + "script" : [ + "install.packages(\\"BiocManager\\")", + "BiocManager::install(version = \\"3.21\\", type = \\"source\\", checkBuilt = TRUE)" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + }, + { + "type" : "r", + "cran" : [ + "ggplot2", + "knitr", + "gridExtra", + "RColorBrewer", + "processx", + "whisker", + "rmarkdown", + "bookdown", + "data.table", + "platetools", + "htmltools", + "DT", + "logger", + "bit64" + ], + "bioc" : [ + "Biobase", + "ComplexHeatmap" + ], + "script" : [ + "install.packages(\\"oaStyle\\", repos = c(rdepot = \\"https://repos.openanalytics.eu/repo/public\\", getOption(\\"repos\\")))" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ], + "test_setup" : [ + { + "type" : "r", + "packages" : [ + "testthat", + "R.utils" + ], + "bioc_force_install" : false, + "warnings_as_errors" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/report/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/report/create_report", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.R" +cat > "$tempscript" << VIASHMAIN +## VIASH START +# The following code has been auto-generated by Viash. +# treat warnings as errors +.viash_orig_warn <- options(warn = 2) + +par <- list( + "eset" = $( if [ ! -z ${VIASH_PAR_ESET+x} ]; then echo -n "strsplit('"; echo -n "$VIASH_PAR_ESET" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "', split = ';')[[1]]"; else echo NULL; fi ), + "output_report" = $( if [ ! -z ${VIASH_PAR_OUTPUT_REPORT+x} ]; then echo -n "'"; echo -n "$VIASH_PAR_OUTPUT_REPORT" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ) +) +meta <- list( + "name" = $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "functionality_name" = $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo -n "'"; echo -n "$VIASH_META_FUNCTIONALITY_NAME" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "resources_dir" = $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_RESOURCES_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "executable" = $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo -n "'"; echo -n "$VIASH_META_EXECUTABLE" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "config" = $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo -n "'"; echo -n "$VIASH_META_CONFIG" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "temp_dir" = $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo -n "'"; echo -n "$VIASH_META_TEMP_DIR" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "'"; else echo NULL; fi ), + "cpus" = $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo -n "as.integer('"; echo -n "$VIASH_META_CPUS" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_b" = $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_B" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kb" = $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mb" = $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gb" = $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tb" = $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pb" = $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_kib" = $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_KIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_mib" = $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_MIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_gib" = $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_GIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_tib" = $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_TIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ), + "memory_pib" = $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo -n "bit64::as.integer64('"; echo -n "$VIASH_META_MEMORY_PIB" | sed "s#['\\\\]#\\\\\\\\&#g"; echo "')"; else echo NULL; fi ) +) +dep <- list( + +) + + +# restore original warn setting +options(.viash_orig_warn) +rm(.viash_orig_warn) + +## VIASH END +library(whisker) +library(logger) + +log_info("Setting temporary directory to: {meta\\$temp_dir}") +Sys.setenv(TMP = meta\\$temp_dir) +temp_folder <- tempdir(check = TRUE) +log_info("Created temporary directory {temp_folder}") + +template <- file.path(meta\\$resources_dir, "template.Rmd") + +esets_normalized <- lapply(par\\$eset, function(eset_path) { + return(file.path(normalizePath(dirname(eset_path)), basename(eset_path))) +}) + +log_info(paste0( + "Rendering markdown {template} to HTML ", + "{par\\$output_report} with esets {paste(esets_normalized, collapse = ', ')}" +)) + +rmarkdown::render( + normalizePath(template), + output_file = basename(par\\$output_report), + output_dir = dirname(par\\$output_report), + runtime = "static", + intermediates_dir = par\\$report_dir, + clean = TRUE, + params = list( + esets = esets_normalized, + outputDir = par\\$report_dir + ) +) + +log_info("Done") +VIASHMAIN +Rscript "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/report/create_report", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/report/create_report/nextflow.config b/target/nextflow/report/create_report/nextflow.config new file mode 100644 index 00000000..dcb6d23c --- /dev/null +++ b/target/nextflow/report/create_report/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'report/create_report' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Create a basic QC report in HTML format based on a number of esets.\n' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/report/create_report/nextflow_labels.config b/target/nextflow/report/create_report/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/report/create_report/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/report/create_report/nextflow_schema.json b/target/nextflow/report/create_report/nextflow_schema.json new file mode 100644 index 00000000..5362bf8e --- /dev/null +++ b/target/nextflow/report/create_report/nextflow_schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "create_report", + "description": "Create a basic QC report in HTML format based on a number of esets.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "eset": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "output_report": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_report.html\"`, direction: `output`, example: `\"report.html\"`. ", + "default": "$id.$key.output_report.html" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/report/create_report/plateLayouts.R b/target/nextflow/report/create_report/plateLayouts.R new file mode 100755 index 00000000..a2909d4a --- /dev/null +++ b/target/nextflow/report/create_report/plateLayouts.R @@ -0,0 +1,447 @@ + +#' Displays the annotation of the wells in a plateLayout +#' @param plateData a data.table object containing the information +#' of the plate. This must contain a "WellID". +#' @param plateName The plate name +#' @param valueVariable The name of the variable in 'plateData' to +#' be visualized in a plate layout. +#' @param textVariable The name of the variable in 'plateData' to be +#' shown in the wells of the plate layout. If NULL, the valueVariable +#' is shown. +#' @param colours A named character vector containing the colours +#' for the different levels of the valuevariable. The names should +#' correspond to the dose levels. if not specified, a scheme of blues +#' will be provided. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows and +#' colums in a plate, e.g. \code{c(16,24)} +#' @param legend.title A title for the legend +#' @param plot.title A title for the plot, will be contracted +#' with the plate name +#' @param ... additional arguments for \code{plateLayout.default} function +#' @import data.table +#' @importFrom platetools fill_plate +#' @export +plateLayout.annotation <- function( + plateData, + plateName = character(), + valueVariable = "Dose", + textVariable = NULL, + breaks = NULL, colours = NULL, + colourWellText = "black", + layout = c(16, 24), + legend.title = "Dose", + plot.title = "Plate Annotation - ", + textFontSize = 9, ... +) { + WellID <- Label <- NULL + + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + #Check WellID Format + checkWellID <- grepl("^[[:upper:]]{1,2}[[:digit:]]{1,2}$", plateData$WellID) + if(!all(checkWellID)){ + stop("WellID does not have the correct format") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = layout[1]*layout[2]) + + plateData$column <- factor( + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + if (!is.null(valueVariable)){ + plateData[, values := as.character(plateData[, ..valueVariable][[1]])] + valueVar <- "values" + }else{ + plateData[, values := "grey"] + valueVar <- "values" + colours <- setNames("grey", "grey") + } + + + if (is.null(colours)) { + + blues <- colorRampPalette(c("#d6e0ff", "#2171B5")) + greens <- colorRampPalette(c("light green", "dark green")) + + numLevels <- sort(as.numeric(as.character(unique(plateData[, values])[ + grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[, values])) + ) + ]))) + otherLevels <- sort(as.character(unique(plateData[, values])[ + !grepl( + "^[[:digit:]]+([.][[:digit:]]+)?$", + trimws(unique(plateData[,values])) + ) + ])) + + colours <- c(blues(length(numLevels)), greens(length(otherLevels)), "red") + names(colours) <- c(numLevels, otherLevels, "failed") + } + + if (!is.null(textVariable)) { + plateData[, + Label := do.call(paste, c(.SD, sep = "\n ")), + .SDcols = textVariable + ] + plateData[, Label := gsub("-", "-\n", Label)] + plateData[, Label := gsub("_", "_\n", Label)] + textVar <- "Label" + } else { + textVar <- NULL + } + + + if (is.null(breaks)){ + breaks <- seq_len(length(colours)) + } + + plateLayout( + plateData = plateData, valueVariable = valueVar, + textVariable = textVar, plateName = plateName, + breaks = breaks, colourWellText = colourWellText, + legend.title = legend.title, layout = layout, + colours = colours, plot.title = plot.title, + textFontSize = textFontSize, ... + ) +} + + + +#' Create a heatmap of values in a plateLayout view. The values can be +#' library sizes, number of genes, qcScore (0/1) or a factor. +#' @param plateData A data.table of the values to be visualized with +#' at least the column of interest (specified in 'varOfInterest') +#' and a 'WellID' column indicating the wells in the plate. The WellID +#' is a combination of a letter (row in the plate) and an integer +#' (column in the plate). +#' @param valueVariable The name of the variable in 'plateData' +#' to be visualized in a plate layout +#' @param textVariable The name of the variable in 'plateData' +#' to be shown in the wells of the plate layout. Defaults to the +#' valueVariable and if NULL, no text will be displayed. +#' @param breaks Numeric vector indicating breaks for plot coloring. +#' @param colours Colours to be used for levels specified by +#' the breaks. If NULL, a colour scheme of purples is shown. +#' @param colourWellText Colour to display the text in the wells. +#' @param layout Integer vector of length two with number of rows +#' and colums in a plate, e.g. \code{c(16,24)} +#' @param makeContourColours Logical, whether or not the plate +#' layout will contain a contour colours for the wells based on the +#' parameters in 'contourColours' and 'categories' +#' @param contourVariable The variable used for the contour colouring +#' @param contourColours Character vector specifying a colour for +#' each range in 'categories' +#' @param labelsCategories Character vector specifying the names +#' (labels) for each range in 'categories' +#' @param categories if contour Variable is not a factor, a numeric +#' vector specifying the categories to divide the 'varOfInterest', +#' including the lower and upper limits. +#' @param plateName The plate name +#' @param plot.title A title for the plot, will be contracted with +#' the plate name +#' @param legend.title A title for the legend +#' @param displayHeatmap Logical, whether to display the plateLayout heatmap +#' @param saveHeatmap Logical, whether to save the plateLayout heatmap +#' @param outputDir The directory where the plateLayout heatmap should be saved +#' @param prefix The prefix to the file name of the saved plateLayout heatmap +#' @param ... additional arguments for \code{ComplexHeatmap::Heatmap} function +#' @importFrom platetools fill_plate +#' @importFrom RColorBrewer brewer.pal +#' @importFrom ComplexHeatmap Heatmap +#' @importFrom circlize colorRamp2 +#' @importFrom grid grid.text grid.rect gpar legendGrob gpar +#' @importFrom grDevices dev.off png +#' @importFrom graphics title +#' @export +plateLayout <- function( + plateData, valueVariable, textVariable = valueVariable, + breaks = NULL, colours = NULL, colourWellText = "white", textFontSize = 6, + layout = c(16, 24), makeContourColours = FALSE, contourVariable = character(), + contourColours = c("red", "orange", "seagreen3"),lwdContours = c(1, 1, 1), + labelsCategories = c('1', '2', '3'), categories = NULL, plateName = character(), + plot.title = character(), legend.title = NULL, legendFontSize = 15, + row_split = rep("A", 16), col_split = rep("A", 24), legendFontSizeTitle = 15, + displayHeatmap = TRUE, saveHeatmap = FALSE, outputDir = ".", prefix = "" +) { + WellID <- NULL + if (!(all(c("WellID", "SampleName") %in% colnames(plateData)))) { + stop(" 'WellID' and 'SampleName' column required in plateData object") + } + + + plateData[, WellID := paste0( + sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + sprintf( + "%02d", + as.numeric(sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID)) + ) + )] + + plateData <- platetools::fill_plate(plateData, "WellID", plate = 384) + + plateData$column <- factor( + sprintf("%02d", as.numeric( + sub(".*[[:alpha:]](.+)", "\\1", plateData$WellID) + )), + levels = sprintf("%02d", seq(1, layout[2])) + ) + plateData$row <- factor(sub(".*([[:alpha:]]).+", "\\1", plateData$WellID), + levels = LETTERS[seq(1, layout[1])]) + + + plateValues <- plateLayoutFormat( + plateData, + varOfInterest = valueVariable, + rows = layout[1], + cols = layout[2] + ) + if (!is.null(textVariable)) { + plateText <- plateLayoutFormat( + plateData, varOfInterest = textVariable, + rows = layout[1], + cols = layout[2] + ) + } + plot.title <- gsub( + "^([a-z])", "\\U\\1", + gsub("([A-Z])", " \\1", + plot.title, perl = TRUE), perl = TRUE + ) + mainTitle <- paste0(plot.title, plateName) + plateContourColours <- matrix("", nrow = layout[1], ncol = layout[2]) + + if (makeContourColours) { + contourData <- plateData[WellType %in% c("nonEmpty", "Treated Wells"), ] + + if (is.numeric(contourData[, ..contourVariable][[1]])) { + contourData$contours <- cut( + contourData[, ..contourVariable][[1]], + categories, left = TRUE, + right = TRUE, + labels = labelsCategories) + } + else { + contourData$contours <- contourData[, ..contourVariable][[1]] + } + names(contourColours) <- labelsCategories + names(lwdContours) <- labelsCategories + for (i in seq_len(layout[1])) { + for (j in seq_len(layout[2])) { + tryCatch({ + sampleHit <- which( + as.character(contourData$WellID) == paste0( + LETTERS[i], sprintf("%02d", j) + ) + ) + if (length(sampleHit) == 1) { + plateContourColours[i, j] <- as.character( + contourData[sampleHit,'contours'][[1]] + ) + } + }, + error = function(e) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + ) + } + } + } + + plateValues$contours <- plateContourColours + colnames(plateValues$values) <- seq_len(ncol(plateValues$values)) + + if (is.null(breaks)) { + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + } + + if (is.null(colours)) { + colours <- tryCatch({ + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + }, + error = function(cond){ + + message("Recomputed breaks for proper colour mapping") + + breakValues <- plateValues$values + breakValues[which(is.na(breakValues))] <- 0 + if (all(breakValues >= 0)) { + breaks <- computeBreaks(7, max(plateValues$values, na.rm = TRUE)) + } else { + breaks <- quantile(plateValues$values, probs = seq(0, 1, 0.125)) + } + + circlize::colorRamp2( + breaks = breaks, + colors = brewer.pal(length(breaks), "Purples") + ) + + }) + } + + ht <- Heatmap( + plateValues$values, + column_title = mainTitle, column_title_side = "top", + rect_gp = gpar(lwd = 0.4), + cluster_rows = FALSE, cluster_columns = FALSE, + col = colours, row_title = NULL, + row_split = row_split, column_split = col_split, + row_names_side = "left", + cluster_row_slices = FALSE, + cluster_column_slices = FALSE, + show_heatmap_legend = TRUE, + heatmap_legend_param = list( + title = ifelse( + is.null(legend.title), + paste0(valueVariable, "\n"), + paste0(legend.title, "\n") + ), + grid_height = unit(9, "mm"), border = "black", + labels_gp = gpar(fontsize = legendFontSize), + title_gp = gpar(fontsize = legendFontSizeTitle) + ), + cell_fun = function(j, i, x, y, width, height, fill) { + if (is.na(plateValues$values[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar(fill = "white", alpha = 0.7, lwd = 0.7, col = "white") + ) + } + else if (!is.null(textVariable)) { + grid.text( + plateText$values[i, j], x, y, + just = "centre", + gp = gpar(fontsize = textFontSize, col = colourWellText) + ) + } + if (makeContourColours) { + if (!is.na(plateValues$contours[i, j])) { + grid.rect( + x, y, width, height, + gp = gpar( + col = contourColours[as.character(plateValues$contours[i, j])], + fill = NA, + lwd = lwdContours[as.character(plateValues$contours[i, j])] + ) + ) + } + } + } + ) + + if (displayHeatmap) { + print(ht) + } + if (saveHeatmap) { + png( + file.path( + outputDir, + paste0(prefix,gsub(" |-", "",plot.title), "_", plateName, ".png") + ), + width = 30, height = 10, units = "cm", res = 1200 + ) + print(ht) + dev.off() + } + + return(ht) +} + + +#' Return numerical matrix with number of reads that corresponds to the +#' plate layout +#' @param data A data.frame of the values to be visualized with at least +#' the columnof interest (specified in 'varOfInterest') and a 'WellID' column +#' indicating the wells in the plate. The WellID is a combination of a +#' letter (row in the plate) and an integer (column in the plate). +#' @param varOfInterest The name of the variable in 'data' to be visualized +#' in a plate layout +#' @param rows number of rows in a plate layout +#' @param cols number of columns in a plate layout +#' @param verbose if \code{TRUE}, samples missing from the plate +#' will be reported +#' @export +plateLayoutFormat <- function( + data, varOfInterest, + rows = 16, cols = 24, + verbose = FALSE +) { + plateValues <- matrix(NA, nrow = rows, ncol = cols) + for (i in seq_len(rows)) { + for (j in seq_len(cols)) { + tryCatch({ + sampleHit <- which( + as.character(data$WellID) == paste0(LETTERS[i], sprintf("%02d", j)) + ) + if(length(sampleHit) == 1){ + plateValues[i, j] <- data[sampleHit, ..varOfInterest][[1]] + } + }, + error = function(e) { + if (verbose == TRUE) { + print(paste0(LETTERS[i], sprintf("%02d", j), " is missing.")) + } + } + ) + } + } + + row.names(plateValues) <- LETTERS[1:rows] + return(list("values" = plateValues)) +} + + + +#' Helper function to automate break selection for raw count data +#' +#' This function creates an exponentially increasing vector for given number +#' breaks between zero and some element of choice. It is particularly useful for +#' raw counts or raw counts per million. +#' +#' @param nBreaks Number of breaks to be generated +#' @param maxElement Maximum value of data entries +#' @export +computeBreaks <- function(nBreaks, variable) { + + maxElement <- max(variable, na.rm = TRUE) + if (length(unique(variable)) == 1) { + breaks <- c(0, 0.5, ifelse(maxElement < 1, 1, maxElement)) + } else { + coefSystem <- solve( + rbind(c(1, 1), c(1, (nBreaks - 1)))) %*% c(0, log(maxElement) + ) + coefExp <- c(exp(coefSystem[1]), coefSystem[2]) + breaks <- coefExp[1] * exp((1:(nBreaks - 1)) * coefExp[2]) + breaks <- unique(c(0, breaks)) + } + return(breaks) +} diff --git a/target/nextflow/report/create_report/template.Rmd b/target/nextflow/report/create_report/template.Rmd new file mode 100755 index 00000000..20f8c190 --- /dev/null +++ b/target/nextflow/report/create_report/template.Rmd @@ -0,0 +1,977 @@ +--- +title: "Exploratory Data Report" +date: "`r format(Sys.time(), '%d %B, %Y')`" +editor_options: + chunk_output_type: console +output: + oaStyle::html_report +# parameters which are overwritten by the script +params: + outputDir: 'output/' + esets: + - sample1.rds + - sample2.rds +--- + + + + + + + + + +```{r params, eval = TRUE, include = FALSE} +outputDir <- params$outputDir +esets <- params$esets +``` + + +```{r outputDir, echo = FALSE} +## Required: ABSOLUTE outputDir +outputDir <- file.path(outputDir) + +# When working on a windows computer it should be +# "/Users/..." instead of "C:/Users/..." +if (.Platform$OS.type == "windows") { + outputDir <- paste0( + "/", + paste( + unlist(strsplit(outputDir, split = "/"))[-1], collapse = "/" + ), + "/" + ) +} +``` + + + + +```{r optionsChunkDoNotModify, echo = FALSE, message = FALSE, warning=FALSE} + +## Chunk with options for knitr. This chunk should not be modified. +knitr::opts_chunk$set( + eval = TRUE, + echo = FALSE, + message = FALSE, + cache = FALSE, + warning = FALSE, + error = FALSE, + comment = NA, #"#", + tidy = FALSE, + collapse = TRUE, + out.width = "100%", + fig.width = 20, + fig.height = 10, + results = "asis") + +knitr::opts_knit$set(root.dir = getwd()) + +options(warn = 1, width = 200) + +``` + +```{r libraries_and_functions} +source("plateLayouts.R") +library(ComplexHeatmap) +library(data.table) +library(ggplot2) +library(knitr) +library(Biobase) +library(gridExtra) +library(RColorBrewer) +``` + + +```{r dataImport} + +# Create esetList +esetList <- sapply( + esets, simplify = FALSE, + USE.NAMES = TRUE, + function(eset_raw) { + if (!file.exists(eset_raw)) { + stop(paste0("Provided path '", eset_raw, "' is not a file.")) + } + eset <- readRDS(eset_raw) + } +) +pools <- sapply(esetList, function(eset) { + unique(eset$PoolName) +}) +names(esetList) <- unlist(pools) + +# Create qcData +pDataList <- lapply(esetList, function(eset) data.table(pData(eset))) +qcData <- rbindlist(pDataList, fill = TRUE) + +textVars <- "SampleName" +annotationVar <- "PoolName" + +if (!"SampleName" %in% names(qcData)) { + qcData[, SampleName := paste0(PoolName, "_", WellBC)] +} +qcData[, log10LibSize := round(log10(NumberOfInputReads))] +qcData[, (annotationVar) := lapply(.SD, as.factor), .SDcols = annotationVar] + + +colourList <- list() +Design_levels <- sort( + as.character(unique(qcData[, ..annotationVar][[1]])), + decreasing = TRUE +) + +if (length(Design_levels) == 1) { + colours <- c("#d6e0ff", "lightgrey") + names(colours) <- c(Design_levels, "Empty") + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +}else if (length(Design_levels) == 2) { + colours <- c("#d6e0ff", "#FF9999") + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else if (length(Design_levels) <= 20) { + + if (length(Design_levels) > 12) { + colours <- c( + brewer.pal(12, "Set3"), + brewer.pal((length(Design_levels) - 12), + "Pastel2") + ) + } else { + colours <- c(brewer.pal(length(Design_levels), "Set3")) + } + + names(colours) <- c(Design_levels) + colourList[[annotationVar]] <- list( + "colours" = colours, + "annotVar" = annotationVar, + "text" = textVars + ) +} else { + colours <- c("#d6e0ff") + names(colours) <- c("nonEmpty") + colourList[[annotVar]] <- list( + "colours" = colours, + "annotVar" = annotVar, + "text" = annotVar + ) +} +``` + +# Pool Description + +Per pool within this study, there are several pool layout plots shown, based on the + +* number of STAR input reads (= library size) + +* log10 transformed number of STAR input reads + +* number of detected UMIs + +* number of detected genes + +* number of chromosomal reads + +* percentage of ERCC + +* percentage of mitochondria + + +> The values for the different samples within each pool is expected to be comparable if the content of the different pools is equally diverse. + +```{r plateAnnotation, out.width = "100%",fig.width = 20, fig.height= 10} + +plateVars <- c("NumberOfInputReads", "log10LibSize", "NumberOfMappedReads", + "NumberOfChromReads", "NumberOfUMIs", "NumberOfGenes", + "pctMT", "pctERCC") + +breaksVars <- lapply( + plateVars, + function(var) { + computeBreaks(7, qcData[, ..var]) + } +) +names(breaksVars) <- plateVars + +for (pool in pools){ + cat("\n\n") + cat(paste0("## ", pool, " {.tabset} \n\n")) + poolData <- qcData[PoolName == pool] + lapply(plateVars, function(plateVar) { + cat("\n\n") + cat(sprintf("### %s {.unnumbered}", plateVar)) + cat("\n\n") + plateLayout( + poolData, valueVariable = plateVar, + textFontSize = 10, legendFontSize = 12, + plateName = pool, plot.title = "libSize - ", + legend.title = "libSize", breaks = breaksVars[[plateVar]] + ) + cat("\n\n") + }) + cat("\n\n") +} +``` + +
+ + +# Data Distributions + + +## Reads Distributions {.tabset} + +The 4 box plots below represent the distributions per pool of the different samples based on: + +* the number of STAR input reads + +* the number of STAR mapped reads + +* the percentage of STAR mapped reads + +* the number of detected genes + +> The distributions contribute to the QC metrics mentioned in Par 3. The higher these values, the better. +> The data range for the different plates is expected to be comparable if the content of the different plates is equally diverse. + + +### Number of Input Reads {.tabset .unnumbered} + +```{r settings_1} + +nColPlots = 1 +figHeight = 7 + +``` + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_input_plate, fig.height = figHeight} +ggplot( + qcData, + aes( + x = PoolName, + y = NumberOfInputReads, colour = PoolName + ) +) + geom_boxplot() + ylab("Number of Input Reads") + + ggtitle("Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + + +### Number of Mapped Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_mapped_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfMappedReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +#### pct Mapped Reads {.unnumbered} + +```{r boxplots_pctMapped_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = PctMappedReads, colour = PoolName) +) + + geom_boxplot() + + ylab("pct Mapped Reads") + + ggtitle("pct Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of Chromosomal Reads {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfChromReads, colour = PoolName) +) + geom_boxplot() + ylab("Number of Chromosomal Reads") + + ggtitle("Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### pct Chromosomal Reads {.unnumbered} + +```{r boxplots_pctChrom_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctChrom, colour = PoolName) +) + geom_boxplot() + ylab("pct Chromosomal Reads") + + ggtitle("pct Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### Number of UMIs {.tabset .unnumbered} + +#### Distribution {.tabset .unnumbered} + + +```{r boxplots_umi_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = NumberOfUMIs, colour = PoolName) +) + geom_boxplot() + ylab("Number of UMIs") + + ggtitle('Number of UMIs') + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) + +``` + +#### Density distribution {.unnumbered} + +```{r density_numberOfUMIs} + +## Pre-filtering data exploration +dt_plot <- melt( + qcData, + id.vars = c("SampleName", "PoolName", "WellID"), + measure.vars = c("NumberOfInputReads", "NumberOfMappedReads", "NumberOfUMIs") +) + +readsDensity_plot <- ggplot(dt_plot, aes(value)) +readsDensity_plot <- readsDensity_plot + + geom_density(aes(fill = variable), alpha=0.8) + + facet_grid(~ PoolName, scales = "free_x", space = "fixed", drop = TRUE) + + geom_vline( + xintercept = 5e5, + linetype = "dashed", + color = "steelblue3", size = 2 + ) + + annotate( + "text", + x = 3.5e5, y = 2e-6, label = "500k", + angle = 90, color = "steelblue3", size = 10 + ) + + geom_vline( + xintercept = 1.5e6, linetype = "dashed", + color = "forestgreen", size = 2 + ) + + annotate( + "text", x = 1.35e6, y = 2e-6, label = "1.5M", + angle = 90, color = "forestgreen", size = 10 + ) + + labs( + title = "Density plot", + subtitle = paste0( + "# Samples with NumberOfMappedReads > 1.5M: ", + length(which(qcData$NumberOfMappedReads > 1.5e6)), + "\n# Samples with NumberOfUMIs > 500k: ", + length(which(qcData$NumberOfUMIs > 5e5)) + ), + caption = paste0("# Total samples (after removing empty): ", nrow(qcData)), + x = "Count", + fill = "Variable" + ) + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 5), + axis.text.x = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + plot.subtitle = element_text(size = 17), + plot.caption = element_text(size = 15), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.title.y = element_blank() + ) +readsDensity_plot + +``` + +### Number of Genes {.tabset .unnumbered} + +#### Distribution {.unnumbered} + +```{r boxplots_genes_plate, fig.height = figHeight} +ggplot( + qcData, + aes(x = PoolName, y = NumberOfGenes, colour = PoolName) +) + + geom_boxplot() + ylab("Number of Genes") + + ggtitle("Number of Genes") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +## {.tabset .toc-ignore .unnumbered} + + +In addition, several plots are shown visualizing the efficiency of the reads-to-genes translation: + +* the number of input reads vs the number of mapped reads + +* the number of chromosomal reads vs the number of mapped reads + +* the number of mapped reads per UMI vs the number of mapped reads + +* the number of UNI vs the number of mapped reads + +* the number of mapped reads vs the number of genes + +* the number of chromosomal reads vs the number of genes + +* the number of mapped reads per UMI vs the number of genes + +### Mapping Efficiency {.tabset .unnumbered} + +#### Number of Input Reads {.unnumbered} + +```{r mapping_efficiency_1_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = NumberOfMappedReads, colour = PoolName) +) + + geom_point() + + xlab("Number of Input Reads") + + ylab("Number of Mapped Reads") + + ggtitle("Number of Mapped Reads vs Number of Input Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + +#### Number of Chromosomal Reads {.unnumbered} + +```{r mapping_efficiency_2_plate, fig.height = 7} + +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + xlab("Number of Chromosomal Reads") + ylab("Number of Mapped Reads") + + ggtitle("Number of Chromosomal Reads vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) + +``` + + +#### Number of UMI {.unnumbered} + +```{r mapping_efficiency_4_plate, fig.height = 7} + +ggplot( + qcData, + aes(x =NumberOfUMIs, y = NumberOfMappedReads, colour = PoolName) +) + geom_point() + + ylab("Number of Mapped Reads") + xlab("Number of UMIs ") + + ggtitle("Number of UMIs vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + +### Counting Efficiency {.tabset .unnumbered} + +#### Number of Mapped Reads {.unnumbered} + +```{r gene_efficiency_1_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfMappedReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Mapped Reads") + + ggtitle("Number of Genes vs Number of Mapped Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +#### Number of Chromosomal Reads {.unnumbered} + +```{r gene_efficiency_2_plate, fig.height = 7} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = NumberOfGenes, colour = PoolName) +) + geom_point() + + ylab("Number of Genes") + xlab("Number of Chromosomal Reads") + + ggtitle("Number of Genes vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + + + +## Sequencing Saturation {.tabset} + +The barplots below represent the sequencing saturation per sample as determined by STAR, split per pool. +The HT-RNAseq platform aims for shallow sequencing resulting in relatively low sequencing saturations of 10-20%. +In addition, the sequencing saturation vs the number of input reads is shown. + +### Sequencing Saturation {.unnumbered} + + + +```{r sequencingSaturation, fig.height = figHeight} + +ggplot( + qcData, + aes(x = WellID, y = SequencingSaturation, fill = PoolName) +) + geom_bar(stat = "identity", position = "dodge") + + xlab("Samples") + ggtitle("Sequencing Saturation per Sample") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(1, "lines"), + text = element_text(size = 10), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15), + axis.text.x = element_blank(), + axis.text.y = element_text(size = 15), + axis.ticks.x = element_blank() + ) +``` + +### Sequencing Saturation - Input Reads {.unnumbered} + + +```{r sequencingSaturation_inputReads, fig.height = figHeight} + + +ggplot( + qcData, + aes(x = NumberOfInputReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Input Reads") + + theme(strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) +``` + +### Sequencing Saturation - Mapped Reads {.unnumbered} + +```{r sequencingSaturation_mappedReads, fig.height = figHeight} +ggplot( + qcData, + aes(x = NumberOfChromReads, y = SequencingSaturation, colour = PoolName) +) + geom_point() + + ggtitle("Sequencing Saturation vs Number of Chromosomal Reads") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size=10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size=18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) +) +``` + +
+ +## Genomic Origin {.tabset} + +The 3 boxplots below represent, per pool, the distributions of the percentage of reads mapping to: + +* chromosomal regions + +* mitochondrial regions + +* ERCC spike-ins + +The 4th plot summarises the above results across samples per pool. + +The 5th plot shows the percentage of reads mapped to the transcriptome (as counted by STAR). This measurement serves as a proxy for the percentage of reads mapped to exons. + +> The percentage ERCC contributes to the QC metrics mentioned in Par 3. This value is ideally as low as possible (but non-zero to ensure the they have been spiked in) and comparable for the different pools. + + + + +### pctChrom {.tabset .unnumbered} + + +```{r genomicOrigin_chrom_plate, fig.height = figHeight} + +ggplot( + qcData, aes(x = PoolName, y = pctChrom, colour = PoolName) +) + + geom_boxplot() + + ggtitle("pctChrom") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### pctMT {.tabset .unnumbered} + +```{r genomicOrigin_mt_plate, fig.height = figHeight} + +ggplot( + qcData, + aes(x = PoolName, y = pctMT, colour = PoolName) +) + + geom_boxplot() + ggtitle("pctMT") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + +### pctERCC {.tabset .unnumbered} + + +```{r genomicOrigin_ercc_plate, fig.height = figHeight} +ggplot(qcData, aes(x = PoolName, y = pctERCC, colour = PoolName)) + + geom_boxplot() + + ggtitle("pctERCC") + + theme( + strip.text.x = element_text(size = 20), + panel.spacing = unit(2, "lines"), + text = element_text(size = 10), + axis.text.y = element_text(angle = 90, size = 14), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title.y = element_text(size = 15), + axis.text.x = element_blank(), + axis.ticks.x = element_blank() + ) +``` + + +### Genomic Summary {.tabset .unnumbered} + + + +```{r genomicOrigin_summary_plate} +meanPctChromMTData <- qcData[, .( + "pctChrom" = median(pctChrom), + "pctMT" = median(pctMT), + "pctERCC" = median(pctERCC) +), by = PoolName] +meanPctChromMTDataLong <- melt( + meanPctChromMTData, + id.vars = "PoolName", + measure.vars = c("pctChrom", "pctMT", "pctERCC"), + variable.name = "Origin", value.name = "pct" +) +ggplot( + meanPctChromMTDataLong, + aes(fill = Origin, y = pct, x = PoolName)) + + geom_bar(position = "stack", stat = "identity") + + ggtitle("Genomic Origin") + + theme( + text = element_text(size = 10), + axis.text = element_text(angle = 90, size = 15), + plot.title = element_text(size = 18), + legend.text = element_text(size = 15), + legend.title = element_text(size = 17), + axis.title = element_text(size = 15) + ) + +``` + + + +# Depletion {.tabset} + +
+```{r depletion} + + +for (eset_name in pools) { + cat("\n\n") + cat(paste0("## ", eset_name, " {.unnumbered}")) + cat("\n\n") + + eset <- esetList[[eset_name]] + average_reads <- sort(apply(exprs(eset), 1, mean), decreasing = TRUE) + plotData <- data.table( + ENSGID = names(average_reads), + av_count = average_reads + ) + + gen_descript <- data.table( + ENSGID = eset@featureData@data$gene_id, + Description = eset@featureData@data$GENENAME + ) + order_gen_descript <- gen_descript[ + match(plotData$ENSGID, gen_descript$ENSGID), + ] + + g <- ggplot( + plotData[c(1:100)], + aes(x = reorder(ENSGID, -av_count), y = av_count) + ) + geom_bar(stat = "identity") + + theme( + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 12), + axis.text.y = element_text(size = 12), + legend.text = element_text(size = 15), + legend.title = element_text(size = 15), + axis.title = element_text(size = 18), + plot.title = element_text(size = 20) + ) + ylab("Average Counts") + xlab("Genes") + + print(g) + + cat("\n\n") + cat("
") + cat("
") + + print(htmltools::tagList((DT::datatable(order_gen_descript[1:100, ])))) +} +``` +
+ + +
+
+
+
+ +# Glossary {.unnumbered} + + +## Read {.unlisted .unnumbered} + +A read is a oligonucleotide (a short RNA fragment) that has been sequenced. It consists of a fixed number of base pairs (bp) and therefore has a specific read length. + + + +## Input Read {.unlisted .unnumbered} + +Each read of the fastq file used as input to the STAR aligner is considered an input read. + + + +## Read With Valid Barcode {.unlisted .unnumbered} + +A read with a valid barcode is a read for which the barcode matches the white list of barcodes under the given restriction of the number of allowed mismatches. The number of reads with a valid barcode is lower or equal to the number of input reads. + + + +## Mapped Read {.unlisted .unnumbered} + +A read that has been aligned against the reference genome and for which one or more suitable matching locations have been found is a mapped read. Depending on the number of allowed mismatches this might or might not be be an exact match. The number of mapped reads is lower or equal to the number of reads with a valid barcode. + + + +## Uniquely Mapped Read {.unlisted .unnumbered} + +A read for which one and only one suitable matching location in the reference genome was found is an uniquely mapped read. The number of uniquely mapped reads is lower or equal to the number of mapped reads. + + + +## Counted Read {.unlisted .unnumbered} + +A mapped read will only be counted if it overlaps (1 nucleotide or more) with one and only one gene. The number of counted reads is lower or equal to the number of (uniquely) mapped reads. + + + +## UMIs {.unlisted .unnumbered} + +Unique molecular identifiers (UMI) are short sequences in order to uniquely tag each molecule in a sample library. Sequencing with UMIs allows bioinformatics software to filter out duplicate reads and PCR errors with a high level of accuracy and report unique reads. + +The reported UMIs is the number of UMIs among the set of reads that map to an unique gene, i.e the number of reads is deduplicated. + + + +## pctERCC {.unlisted .unnumbered} + +The percentage of reads mapping to the ERCC genes among the total number of **mapped** reads. + + + +## pctMT {.unlisted .unnumbered} + +The percentage of reads mapping to the MT genes among the total number of **mapped** reads. + + + +## Sequencing Saturation {.unlisted .unnumbered} + +The sequencing saturation is a measure of the fraction of library complexity. The inverse of one minus the sequencing saturation can be interpreted as the number of additional reads it would take to detect a new transcript. Consequently, a low sequencing saturation indicates a shallow sequencing in which a new transcript could be discovered with a few reads. + +
+
+
+
+ +
+![](OutputSTARsolo.png) +
+ +
+
diff --git a/target/nextflow/stats/combine_star_logs/.config.vsh.yaml b/target/nextflow/stats/combine_star_logs/.config.vsh.yaml new file mode 100644 index 00000000..76307262 --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/.config.vsh.yaml @@ -0,0 +1,257 @@ +name: "combine_star_logs" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Arguments" + arguments: + - type: "string" + name: "--barcodes" + description: "Barcodes responding to the respective log files.\n" + info: null + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_logs" + description: "Paths to the STAR log files (most frequently called Log.final.out)\n" + info: null + example: + - "Log.final.out" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--gene_summary_logs" + description: "Paths to the Summary.csv files from the STAR Solo output. Can be\ + \ found in\nthe 'Solo.out/Gene' folder relative to the root of the STAR output\ + \ directory. \n" + info: null + example: + - "Summary.txt" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--reads_per_gene_logs" + description: "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output" + description: "Tab-delimited file describing for each barcode (as the rows), the\ + \ metrics (as columns)\ngathered from the different input files. \n" + info: null + default: + - "starLogs.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test_data" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/combine_star_logs/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/stats/combine_star_logs" + executable: "target/nextflow/stats/combine_star_logs/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/stats/combine_star_logs/_viash.yaml b/target/nextflow/stats/combine_star_logs/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/stats/combine_star_logs/main.nf b/target/nextflow/stats/combine_star_logs/main.nf new file mode 100644 index 00000000..28794f93 --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/main.nf @@ -0,0 +1,4099 @@ +// combine_star_logs updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "combine_star_logs", + "namespace" : "stats", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--barcodes", + "description" : "Barcodes responding to the respective log files.\n", + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_logs", + "description" : "Paths to the STAR log files (most frequently called Log.final.out)\n", + "example" : [ + "Log.final.out" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--gene_summary_logs", + "description" : "Paths to the Summary.csv files from the STAR Solo output. Can be found in\nthe 'Solo.out/Gene' folder relative to the root of the STAR output directory. \n", + "example" : [ + "Summary.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--reads_per_gene_logs", + "description" : "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output", + "description" : "Tab-delimited file describing for each barcode (as the rows), the metrics (as columns)\ngathered from the different input files. \n", + "default" : [ + "starLogs.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "test_data" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pandas" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/stats/combine_star_logs/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/stats/combine_star_logs", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import logging +import pandas as pd +from itertools import batched, starmap + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'barcodes': $( if [ ! -z ${VIASH_PAR_BARCODES+x} ]; then echo "r'${VIASH_PAR_BARCODES//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'star_logs': $( if [ ! -z ${VIASH_PAR_STAR_LOGS+x} ]; then echo "r'${VIASH_PAR_STAR_LOGS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'gene_summary_logs': $( if [ ! -z ${VIASH_PAR_GENE_SUMMARY_LOGS+x} ]; then echo "r'${VIASH_PAR_GENE_SUMMARY_LOGS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'reads_per_gene_logs': $( if [ ! -z ${VIASH_PAR_READS_PER_GENE_LOGS+x} ]; then echo "r'${VIASH_PAR_READS_PER_GENE_LOGS//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + + +def handle_percentages(column_value): + # TODO: handle this more gracefully + if column_value: + return column_value.strip('%') + return column_value + +def star_log_to_dataframe(barcode: str, log_path) -> pd.DataFrame: + logger.info("Reading STAR log %s for barcode '%s'", log_path, barcode) + result = pd.read_table(log_path, sep=r"\\\\|\\\\t+", converters={"Value": handle_percentages}, + engine="python", header=None, skip_blank_lines=True, + skipinitialspace=True, names=["Category", "Value"], index_col=0, + skiprows=[0, 1, 2]) + logger.info("Read %d row(s) and %d column(s) from STAR logs at %s", + *result.shape, log_path) + return result + + +def summary_to_dataframe(barcode: str, summary_path) -> pd.DataFrame: + logger.info("Reading summary log %s for barcode %s", summary_path, barcode) + result = pd.read_table(summary_path, sep=",", + header=None, names=["Category", "Value"], + index_col=0, dtype=pd.StringDtype()) + logger.info("Read %d row(s) and %d column(s) from summary file at %s", + *result.shape, summary_path) + return result + + +def reads_per_gene_to_dataframe(barcode, read_per_gene_path) -> pd.DataFrame: + logger.info("Reading reads per gene file %s for barcode %s", read_per_gene_path, barcode) + result = pd.read_table(read_per_gene_path, skiprows=[0, 1, 2, 3], header=None, sep="\\\\t", + dtype={"geneID": pd.StringDtype(), + "Unstranded": pd.Int64Dtype(), + "posStrand": pd.Int64Dtype(), + "negStrand": pd.Int64Dtype()}, + index_col=0, names=["geneID", "Unstranded", "posStrand", "negStrand"]) + result = result[["Unstranded"]] # Do not use .loc here because we need a DataFrame, not a Series + df = pd.DataFrame({"Value": result.sum()}) + df = df.rename({"Unstranded": "NumberOfCountedReads"}, errors="raise") + df.index.name = "Category" + logger.info("Read %d row(s) and %d column(s) from reads per gene file at %s", + *df.shape, read_per_gene_path) + return df + +def star_log_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + """ + For a single star log (Log.final.out) in dataframe format, filter out the + entries that are not needed and format the labels for some metrics: + - Replace '%' with 'pect' in the labels. + - Remove labels ending with ':' + (mostly the section separators like 'MULTI-MAPPING READS:' and 'UNMAPPED READS:') + - Remove the metrics we do no need based on the following keywords: + Mapping speed, Average, Number of splices, per base, chimeric reads, average + + The dataframe provided as input must have an index with 1 level with the metric names. + """ + # Remove index values ending with ':' (rows like 'MULTI-MAPPING READS:','UNIQUE READS:') + logger.info("Filtering STAR logs for barcode %s. Starting with %d row(s) and %d column(s)", barcode, *df.shape) + to_keep = ~df.index.to_series().str.endswith(":") + # Remove index values where the values contain any of these substrings + regex_columns_to_remove = "Mapping speed|Average|Number of splices|per base|chimeric reads|average" + to_keep = to_keep & ~df.index.to_series().str.contains(regex_columns_to_remove, regex=True) + logger.info("Removed the following log entries for barcode '%s':\\\\n\\\\t%s", + barcode, + "\\\\n\\\\t".join(to_keep[~to_keep].index.to_list())) + result = df.loc[to_keep] + + # Replace % by pect, remove columns, use camel case and remove spaces + # You might be tempted to use .title() to make everything uppercase, + # but characters which are already uppercase should stay that way. + # (example: NumberOfUMIs and not NumberOfUmis) + result.index = result.index.str.replace("%", "pect")\\\\ + .str.replace(":", "")\\\\ + .str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(), regex=True)\\\\ + .str.replace(" ", "") + result = result.rename({"UniquelyMappedReadsNumber": "NumberOfMappedReads", + "UniquelyMappedReadsPect": "PctMappedReads"}, errors="raise") + logger.info("Done filtering STAR logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\\\\n\\\\t%s", + barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list())) + return result + + +def summary_remove_unwanted_entries_and_adjust_format(barcode, df: pd.DataFrame) -> pd.DataFrame: + logger.info("Filtering and formatting summary logs for barcode %s. " + "Starting with %d row(s) and %d column(s)", barcode, *df.shape) + columns_to_remove = ( + "Number of Reads", + "Q30 Bases in RNA read", + "Reads Mapped to Genome: Unique", + "Reads Mapped to Transcriptome: Unique Genes", + "Reads in Cells Mapped to Unique Genes", + "Median UMI per Cell", + "Median Genes per Cell", + "Reads Mapped to Genome: Unique+Multiple", + "Median Reads per Cell", + "Mean UMI per Cell", + "Mean Genes per Cell", + ) + + to_keep = ~df.index.isin(columns_to_remove) + logger.info("Removed the following summary entries for barcode '%s':\\\\n\\\\t%s", + barcode, + "\\\\n\\\\t".join(df.loc[~to_keep].index.to_list())) + result = df.loc[to_keep] + result.index = result.index.str.replace(r"(?:^|\\\\s).", lambda m:m.group(0).upper(), + regex=True).str.replace(" ", "") + to_rename = {"UMIsInCells": "NumberOfUMIs", + "TotalGenesDetected": "NumberOfGenes"} + try: + result = result.rename(to_rename, errors="raise") + except KeyError as e: + raise KeyError(f"Tried to rename log entries ({','.join(to_rename)}) in the summary " + f"log for barcode {barcode}, but an entry was not found in the file. " + "Make sure that you are using the correct version of STAR." + f"Available entries: {", ".join(result.index.to_list())}") from e + logger.info("Done filtering summary logs for barcode %s. Result has %d row(s) and %d column(s). " + "Found entries:\\\\n\\\\t%s", + barcode, *result.shape, "\\\\n\\\\t".join(result.index.to_list())) + return result + + +def join_dfs(df_list, barcodes) -> pd.DataFrame: + # Combine the dataframes together and add the barcodes as a level to the dataframe + # in order to make a 2-level index (first level the barcodes and second level the metrics). + result = pd.concat(dict(zip(barcodes, df_list)), names=["WellBC"]) + # Pivot the table by moving the metrics to the columns. Its added as an extra level, + # so we can just frop the 'Values' level that was already there + result = result.unstack(level="Category").droplevel(0, axis="columns") + return result + +def main(par): + logger.info("Component started.") + # Provide an overview of the parameters in the logs + parameters_str = [f'\\\\t{param}: {param_val}\\\\n' for param, param_val in par.items()] + logger.info("Parameters:\\\\n%s", "".join(parameters_str).rstrip()) + star_logs, gene_summary_logs, reads_per_gene_logs, barcodes = par["star_logs"], \\\\ + par["gene_summary_logs"], par["reads_per_gene_logs"], par["barcodes"] + number_of_inputs = tuple(len(i) for i in (star_logs, gene_summary_logs, + reads_per_gene_logs, barcodes)) + if len(set(number_of_inputs)) != 1: + raise ValueError("Expected the same number of inputs for 'star_logs' (%d), " + "'gene_summary_logs' (%d), 'reads_per_gene_logs' (%d) " + "and 'barcodes' (%d)." % number_of_inputs) + + logs_to_process = [ + (star_log_to_dataframe, star_log_remove_unwanted_entries_and_adjust_format, star_logs), + (summary_to_dataframe, summary_remove_unwanted_entries_and_adjust_format, gene_summary_logs), + (reads_per_gene_to_dataframe, None, reads_per_gene_logs), + ] + logger.info("Formatting the contents of the log files.") + all_logs_data = [] + for df_generator, formatter, data in logs_to_process: + data_as_df = list(starmap(df_generator, zip(barcodes, data))) + data_formatted = data_as_df + if formatter: + data_formatted = list(starmap(formatter, zip(barcodes, data_as_df))) + data_joined = join_dfs(data_formatted, barcodes) + all_logs_data.append(data_joined) + + logger.info("Joining entries across the different logs together.") + all_stats = pd.concat(all_logs_data, axis=1) + logger.info("Log statistics were gathered for the following barcodes: %s", + ", ".join(all_stats.index.to_list())) + dtypes = { + 'NumberOfInputReads': pd.UInt64Dtype(), + 'NumberOfMappedReads': pd.UInt64Dtype(), + 'PctMappedReads': pd.Float64Dtype(), + 'NumberOfReadsMappedToMultipleLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToMultipleLoci': pd.Float64Dtype(), + 'NumberOfReadsMappedToTooManyLoci': pd.UInt64Dtype(), + 'PectOfReadsMappedToTooManyLoci': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooManyMismatches': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooManyMismatches': pd.Float64Dtype(), + 'NumberOfReadsUnmappedTooShort': pd.UInt64Dtype(), + 'PectOfReadsUnmappedTooShort': pd.Float64Dtype(), + 'NumberOfReadsUnmappedOther': pd.UInt64Dtype(), + 'PectOfReadsUnmappedOther': pd.Float64Dtype(), + 'ReadsWithValidBarcodes': pd.Float64Dtype(), + 'SequencingSaturation': pd.Float64Dtype(), + 'Q30BasesInCB+UMI': pd.Float64Dtype(), + 'ReadsMappedToTranscriptome:Unique+MultipeGenes': pd.Float64Dtype(), + 'EstimatedNumberOfCells': pd.UInt64Dtype(), + 'FractionOfReadsInCells': pd.Float64Dtype(), + 'MeanReadsPerCell': pd.UInt64Dtype(), + 'NumberOfUMIs': pd.UInt64Dtype(), + 'NumberOfGenes': pd.UInt64Dtype(), + 'NumberOfCountedReads': pd.UInt64Dtype(), + } + all_stats = all_stats.astype(dtypes) + # batched() is used here to print a limited amount of columnns at a time + # to make sure that they are all displayed (pandas might limit the view for readability) + logger.info("Summary of final output:\\\\n%s\\\\n", + "\\\\n".join(repr(all_stats.loc[:,columns].describe()) + for columns in batched(all_stats.columns, 3))) + logger.info("Writing output to %s", par["output"]) + all_stats.reset_index("WellBC").to_csv(par["output"], sep="\\\\t", header=True, + index=False, float_format='%g') + logger.info("Finished %s.", meta["name"]) + +if __name__ == "__main__": + main(par) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/stats/combine_star_logs", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/stats/combine_star_logs/nextflow.config b/target/nextflow/stats/combine_star_logs/nextflow.config new file mode 100644 index 00000000..e2e43d51 --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'stats/combine_star_logs' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/stats/combine_star_logs/nextflow_labels.config b/target/nextflow/stats/combine_star_logs/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/stats/combine_star_logs/nextflow_schema.json b/target/nextflow/stats/combine_star_logs/nextflow_schema.json new file mode 100644 index 00000000..a65b913d --- /dev/null +++ b/target/nextflow/stats/combine_star_logs/nextflow_schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "combine_star_logs", + "description": "No description", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "barcodes": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Barcodes responding to the respective log files.\n", + "help_text": "Type: `string`, multiple: `True`, required. " + }, + "star_logs": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the STAR log files (most frequently called Log.final.out)\n", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"Log.final.out\"]`. " + }, + "gene_summary_logs": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the Summary.csv files from the STAR Solo output", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`, example: `[\"Summary.txt\"]`. " + }, + "reads_per_gene_logs": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Paths to the 'ReadsPerGene.out.tab' files as output by STAR.\n", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "output": { + "type": "string", + "format": "path", + "description": "Tab-delimited file describing for each barcode (as the rows), the metrics (as columns)\ngathered from the different input files", + "help_text": "Type: `file`, multiple: `False`, default: `\"starLogs.txt\"`, direction: `output`. ", + "default": "starLogs.txt" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/stats/generate_pool_statistics/.config.vsh.yaml b/target/nextflow/stats/generate_pool_statistics/.config.vsh.yaml new file mode 100644 index 00000000..5dd64068 --- /dev/null +++ b/target/nextflow/stats/generate_pool_statistics/.config.vsh.yaml @@ -0,0 +1,241 @@ +name: "generate_pool_statistics" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--nrReadsNrGenesPerChrom" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads\n\ + column) and the number of genes on that chromosome that had at least one\nread\ + \ mapped to it (NumberOfGenes).\n" + info: null + default: + - "processedBamFile_well1.tsv" + - "processedBamfile_well2.tsv" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChromPool" + description: "Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom\ + \ files. Describes\nper chromosome (as columns) the number of reads, as well\ + \ as the total number \nof reads per cell barcode and the percentage of nuclear,\ + \ ERCC and mitochondrial\nreads.\n" + info: null + example: + - "nrReadsNrGenesPerChrom.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/generate_pool_statistics/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/stats/generate_pool_statistics" + executable: "target/nextflow/stats/generate_pool_statistics/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/stats/generate_pool_statistics/_viash.yaml b/target/nextflow/stats/generate_pool_statistics/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/stats/generate_pool_statistics/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/stats/generate_pool_statistics/main.nf b/target/nextflow/stats/generate_pool_statistics/main.nf new file mode 100644 index 00000000..9adce11d --- /dev/null +++ b/target/nextflow/stats/generate_pool_statistics/main.nf @@ -0,0 +1,3954 @@ +// generate_pool_statistics updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) +// * Marijke Van Moerbeke (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "generate_pool_statistics", + "namespace" : "stats", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "contributor" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom", + "description" : "Path to an output file that contains a .tsv formatted table describing\nper chromosome the number of reads that were mapped to that chromosome (NumberOfReads\ncolumn) and the number of genes on that chromosome that had at least one\nread mapped to it (NumberOfGenes).\n", + "default" : [ + "processedBamFile_well1.tsv", + "processedBamfile_well2.tsv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChromPool", + "description" : "Pivot table in tsv format of the combined input nrReadsNrGenesPerChrom files. Describes\nper chromosome (as columns) the number of reads, as well as the total number \nof reads per cell barcode and the percentage of nuclear, ERCC and mitochondrial\nreads.\n", + "example" : [ + "nrReadsNrGenesPerChrom.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pandas" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/stats/generate_pool_statistics/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/stats/generate_pool_statistics", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import pandas as pd +from pathlib import Path +import re + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'nrReadsNrGenesPerChrom': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROM//\\'/\\'\\"\\'\\"r\\'}'.split(';')"; else echo None; fi ), + 'nrReadsNrGenesPerChromPool': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROMPOOL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END + +INDEX_COL = ["WellBC", "WellID"] + +if __name__ == "__main__": + ######### + # nrReadsNrGenesPerChrom file + ######### + nr_reads_nr_genes_wells = [] + par["nrReadsNrGenesPerChrom"] = list(map(Path, par["nrReadsNrGenesPerChrom"])) + for nr_reads_nr_genes_file in par["nrReadsNrGenesPerChrom"]: + nr_reads_nr_gene_well = pd.read_csv(nr_reads_nr_genes_file, + header=0, delimiter="\\\\t", + dtype={"WellBC": pd.StringDtype(), + "WellID": pd.StringDtype(), + "Chr": pd.StringDtype(), + "NumberOfReads": pd.UInt64Dtype(), + "NumberOfGenes": pd.UInt64Dtype()}) + if nr_reads_nr_gene_well.empty: + raise ValueError(f"{nr_reads_nr_genes_file.name} does not seem to contain any information!") + nr_reads_nr_genes_wells.append(nr_reads_nr_gene_well) + nr_reads_nr_genes_pool = pd.concat(nr_reads_nr_genes_wells, ignore_index=True,) + total_nr_reads_per_chromosome = nr_reads_nr_genes_pool.pivot_table(index=INDEX_COL, columns="Chr", + values=["NumberOfReads"], fill_value=0, + aggfunc="sum").droplevel(0, axis=1) + total_nr_reads_per_chromosome.columns.name = None + # Remove scaffolds/chromosomes with no counts + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.loc[:, (total_nr_reads_per_chromosome != 0).any(axis=0)] + ##### Total number of genes from all chromosomes + total_nr_genes = nr_reads_nr_genes_pool.loc[:, INDEX_COL + ['NumberOfGenes']].groupby(["WellBC", "WellID"]).sum() + + ##### Total counts across (irrespective of chromosome) + total_sum_of_reads = total_nr_reads_per_chromosome.sum(numeric_only=True, axis=1) + + ##### Logic to split up chromosome per type + chromosome_names = total_nr_reads_per_chromosome.columns.to_list() + chr_regex = re.compile(r"^(chr)?\\\\d+") + matching_chromosomes = [chr_name for chr_name + in chromosome_names + if chr_regex.match(chr_name)] + sex_chromosome_names = ["X", "Y"] + mitochondrial_chr_name = "MT" + # This is logic from the original HT pipeline, + # only when all of the matched chromosomes start with "chr", the mitochonrial, X and Y + # chromosomes should also start with 'chr' + if all(chr_name.startswith("chr") for chr_name in matching_chromosomes): + sex_chromosome_names += ["chrX", "chrY"] + mitochondrial_chr_name = "chrM" + + ###### Counts for mitochondrial reads + try: + mitochondrial_reads = total_nr_reads_per_chromosome.loc[:,mitochondrial_chr_name] + except KeyError: + mitochondrial_reads = 0 + percentage_mitochondrial_reads = round(mitochondrial_reads / total_sum_of_reads * 100, 2) + + ###### Counts for ERCC reads + total_ercc_reads = total_nr_reads_per_chromosome.filter(regex=r"^ERCC").sum(axis=1) + percentage_ercc_reads = round(total_ercc_reads / total_sum_of_reads * 100, 2) + + ###### Counts for nuclear chromosomes + total_chromosomal_reads = total_nr_reads_per_chromosome.loc[:,matching_chromosomes].sum(axis=1) + percentage_chromosomal_reads = round(total_chromosomal_reads / total_sum_of_reads * 100, 2) + + cols_to_add = { + "pctChrom": percentage_chromosomal_reads, + "pctMT": percentage_mitochondrial_reads, + "pctERCC": percentage_ercc_reads, + "SumReads": total_sum_of_reads, + "NumberOfGenes": total_nr_genes, + "NumberOfERCCReads": total_ercc_reads, + "NumberOfChromReads": total_chromosomal_reads, + "NumberOfMTReads": mitochondrial_reads, + } + total_nr_reads_per_chromosome = total_nr_reads_per_chromosome.assign( + **cols_to_add + ) + + total_nr_reads_per_chromosome.reset_index(names=INDEX_COL)\\\\ + .to_csv(par["nrReadsNrGenesPerChromPool"], sep="\\\\t", + header=True, index=False, float_format="%g", + columns=tuple(INDEX_COL) + tuple(chromosome_names) + tuple(cols_to_add.keys()) + ) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/stats/generate_pool_statistics", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/stats/generate_pool_statistics/nextflow.config b/target/nextflow/stats/generate_pool_statistics/nextflow.config new file mode 100644 index 00000000..d4404941 --- /dev/null +++ b/target/nextflow/stats/generate_pool_statistics/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'stats/generate_pool_statistics' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/stats/generate_pool_statistics/nextflow_labels.config b/target/nextflow/stats/generate_pool_statistics/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/stats/generate_pool_statistics/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/stats/generate_pool_statistics/nextflow_schema.json b/target/nextflow/stats/generate_pool_statistics/nextflow_schema.json new file mode 100644 index 00000000..e69de29b diff --git a/target/nextflow/stats/generate_well_statistics/.config.vsh.yaml b/target/nextflow/stats/generate_well_statistics/.config.vsh.yaml new file mode 100644 index 00000000..7c201ac7 --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/.config.vsh.yaml @@ -0,0 +1,313 @@ +name: "generate_well_statistics" +namespace: "stats" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "author" + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + description: "The .bam file as returned by the mapping tool STAR." + info: null + example: + - "input.bam" + must_exist: true + create_parent: true + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--barcode" + description: "The barcode for the well that is being processed. Is only used to\ + \ add a metadata\ncolumn to all output files.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--well_id" + description: "ID of this well. Only used to add a metadata column to the output\ + \ files.\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--processedBAMFile" + description: "Path to a .tsv file listing, per read in the BAM file,\nthe value\ + \ for the \"CB\", \"UX\", \"GX\" and \"GN\" tag, together with the\nchromsome\ + \ to which the read was mapped to.\n" + info: null + default: + - "processedBamFile.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per chromosome the number of reads that were mapped to that chromosome (NumberOfReads\n\ + column) and the number of genes on that chromosome that had at least one\nread\ + \ mapped to it (NumberOfGenes).\n" + info: null + default: + - "nrReadsNrGenesPerChrom.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrUMIsPerCB" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads).\n" + info: null + default: + - "nrReadsNrUMIsPerCB.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--umiFreqTop" + description: "Path to an output file that contains a .tsv formatted table describing\n\ + per UMI (column UB) the frequency at which they occur in the reads (column\n\ + N). Only the top 100 UMIs are included.\n" + info: null + default: + - "umiFreqTop100.txt" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--threads" + description: "Number of threads to use for decompressing BAM files.\n" + info: null + default: + - 1 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Generate summary statistics from BAM files generated by STAR solo." +test_resources: +- type: "python_script" + path: "test.py" + is_executable: true +- type: "file" + path: "test.sam" +- type: "file" + path: "empty.sam" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.13-trixie" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pysam" + - "pandas" + upgrade: true + test_setup: + - type: "python" + user: false + packages: + - "viashpy" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/stats/generate_well_statistics/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/stats/generate_well_statistics" + executable: "target/nextflow/stats/generate_well_statistics/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/stats/generate_well_statistics/_viash.yaml b/target/nextflow/stats/generate_well_statistics/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/stats/generate_well_statistics/main.nf b/target/nextflow/stats/generate_well_statistics/main.nf new file mode 100644 index 00000000..564639e2 --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/main.nf @@ -0,0 +1,4027 @@ +// generate_well_statistics updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (author, maintainer) +// * Marijke Van Moerbeke (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "generate_well_statistics", + "namespace" : "stats", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "author", + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "contributor" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "The .bam file as returned by the mapping tool STAR.", + "example" : [ + "input.bam" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--barcode", + "description" : "The barcode for the well that is being processed. Is only used to add a metadata\ncolumn to all output files.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--well_id", + "description" : "ID of this well. Only used to add a metadata column to the output files.\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--processedBAMFile", + "description" : "Path to a .tsv file listing, per read in the BAM file,\nthe value for the \\"CB\\", \\"UX\\", \\"GX\\" and \\"GN\\" tag, together with the\nchromsome to which the read was mapped to.\n", + "default" : [ + "processedBamFile.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom", + "description" : "Path to an output file that contains a .tsv formatted table describing\nper chromosome the number of reads that were mapped to that chromosome (NumberOfReads\ncolumn) and the number of genes on that chromosome that had at least one\nread mapped to it (NumberOfGenes).\n", + "default" : [ + "nrReadsNrGenesPerChrom.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrUMIsPerCB", + "description" : "Path to an output file that contains a .tsv formatted table describing\nper barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads).\n", + "default" : [ + "nrReadsNrUMIsPerCB.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--umiFreqTop", + "description" : "Path to an output file that contains a .tsv formatted table describing\nper UMI (column UB) the frequency at which they occur in the reads (column\nN). Only the top 100 UMIs are included.\n", + "default" : [ + "umiFreqTop100.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--threads", + "description" : "Number of threads to use for decompressing BAM files.\n", + "default" : [ + 1 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Generate summary statistics from BAM files generated by STAR solo.", + "test_resources" : [ + { + "type" : "python_script", + "path" : "test.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "test.sam" + }, + { + "type" : "file", + "path" : "empty.sam" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.13-trixie", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pysam", + "pandas" + ], + "upgrade" : true + } + ], + "test_setup" : [ + { + "type" : "python", + "user" : false, + "packages" : [ + "viashpy" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/stats/generate_well_statistics/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/stats/generate_well_statistics", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import pysam +import pandas as pd +import logging + +### VIASH START +# The following code has been auto-generated by Viash. +par = { + 'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'barcode': $( if [ ! -z ${VIASH_PAR_BARCODE+x} ]; then echo "r'${VIASH_PAR_BARCODE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'well_id': $( if [ ! -z ${VIASH_PAR_WELL_ID+x} ]; then echo "r'${VIASH_PAR_WELL_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'processedBAMFile': $( if [ ! -z ${VIASH_PAR_PROCESSEDBAMFILE+x} ]; then echo "r'${VIASH_PAR_PROCESSEDBAMFILE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'nrReadsNrGenesPerChrom': $( if [ ! -z ${VIASH_PAR_NRREADSNRGENESPERCHROM+x} ]; then echo "r'${VIASH_PAR_NRREADSNRGENESPERCHROM//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'nrReadsNrUMIsPerCB': $( if [ ! -z ${VIASH_PAR_NRREADSNRUMISPERCB+x} ]; then echo "r'${VIASH_PAR_NRREADSNRUMISPERCB//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'umiFreqTop': $( if [ ! -z ${VIASH_PAR_UMIFREQTOP+x} ]; then echo "r'${VIASH_PAR_UMIFREQTOP//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'threads': $( if [ ! -z ${VIASH_PAR_THREADS+x} ]; then echo "int(r'${VIASH_PAR_THREADS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +### VIASH END +logger = logging.getLogger() +console_handler = logging.StreamHandler() +logger.addHandler(console_handler) +logger.setLevel(logging.DEBUG) + +if __name__ == "__main__": + logger.info("Component started.") + parameters_str = [f'\\\\t{param}: {param_val}\\\\n' for param, param_val in par.items()] + logger.info("Parameters:\\\\n%s", "".join(parameters_str).rstrip()) + logger.info("Opening '%s'", par["input"]) + samfile = pysam.AlignmentFile(par["input"], "rb", threads=par["threads"]) + all_tags = [] + index = [] + tags_selection = ("CB", "UB", "GX", "GN") + for aligned_segment in samfile: + tags = dict(aligned_segment.get_tags()) + all_tags.append(tags) + reference_name = aligned_segment.reference_name + index.append("*" if not reference_name else reference_name) + if not index: + # Workaround for https://github.com/pandas-dev/pandas/issues/58594 + tag_dataframe = pd.DataFrame([], index=[], columns=tags_selection) + else: + tag_dataframe = pd.DataFrame.from_records(all_tags, index=index, + columns=tags_selection) + tag_dataframe_to_write = tag_dataframe.copy() + logger.info("Done reading BAM file. Found %i entries", tag_dataframe.shape[0]) + tag_dataframe.assign(WellBC=par["barcode"], WellID=par["well_id"])\\\\ + .reset_index(names="Chr")\\\\ + .to_csv(par["processedBAMFile"], sep="\\\\t", na_rep="", + header=True, index=False, + columns=("WellBC", "WellID", "Chr") + tags_selection) + logger.info("Constructing of dataframe done.") + # Number of genes that had a read mapped to them per chromosome, + # and the number of reads mapped to those genes per chromosome. + nr_reads_nr_genes = tag_dataframe.dropna(subset=["GX"]).groupby(level=0).agg( + NumberOfReads=pd.NamedAgg("GX", aggfunc="size"), + NumberOfGenes=pd.NamedAgg(column="GX", aggfunc="nunique") + ) + nr_reads_nr_genes = nr_reads_nr_genes.reindex(samfile.header.references, fill_value=0) + logger.info("Done calculating number of reads per gene and per chromesome. Writing to %s", + par['nrReadsNrGenesPerChrom']) + nr_reads_nr_genes.reset_index(names="Chr").assign(WellBC=par["barcode"], WellID=par["well_id"])\\\\ + .to_csv(par["nrReadsNrGenesPerChrom"], sep="\\\\t", + header=True, index=False, + columns=("WellBC", "WellID", "Chr", "NumberOfReads", "NumberOfGenes")) + + # Number of reads mapped to the reference, grouped by UMI + nr_read_per_umi = tag_dataframe.groupby('UB').size()\\\\ + .drop("", errors="ignore").sort_values(ascending=False).head(100) + nr_read_per_umi_df = nr_read_per_umi.to_frame(name="N") + logger.info("Done calculating number of mapped reads per UMI, writing to %s", par["umiFreqTop"]) + nr_read_per_umi_df.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="UB")\\\\ + .to_csv(par["umiFreqTop"], header=True, sep="\\\\t", + index=False, columns=("WellBC", "WellID", "UB", "N")) + + # Total number of mapped reads and total number of UMIs (not grouped per chromosome) + nr_reads_and_umi_per_barcode = tag_dataframe.groupby(by="CB").agg( + NumberOfReads=pd.NamedAgg("CB", "size"), + nrUMIs=pd.NamedAgg("UB", "nunique") + ) + logger.info("Done calculating number of mapped reads and number of UMIs per Cell Barcode, writing to %s", + par["nrReadsNrUMIsPerCB"]) + nr_reads_and_umi_per_barcode.assign(WellBC=par["barcode"], WellID=par["well_id"]).reset_index(names="CB")\\\\ + .to_csv(par["nrReadsNrUMIsPerCB"], sep="\\\\t", header=True, + index=False, columns=("WellBC", "WellID", "CB", "NumberOfReads", "nrUMIs")) + logger.info("Finished!") +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/stats/generate_well_statistics", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/stats/generate_well_statistics/nextflow.config b/target/nextflow/stats/generate_well_statistics/nextflow.config new file mode 100644 index 00000000..6f25b94d --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'stats/generate_well_statistics' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Generate summary statistics from BAM files generated by STAR solo.' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/stats/generate_well_statistics/nextflow_labels.config b/target/nextflow/stats/generate_well_statistics/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/stats/generate_well_statistics/nextflow_schema.json b/target/nextflow/stats/generate_well_statistics/nextflow_schema.json new file mode 100644 index 00000000..3df2dae3 --- /dev/null +++ b/target/nextflow/stats/generate_well_statistics/nextflow_schema.json @@ -0,0 +1,85 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "generate_well_statistics", + "description": "Generate summary statistics from BAM files generated by STAR solo.", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "description": "The .bam file as returned by the mapping tool STAR.", + "help_text": "Type: `file`, multiple: `False`, direction: `input`, example: `\"input.bam\"`. " + }, + "barcode": { + "type": "string", + "description": "The barcode for the well that is being processed", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "well_id": { + "type": "string", + "description": "ID of this well", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "processedBAMFile": { + "type": "string", + "format": "path", + "description": "Path to a .tsv file listing, per read in the BAM file,\nthe value for the \"CB\", \"UX\", \"GX\" and \"GN\" tag, together with the\nchromsome to which the read was mapped to.\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"processedBamFile.txt\"`, direction: `output`. ", + "default": "processedBamFile.txt" + }, + "nrReadsNrGenesPerChrom": { + "type": "string", + "format": "path", + "description": "Path to an output file that contains a .tsv formatted table describing\nper chromosome the number of reads that were mapped to that chromosome (NumberOfReads\ncolumn) and the number of genes on that chromosome that had at least one\nread mapped to it (NumberOfGenes).\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"nrReadsNrGenesPerChrom.txt\"`, direction: `output`. ", + "default": "nrReadsNrGenesPerChrom.txt" + }, + "nrReadsNrUMIsPerCB": { + "type": "string", + "format": "path", + "description": "Path to an output file that contains a .tsv formatted table describing\nper barcode the number of UMI's (nrUMIs) and the total number of reads (NumberOfReads).\n", + "help_text": "Type: `file`, multiple: `False`, default: `\"nrReadsNrUMIsPerCB.txt\"`, direction: `output`. ", + "default": "nrReadsNrUMIsPerCB.txt" + }, + "umiFreqTop": { + "type": "string", + "format": "path", + "description": "Path to an output file that contains a .tsv formatted table describing\nper UMI (column UB) the frequency at which they occur in the reads (column\nN)", + "help_text": "Type: `file`, multiple: `False`, default: `\"umiFreqTop100.txt\"`, direction: `output`. ", + "default": "umiFreqTop100.txt" + }, + "threads": { + "type": "integer", + "description": "Number of threads to use for decompressing BAM files.\n", + "help_text": "Type: `integer`, multiple: `False`, default: `1`. ", + "default": 1 + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/utils/concatRuns/.config.vsh.yaml b/target/nextflow/utils/concatRuns/.config.vsh.yaml new file mode 100644 index 00000000..4915542b --- /dev/null +++ b/target/nextflow/utils/concatRuns/.config.vsh.yaml @@ -0,0 +1,215 @@ +name: "concatRuns" +namespace: "utils" +version: "updatecraftbox" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input_r1" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "string" + name: "--sample_id" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_r1" + description: "Path to read 1 fastq/fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output_r2" + description: "Path to read 2 fastq/fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: true + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Concatenate well FASTQ files from different runs in order to increase\ + \ sequencing depth.\n" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +dependencies: +- name: "concat_text" + repository: + type: "vsh" + repo: "craftbox" + tag: "v0.3.0" +repositories: +- type: "vsh" + name: "cb" + repo: "craftbox" + tag: "v0.3.0" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/utils/concatRuns/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/utils/concatRuns" + executable: "target/nextflow/utils/concatRuns/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" + dependencies: + - "target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/utils/concatRuns/_viash.yaml b/target/nextflow/utils/concatRuns/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/utils/concatRuns/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/utils/concatRuns/main.nf b/target/nextflow/utils/concatRuns/main.nf new file mode 100644 index 00000000..f2b93eed --- /dev/null +++ b/target/nextflow/utils/concatRuns/main.nf @@ -0,0 +1,3544 @@ +// concatRuns updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "concatRuns", + "namespace" : "utils", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_r1", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_id", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_r1", + "description" : "Path to read 1 fastq/fasta file", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_r2", + "description" : "Path to read 2 fastq/fasta file", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Concatenate well FASTQ files from different runs in order to increase sequencing depth.\n", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "dependencies" : [ + { + "name" : "concat_text", + "repository" : { + "type" : "vsh", + "repo" : "craftbox", + "tag" : "v0.3.0" + } + } + ], + "repositories" : [ + { + "type" : "vsh", + "name" : "cb", + "repo" : "craftbox", + "tag" : "v0.3.0" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/utils/concatRuns/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/utils/concatRuns", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { concat_text } from "${meta.root_dir}/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + + take: + input_ch + + main: + // Count the number of input events per sample + // Results from events with the same sample ID need to be concatenated. + event_counts_ch = input_ch + | map {id, state -> + def new_state = state + ["event_id": id] + def new_event = [state.sample_id, new_state] + return new_event + } + | groupTuple(by: 0) + | flatMap { id, states -> + def orig_event_ids = states.collect{it.event_id} + def new_events = orig_event_ids.collect{ orig_event_id -> + [orig_event_id, ["n_events": states.size()]] + } + return new_events + } + + + // The number of events per sample needs is passed number to `groupTuple()` + // so that it can emit the sample as soon as it is ready. This makes sure + // that the samples are processed asynchronously. + output_ch = input_ch.join(event_counts_ch) + | flatMap {id, state_demultiplex, state_event_counts -> + assert state_demultiplex.input_r1.size() == state_demultiplex.input_r2.size(), + "Expected output from well demultiplexing to contain equal amount or forward and reverse FASTQ files." + def new_states = [state_demultiplex.input_r1, state_demultiplex.input_r2].transpose().collect{ fastq_files -> + def (r1_file, r2_file) = fastq_files + def regex = ~/^(\w+)_R[12]{1}_001\.fastq(\.gz)?$/ + def parsed_file_name = r1_file.name =~ regex + def parsed_file_name_r2 = r2_file.name =~ regex + def well_id = parsed_file_name[0][1] + def well_id_r2 = parsed_file_name_r2[0][1] + + assert (well_id.length() != 0) && (well_id == well_id_r2) + def new_state = state_demultiplex + [ + "input_r1": r1_file, + "input_r2": r2_file, + "event_id": id, + ] + def group_settings = groupKey("${state_demultiplex.sample_id}_${well_id}", state_event_counts.n_events) + return [group_settings, new_state] + + } + return new_states + } + | groupTuple(by: 0, sort: "hash", remainder: true) + | map {group_settings, sample_states -> + def input_r1 = sample_states.collect{it.input_r1}.flatten() + def input_r2 = sample_states.collect{it.input_r2}.flatten() + def event_ids = sample_states.collect{it.event_id} + def sample_id_list = sample_states.collect{it.sample_id}.unique() + assert sample_id_list.size() == 1 + def sample_id = sample_id_list[0] + assert input_r1.size() == input_r2.size() + + def new_state = [ + "input_r1": input_r1, + "input_r2": input_r2, + "event_id": event_ids, + "sample_id": sample_id, + ] + return [group_settings.target, new_state] + } + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_samples_r1", + runIf: {id, state -> state.input_r1.size() > 1}, + fromState: { id, state -> + def output_file_name = state.input_r1[0].name + [ + input: state.input_r1, + gzip_output: false, + output: output_file_name + ] + }, + toState: { id, result, state -> + def newState = state + [ input_r1: [ result.output ] ] + return newState + } + ) + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_samples_r2", + runIf: {id, state -> state.input_r2.size() > 1}, + fromState: { id, state -> + def output_file_name = state.input_r2[0].name + [ + input: state.input_r2, + gzip_output: false, + output: output_file_name + ] + }, + toState: { id, result, state -> + def newState = state + [ input_r2: [ result.output ] ] + return newState + } + ) + | map {id, state -> + def new_state = [state.sample_id, state] + return new_state + } + | groupTuple(by: 0, sort: 'hash') + | map {id, states -> + def new_state = [ + "input_r1": states.collect{it.input_r1}.flatten(), + "input_r2": states.collect{it.input_r2}.flatten(), + "_meta": ["join_id": states[0].event_id[0]] + ] + return [id, new_state] + } + | setState( + [ + "output_r1": "input_r1", + "output_r2": "input_r2", + "_meta": "_meta" + ] + ) + + emit: + output_ch + +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/utils/concatRuns/nextflow.config b/target/nextflow/utils/concatRuns/nextflow.config new file mode 100644 index 00000000..3cb595fc --- /dev/null +++ b/target/nextflow/utils/concatRuns/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'utils/concatRuns' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Concatenate well FASTQ files from different runs in order to increase sequencing depth.\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/utils/concatRuns/nextflow_labels.config b/target/nextflow/utils/concatRuns/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/utils/concatRuns/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/utils/concatRuns/nextflow_schema.json b/target/nextflow/utils/concatRuns/nextflow_schema.json new file mode 100644 index 00000000..9fb8392c --- /dev/null +++ b/target/nextflow/utils/concatRuns/nextflow_schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "concatRuns", + "description": "Concatenate well FASTQ files from different runs in order to increase sequencing depth.\n", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "sample_id": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "output_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to read 1 fastq/fasta file", + "help_text": "Type: `file`, multiple: `True`, default: `\"$id.$key.output_r1_*\"`, direction: `output`. ", + "default": "$id.$key.output_r1_*" + }, + "output_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Path to read 2 fastq/fasta file", + "help_text": "Type: `file`, multiple: `True`, default: `\"$id.$key.output_r2_*\"`, direction: `output`. ", + "default": "$id.$key.output_r2_*" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/utils/listInputDir/.config.vsh.yaml b/target/nextflow/utils/listInputDir/.config.vsh.yaml new file mode 100644 index 00000000..3e9ff2fc --- /dev/null +++ b/target/nextflow/utils/listInputDir/.config.vsh.yaml @@ -0,0 +1,222 @@ +name: "listInputDir" +namespace: "utils" +version: "updatecraftbox" +argument_groups: +- name: "Arguments" + arguments: + - type: "file" + name: "--input" + alternatives: + - "-i" + description: "Path to the directory containing fastq files" + info: null + example: + - "fastq_dir" + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pools" + description: "Pool names to include. By default all pools are selected for analysis." + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--r1_output" + description: "Path to read 1 fastq/fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--r2_output" + description: "Path to read 2 fastq/fasta file" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--lane" + description: "Lane nr" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sample" + description: "Sample nr" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sample_id" + description: "Sample name" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "List the contents of a directory and parse contained fastq files" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/utils/listInputDir/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/utils/listInputDir" + executable: "target/nextflow/utils/listInputDir/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/utils/listInputDir/_viash.yaml b/target/nextflow/utils/listInputDir/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/utils/listInputDir/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/utils/listInputDir/main.nf b/target/nextflow/utils/listInputDir/main.nf new file mode 100644 index 00000000..9815f969 --- /dev/null +++ b/target/nextflow/utils/listInputDir/main.nf @@ -0,0 +1,3494 @@ +// listInputDir updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "listInputDir", + "namespace" : "utils", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "alternatives" : [ + "-i" + ], + "description" : "Path to the directory containing fastq files", + "example" : [ + "fastq_dir" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pools", + "description" : "Pool names to include. By default all pools are selected for analysis.", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--r1_output", + "description" : "Path to read 1 fastq/fasta file", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--r2_output", + "description" : "Path to read 2 fastq/fasta file", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--lane", + "description" : "Lane nr", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample", + "description" : "Sample nr", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_id", + "description" : "Sample name", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "List the contents of a directory and parse contained fastq files", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/utils/listInputDir/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/utils/listInputDir", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + + take: in_ + + main: + + out_ = in_ + | flatMap{ id, state -> + println "Looking for fastq files in ${state.input}" + def allFastqs = state.input + .listFiles() + .findAll{ + it.isFile() && + it.name ==~ /^.+\.fastq.gz$|^.+\.fastq$|^.+\.fasta$/ + } + println "Found ${allFastqs.size()} fastq/fasta files in ${state.input}" + assert allFastqs.size() > 0: "No fastq/fasta files found" + + println("Extracting information from fastq/fasta filenames") + def processed_fastqs = allFastqs.collect { f -> + def regex = ~/^(\S+)_S(\d+)_(L(\d+)_)?R(\d)_(\d+)\.fast[qa](\.gz)?$/ + def validFastq = f.name ==~ regex + + assert validFastq: "${f} does not match the regex ${regex}" + + def parsedFastq = f.name =~ regex + def lane = parsedFastq[0][3] + // Remove the trailing '_' + def lane_remove_trailing = lane == null ? "" : lane.replaceAll('_$', "") + def sample_id = parsedFastq[0][1] + if (sample_id in ["Undetermined"] || (state.pools && !state.pools.isEmpty() && !state.pools.contains(sample_id))) { + return null + } + return [ + "fastq": f, + "sample_id": sample_id, + "sample": parsedFastq[0][2], + "lane": lane_remove_trailing, + "read": parsedFastq[0][5], + ] + } + + println("Group paired fastq/fasta files") + def grouped = processed_fastqs + .findAll{it != null} + .groupBy({it.sample_id}, {it.lane}) + .collectMany{ sample_id, states_per_lane -> + def result = states_per_lane.collect{lane, lane_states -> + assert lane_states.size() == 2, "Expected to find two fastq files per lane! " + + "Found ${lane_states.size()}. State: ${states_per_lane}" + def r1_state = lane_states.find({it.read == "1"}) + def r2_state = lane_states.find({it.read == "2"}) + def fastq_state = [ + "r1_output": r1_state.fastq, + "r2_output": r2_state.fastq + ] + def new_state = fastq_state + + r1_state.findAll{it.key in ["sample_id", "sample", "lane"]} + + ["_meta": ["join_id": id]] + def new_id = lane?.trim() ? "${sample_id}_${lane}".toString() : sample_id + return [new_id, new_state] + } + return result + + } + return grouped + + } + + emit: out_ + +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/utils/listInputDir/nextflow.config b/target/nextflow/utils/listInputDir/nextflow.config new file mode 100644 index 00000000..d27292aa --- /dev/null +++ b/target/nextflow/utils/listInputDir/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'utils/listInputDir' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'List the contents of a directory and parse contained fastq files' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/utils/listInputDir/nextflow_labels.config b/target/nextflow/utils/listInputDir/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/utils/listInputDir/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/utils/listInputDir/nextflow_schema.json b/target/nextflow/utils/listInputDir/nextflow_schema.json new file mode 100644 index 00000000..8559d482 --- /dev/null +++ b/target/nextflow/utils/listInputDir/nextflow_schema.json @@ -0,0 +1,79 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "listInputDir", + "description": "List the contents of a directory and parse contained fastq files", + "type": "object", + "$defs": { + "arguments": { + "title": "Arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Path to the directory containing fastq files", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`, example: `\"fastq_dir\"`. " + }, + "pools": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Pool names to include", + "help_text": "Type: `string`, multiple: `True`. " + }, + "r1_output": { + "type": "string", + "format": "path", + "description": "Path to read 1 fastq/fasta file", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.r1_output\"`, direction: `output`. ", + "default": "$id.$key.r1_output" + }, + "r2_output": { + "type": "string", + "format": "path", + "description": "Path to read 2 fastq/fasta file", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.r2_output\"`, direction: `output`. ", + "default": "$id.$key.r2_output" + }, + "lane": { + "type": "string", + "description": "Lane nr", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sample": { + "type": "string", + "description": "Sample nr", + "help_text": "Type: `string`, multiple: `False`. " + }, + "sample_id": { + "type": "string", + "description": "Sample name", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/utils/save_params/.config.vsh.yaml b/target/nextflow/utils/save_params/.config.vsh.yaml new file mode 100644 index 00000000..53390bfa --- /dev/null +++ b/target/nextflow/utils/save_params/.config.vsh.yaml @@ -0,0 +1,204 @@ +name: "save_params" +namespace: "utils" +version: "updatecraftbox" +argument_groups: +- name: "Inputs" + arguments: + - type: "string" + name: "--id" + description: "The id of the job\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--params_yaml" + description: "base64 encoded yaml containing the state\n" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Outputs" + arguments: + - type: "file" + name: "--output" + description: "The output YAML file\n" + info: null + example: + - "output.yaml" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "python_script" + path: "script.py" + is_executable: true +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Save parameters to a YAML file\n" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "executable" + id: "executable" + docker_setup_strategy: "ifneedbepullelsecachedbuild" +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "docker" + id: "docker" + image: "python:3.12-slim" + target_registry: "images.viash-hub.com" + target_tag: "updatecraftbox" + namespace_separator: "/" + setup: + - type: "apt" + packages: + - "procps" + interactive: false + - type: "python" + user: false + packages: + - "pyyaml" + upgrade: true + entrypoint: [] + cmd: null +- type: "native" + id: "native" +build_info: + config: "src/utils/save_params/config.vsh.yaml" + runner: "nextflow" + engine: "docker|native" + output: "target/nextflow/utils/save_params" + executable: "target/nextflow/utils/save_params/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/utils/save_params/_viash.yaml b/target/nextflow/utils/save_params/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/utils/save_params/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/utils/save_params/main.nf b/target/nextflow/utils/save_params/main.nf new file mode 100644 index 00000000..19547838 --- /dev/null +++ b/target/nextflow/utils/save_params/main.nf @@ -0,0 +1,3833 @@ +// save_params updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "save_params", + "namespace" : "utils", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Inputs", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "The id of the job\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--params_yaml", + "description" : "base64 encoded yaml containing the state\n", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Outputs", + "arguments" : [ + { + "type" : "file", + "name" : "--output", + "description" : "The output YAML file\n", + "example" : [ + "output.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "python_script", + "path" : "script.py", + "is_executable" : true + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Save parameters to a YAML file\n", + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "executable", + "id" : "executable", + "docker_setup_strategy" : "ifneedbepullelsecachedbuild" + }, + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "docker", + "id" : "docker", + "image" : "python:3.12-slim", + "target_registry" : "images.viash-hub.com", + "target_tag" : "updatecraftbox", + "namespace_separator" : "/", + "setup" : [ + { + "type" : "apt", + "packages" : [ + "procps" + ], + "interactive" : false + }, + { + "type" : "python", + "user" : false, + "packages" : [ + "pyyaml" + ], + "upgrade" : true + } + ] + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/utils/save_params/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "docker|native", + "output" : "target/nextflow/utils/save_params", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// inner workflow hook +def innerWorkflowFactory(args) { + def rawScript = '''set -e +tempscript=".viash_script.py" +cat > "$tempscript" << VIASHMAIN +import re +import yaml +import base64 + +## VIASH START +# The following code has been auto-generated by Viash. +par = { + 'id': $( if [ ! -z ${VIASH_PAR_ID+x} ]; then echo "r'${VIASH_PAR_ID//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'params_yaml': $( if [ ! -z ${VIASH_PAR_PARAMS_YAML+x} ]; then echo "r'${VIASH_PAR_PARAMS_YAML//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ) +} +meta = { + 'name': $( if [ ! -z ${VIASH_META_NAME+x} ]; then echo "r'${VIASH_META_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ), + 'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_kib': $( if [ ! -z ${VIASH_META_MEMORY_KIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_mib': $( if [ ! -z ${VIASH_META_MEMORY_MIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_gib': $( if [ ! -z ${VIASH_META_MEMORY_GIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_tib': $( if [ ! -z ${VIASH_META_MEMORY_TIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ), + 'memory_pib': $( if [ ! -z ${VIASH_META_MEMORY_PIB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PIB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ) +} +dep = { + +} + +## VIASH END + +class Dumper(yaml.Dumper): + def increase_indent(self, flow=False, indentless=False): + return super(Dumper, self).increase_indent(flow, False) + +def decode_params_yaml(encoded_yaml): + yaml_bytes = base64.b64decode(encoded_yaml) + yaml_string = yaml_bytes.decode('utf-8') + yaml_data = yaml.safe_load(yaml_string) + + return yaml_data + +params = decode_params_yaml(par['params_yaml']) + +with open(par["output"], 'w') as f: + yaml.dump(params, f, default_flow_style=False, Dumper=Dumper) +VIASHMAIN +python -B "$tempscript" +''' + + return vdsl3WorkflowFactory(args, meta, rawScript) +} + + + +/** + * Generate a workflow for VDSL3 modules. + * + * This function is called by the workflowFactory() function. + * + * Input channel: [id, input_map] + * Output channel: [id, output_map] + * + * Internally, this workflow will convert the input channel + * to a format which the Nextflow module will be able to handle. + */ +def vdsl3WorkflowFactory(Map args, Map meta, String rawScript) { + def key = args["key"] + def processObj = null + + workflow processWf { + take: input_ + main: + + if (processObj == null) { + processObj = _vdsl3ProcessFactory(args, meta, rawScript) + } + + output_ = input_ + | map { tuple -> + def id = tuple[0] + def data_ = tuple[1] + + if (workflow.stubRun) { + // add id if missing + data_ = [id: 'stub'] + data_ + } + + // process input files separately + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { par -> + def val = data_.containsKey(par.plainName) ? data_[par.plainName] : [] + def inputFiles = [] + if (val == null) { + inputFiles = [] + } else if (val instanceof List) { + inputFiles = val + } else if (val instanceof Path) { + inputFiles = [ val ] + } else { + inputFiles = [] + } + if (!workflow.stubRun) { + // throw error when an input file doesn't exist + inputFiles.each{ file -> + assert file.exists() : + "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" + + " Required input file does not exist.\n" + + " Path: '$file'.\n" + + " Expected input file to exist" + } + } + inputFiles + } + + // remove input files + def argsExclInputFiles = meta.config.allArguments + .findAll { (it.type != "file" || it.direction != "input") && data_.containsKey(it.plainName) } + .collectEntries { par -> + def parName = par.plainName + def val = data_[parName] + if (par.multiple && val instanceof Collection) { + val = val.join(par.multiple_sep) + } + if (par.direction == "output" && par.type == "file") { + val = val + .replaceAll('\\$id', id) + .replaceAll('\\$\\{id\\}', id) + .replaceAll('\\$key', key) + .replaceAll('\\$\\{key\\}', key) + } + [parName, val] + } + + [ id ] + inputPaths + [ argsExclInputFiles, meta.resources_dir ] + } + | processObj + | map { output -> + def outputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .indexed() + .collectEntries{ index, par -> + def out = output[index + 1] + // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678) + if (!out instanceof List || out.size() <= 1) { + if (par.multiple) { + out = [] + } else { + assert !par.required : + "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" + + " Required output file is missing" + out = null + } + } else if (out.size() == 2 && !par.multiple) { + out = out[1] + } else { + out = out.drop(1) + } + [ par.plainName, out ] + } + + // drop null outputs + outputFiles.removeAll{it.value == null} + + [ output[0], outputFiles ] + } + emit: output_ + } + + return processWf +} + +// depends on: session? +def _vdsl3ProcessFactory(Map workflowArgs, Map meta, String rawScript) { + // autodetect process key + def wfKey = workflowArgs["key"] + def procKeyPrefix = "${wfKey}_process" + def scriptMeta = nextflow.script.ScriptMeta.current() + def existing = scriptMeta.getProcessNames().findAll{it.startsWith(procKeyPrefix)} + def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()} + def newNumber = (numbers + [-1]).max() + 1 + + def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber" + + if (newNumber > 0) { + log.warn "Key for module '${wfKey}' is duplicated.\n", + "If you run a component multiple times in the same workflow,\n" + + "it's recommended you set a unique key for every call,\n" + + "for example: ${wfKey}.run(key: \"foo\")." + } + + // subset directives and convert to list of tuples + def drctv = workflowArgs.directives + + // TODO: unit test the two commands below + // convert publish array into tags + def valueToStr = { val -> + // ignore closures + if (val instanceof CharSequence) { + if (!val.matches('^[{].*[}]$')) { + '"' + val + '"' + } else { + val + } + } else if (val instanceof List) { + "[" + val.collect{valueToStr(it)}.join(", ") + "]" + } else if (val instanceof Map) { + "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]" + } else { + val.inspect() + } + } + + // multiple entries allowed: label, publishdir + def drctvStrs = drctv.collect { key, value -> + if (key in ["label", "publishDir"]) { + value.collect{ val -> + if (val instanceof Map) { + "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else if (val == null) { + "" + } else { + "\n$key " + valueToStr(val) + } + }.join() + } else if (value instanceof Map) { + "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ") + } else { + "\n$key " + valueToStr(value) + } + }.join() + + def inputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + .collect { ', path(viash_par_' + it.plainName + ', stageAs: "_viash_par/' + it.plainName + '_?/*")' } + .join() + + def outputPaths = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + // insert dummy into every output (see nextflow-io/nextflow#2678) + if (!par.multiple) { + ', path{[".exitcode", args.' + par.plainName + ']}' + } else { + ', path{[".exitcode"] + args.' + par.plainName + '}' + } + } + .join() + + // TODO: move this functionality somewhere else? + if (workflowArgs.auto.transcript) { + outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}' + } else { + outputPaths = outputPaths + ', path{[".exitcode"]}' + } + + // create dirs for output files (based on BashWrapper.createParentFiles) + def createParentStr = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" && it.create_parent } + .collect { par -> + def contents = "args[\"${par.plainName}\"] instanceof List ? args[\"${par.plainName}\"].join('\" \"') : args[\"${par.plainName}\"]" + "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent '\" + escapeText(${contents}) + \"'\" : \"\" }" + } + .join("\n") + + // construct inputFileExports + def inputFileExports = meta.config.allArguments + .findAll { it.type == "file" && it.direction.toLowerCase() == "input" } + .collect { par -> + def contents = "viash_par_${par.plainName} instanceof List ? viash_par_${par.plainName}.join(\"${par.multiple_sep}\") : viash_par_${par.plainName}" + "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}='\" + escapeText(${contents}) + \"'\"}" + } + + // NOTE: if using docker, use /tmp instead of tmpDir! + def tmpDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('VIASH_TMPDIR') ?: + System.getenv('VIASH_TEMPDIR') ?: + System.getenv('VIASH_TMP') ?: + System.getenv('TEMP') ?: + System.getenv('TMPDIR') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMP') ?: + '/tmp' + ).toAbsolutePath() + + // construct stub + def stub = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "output" } + .collect { par -> + "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }" + } + .join("\n") + + // escape script + def escapedScript = rawScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"') + + // publishdir assert + def assertStr = (workflowArgs.auto.publish == true) || workflowArgs.auto.transcript ? + """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n Example: --publish_dir './output/'" """ : + "" + + // generate process string + def procStr = + """nextflow.enable.dsl=2 + | + |def escapeText = { s -> s.toString().replaceAll("'", "'\\\"'\\\"'") } + |process $procKey {$drctvStrs + |input: + | tuple val(id)$inputPaths, val(args), path(resourcesDir, stageAs: ".viash_meta_resources") + |output: + | tuple val("\$id")$outputPaths, optional: true + |stub: + |\"\"\" + |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; } + |$stub + |\"\"\" + |script:$assertStr + |def parInject = args + | .findAll{key, value -> value != null} + | .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}='\${escapeText(value)}'"} + | .join("\\n") + |\"\"\" + |# meta exports + |export VIASH_META_RESOURCES_DIR="\${resourcesDir}" + |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}" + |export VIASH_META_NAME="${meta.config.name}" + |# export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_NAME" + |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml" + |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" } + |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" } + |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then + | export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+999) / 1000 )) + | export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+999) / 1000 )) + | export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+999) / 1000 )) + | export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+999) / 1000 )) + | export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+999) / 1000 )) + | export VIASH_META_MEMORY_KIB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 )) + | export VIASH_META_MEMORY_MIB=\\\$(( (\\\$VIASH_META_MEMORY_KIB+1023) / 1024 )) + | export VIASH_META_MEMORY_GIB=\\\$(( (\\\$VIASH_META_MEMORY_MIB+1023) / 1024 )) + | export VIASH_META_MEMORY_TIB=\\\$(( (\\\$VIASH_META_MEMORY_GIB+1023) / 1024 )) + | export VIASH_META_MEMORY_PIB=\\\$(( (\\\$VIASH_META_MEMORY_TIB+1023) / 1024 )) + |fi + | + |# meta synonyms + |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR" + |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR" + | + |# create output dirs if need be + |function mkdir_parent { + | for file in "\\\$@"; do + | mkdir -p "\\\$(dirname "\\\$file")" + | done + |} + |$createParentStr + | + |# argument exports${inputFileExports.join()} + |\$parInject + | + |# process script + |${escapedScript} + |\"\"\" + |} + |""".stripMargin() + + // TODO: print on debug + // if (workflowArgs.debug == true) { + // println("######################\n$procStr\n######################") + // } + + // write process to temp file + def tempFile = java.nio.file.Files.createTempFile("viash-process-${procKey}-", ".nf") + addShutdownHook { java.nio.file.Files.deleteIfExists(tempFile) } + tempFile.text = procStr + + // create process from temp file + def binding = new nextflow.script.ScriptBinding([:]) + def session = nextflow.Nextflow.getSession() + def parser = _getScriptLoader(session) + .setModule(true) + .setBinding(binding) + def moduleScript = parser.runScript(tempFile) + .getScript() + + // register module in meta + def module = new nextflow.script.IncludeDef.Module(name: procKey) + scriptMeta.addModule(moduleScript, module.name, module.alias) + + // retrieve and return process from meta + return scriptMeta.getProcess(procKey) +} + +// use Reflection to get a ScriptParser / ScriptLoader +// <25.02.0-edge: new nextflow.script.ScriptParser(session) +// >=25.02.0-edge: nextflow.script.ScriptLoaderFactory.create(session) +def _getScriptLoader(nextflow.Session session) { + // try using the old method + try { + Class scriptParserClass = Class.forName('nextflow.script.ScriptParser') + return scriptParserClass.getDeclaredConstructor(nextflow.Session).newInstance(session) + } catch (ClassNotFoundException e) { + // else try with the new method + try { + Class scriptLoaderFactoryClass = Class.forName('nextflow.script.ScriptLoaderFactory') + def createMethod = scriptLoaderFactoryClass.getDeclaredMethod('create', nextflow.Session) + return createMethod.invoke(null, session) // null because create is static + } catch (ClassNotFoundException | NoSuchMethodException | IllegalAccessException | java.lang.reflect.InvocationTargetException e2) { + // Handle the case where neither class is found + throw new Exception("Neither nextflow.script.ScriptParser nor nextflow.script.ScriptLoaderFactory could be found. Is this a compatible Nextflow version?", e2) + } + } +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "container" : { + "registry" : "images.viash-hub.com", + "image" : "vsh/htrnaseq/utils/save_params", + "tag" : "updatecraftbox" + }, + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/utils/save_params/nextflow.config b/target/nextflow/utils/save_params/nextflow.config new file mode 100644 index 00000000..0809087a --- /dev/null +++ b/target/nextflow/utils/save_params/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'utils/save_params' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Save parameters to a YAML file\n' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/utils/save_params/nextflow_labels.config b/target/nextflow/utils/save_params/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/utils/save_params/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/utils/save_params/nextflow_schema.json b/target/nextflow/utils/save_params/nextflow_schema.json new file mode 100644 index 00000000..d6f579fc --- /dev/null +++ b/target/nextflow/utils/save_params/nextflow_schema.json @@ -0,0 +1,62 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "save_params", + "description": "Save parameters to a YAML file\n", + "type": "object", + "$defs": { + "inputs": { + "title": "Inputs", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "The id of the job\n", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "params_yaml": { + "type": "string", + "description": "base64 encoded yaml containing the state\n", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "outputs": { + "title": "Outputs", + "type": "object", + "description": "No description", + "properties": { + "output": { + "type": "string", + "format": "path", + "description": "The output YAML file\n", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output.yaml\"`, direction: `output`, example: `\"output.yaml\"`. ", + "default": "$id.$key.output.yaml" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/inputs" + }, + { + "$ref": "#/$defs/outputs" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/htrnaseq/.config.vsh.yaml b/target/nextflow/workflows/htrnaseq/.config.vsh.yaml new file mode 100644 index 00000000..3727dbdc --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/.config.vsh.yaml @@ -0,0 +1,411 @@ +name: "htrnaseq" +namespace: "workflows" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input_r1" + description: "Forward reads in FASTQ format. Multiple files corresponding to different\ + \ lanes can be provided which will\nbe demultiplexed separately before joining\ + \ the results for each individual well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Reverse reads in FASTQ format. Multiple files corresponding to different\ + \ lanes can be provided which will\nbe demultiplexed separately before joining\ + \ the results for each individual well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--umi_length" + description: "Length of the UMI sequences\n" + info: null + default: + - 10 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeDir" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--annotation" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--sample_id" + description: "Sample ID for the provided input files. If not provided, the value\ + \ of --id\nwill be used. Input files will allways be demultiplexed separately,\n\ + but the FASTQs for wells with matching sample IDs will be concatenated before\ + \ mapping.\n" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--fastq_output" + description: "Directory containing output fastq files" + info: null + default: + - "fastq/*" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_output" + description: "Output from mapping with STAR" + info: null + default: + - "star.$id/*" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom" + info: null + default: + - "nrReadsNrGenesPerChrom.$id.txt" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics" + info: null + default: + - "starLogs.$id.txt" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--eset" + info: null + default: + - "eset.$id.rds" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--f_data" + info: null + default: + - "fData.$id.tsv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--p_data" + info: null + default: + - "pData.$id.tsv" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--html_report" + info: null + default: + - "report.$id.html" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--run_params" + info: null + default: + - "params.$id.yaml" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +dependencies: +- name: "stats/combine_star_logs" + repository: + type: "local" +- name: "stats/generate_pool_statistics" + repository: + type: "local" +- name: "stats/generate_well_statistics" + repository: + type: "local" +- name: "workflows/well_demultiplex" + repository: + type: "local" +- name: "workflows/well_metadata" + repository: + type: "local" +- name: "parallel_map" + repository: + type: "local" +- name: "eset/create_eset" + repository: + type: "local" +- name: "eset/create_fdata" + repository: + type: "local" +- name: "eset/create_pdata" + repository: + type: "local" +- name: "report/create_report" + repository: + type: "local" +- name: "utils/concatRuns" + repository: + type: "local" +- name: "utils/save_params" + repository: + type: "local" +repositories: +- type: "local" + name: "local" +- type: "vsh" + name: "bb" + repo: "biobox" + tag: "v0.3.1" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/workflows/htrnaseq/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/workflows/htrnaseq" + executable: "target/nextflow/workflows/htrnaseq/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" + dependencies: + - "target/nextflow/stats/combine_star_logs" + - "target/nextflow/stats/generate_pool_statistics" + - "target/nextflow/stats/generate_well_statistics" + - "target/nextflow/workflows/well_demultiplex" + - "target/nextflow/workflows/well_metadata" + - "target/nextflow/parallel_map" + - "target/nextflow/eset/create_eset" + - "target/nextflow/eset/create_fdata" + - "target/nextflow/eset/create_pdata" + - "target/nextflow/report/create_report" + - "target/nextflow/utils/concatRuns" + - "target/nextflow/utils/save_params" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/workflows/htrnaseq/_viash.yaml b/target/nextflow/workflows/htrnaseq/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/workflows/htrnaseq/main.nf b/target/nextflow/workflows/htrnaseq/main.nf new file mode 100644 index 00000000..ac3cb7ce --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/main.nf @@ -0,0 +1,4041 @@ +// htrnaseq updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "htrnaseq", + "namespace" : "workflows", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_r1", + "description" : "Forward reads in FASTQ format. Multiple files corresponding to different lanes can be provided which will\nbe demultiplexed separately before joining the results for each individual well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "Reverse reads in FASTQ format. Multiple files corresponding to different lanes can be provided which will\nbe demultiplexed separately before joining the results for each individual well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodesFasta", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--umi_length", + "description" : "Length of the UMI sequences\n", + "default" : [ + 10 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genomeDir", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--annotation", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--sample_id", + "description" : "Sample ID for the provided input files. If not provided, the value of --id\nwill be used. Input files will allways be demultiplexed separately,\nbut the FASTQs for wells with matching sample IDs will be concatenated before mapping.\n", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--fastq_output", + "description" : "Directory containing output fastq files", + "default" : [ + "fastq/*" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_output", + "description" : "Output from mapping with STAR", + "default" : [ + "star.$id/*" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom", + "default" : [ + "nrReadsNrGenesPerChrom.$id.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_qc_metrics", + "default" : [ + "starLogs.$id.txt" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--eset", + "default" : [ + "eset.$id.rds" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--f_data", + "default" : [ + "fData.$id.tsv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--p_data", + "default" : [ + "pData.$id.tsv" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--html_report", + "default" : [ + "report.$id.html" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--run_params", + "default" : [ + "params.$id.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "dependencies" : [ + { + "name" : "stats/combine_star_logs", + "repository" : { + "type" : "local" + } + }, + { + "name" : "stats/generate_pool_statistics", + "repository" : { + "type" : "local" + } + }, + { + "name" : "stats/generate_well_statistics", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/well_demultiplex", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/well_metadata", + "repository" : { + "type" : "local" + } + }, + { + "name" : "parallel_map", + "repository" : { + "type" : "local" + } + }, + { + "name" : "eset/create_eset", + "repository" : { + "type" : "local" + } + }, + { + "name" : "eset/create_fdata", + "repository" : { + "type" : "local" + } + }, + { + "name" : "eset/create_pdata", + "repository" : { + "type" : "local" + } + }, + { + "name" : "report/create_report", + "repository" : { + "type" : "local" + } + }, + { + "name" : "utils/concatRuns", + "repository" : { + "type" : "local" + } + }, + { + "name" : "utils/save_params", + "repository" : { + "type" : "local" + } + } + ], + "repositories" : [ + { + "type" : "local", + "name" : "local" + }, + { + "type" : "vsh", + "name" : "bb", + "repo" : "biobox", + "tag" : "v0.3.1" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/htrnaseq/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/workflows/htrnaseq", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { combine_star_logs } from "${meta.resources_dir}/../../../nextflow/stats/combine_star_logs/main.nf" +include { generate_pool_statistics } from "${meta.resources_dir}/../../../nextflow/stats/generate_pool_statistics/main.nf" +include { generate_well_statistics } from "${meta.resources_dir}/../../../nextflow/stats/generate_well_statistics/main.nf" +include { well_demultiplex } from "${meta.resources_dir}/../../../nextflow/workflows/well_demultiplex/main.nf" +include { well_metadata } from "${meta.resources_dir}/../../../nextflow/workflows/well_metadata/main.nf" +include { parallel_map } from "${meta.resources_dir}/../../../nextflow/parallel_map/main.nf" +include { create_eset } from "${meta.resources_dir}/../../../nextflow/eset/create_eset/main.nf" +include { create_fdata } from "${meta.resources_dir}/../../../nextflow/eset/create_fdata/main.nf" +include { create_pdata } from "${meta.resources_dir}/../../../nextflow/eset/create_pdata/main.nf" +include { create_report } from "${meta.resources_dir}/../../../nextflow/report/create_report/main.nf" +include { concatRuns } from "${meta.resources_dir}/../../../nextflow/utils/concatRuns/main.nf" +include { save_params } from "${meta.resources_dir}/../../../nextflow/utils/save_params/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + raw_ch + + main: + input_ch = raw_ch + // Use the event ID as the default for the sample ID + | map {id, state -> + def sample_id = state.sample_id ?: id + def newState = state + ["sample_id": sample_id, "run_id": id] + return [id, newState] + } + + | save_params.run( + runIf: { id, state -> + state.run_params != null + }, + fromState: {id, state -> + // Define the function before using it + def convertPaths + convertPaths = { value -> + if (value instanceof java.nio.file.Path) + return value.toUriString() + else if (value instanceof List) + return value.collect { convertPaths(it) } + else if (value instanceof Collection) + throw new UnsupportedOperationException("Collections other than Lists are not supported") + else + return value + } + + // Apply conversion to all state values + def convertedState = state.collectEntries { k, v -> [(k): convertPaths(v)] } + + def yaml = new org.yaml.snakeyaml.Yaml() + def yamlString = yaml.dump(convertedState) + def encodedYaml = yamlString.bytes.encodeBase64().toString() + + return [ + "id": id, + "params_yaml": encodedYaml, + "output": state.run_params + ] + }, + toState: ["run_params": "output"] + ) + + // The featureData only has one requirement: the genome annotation. + // It can be generated straight away. Most of the time, there is one shared + // annotation for all of the inputs and the fData should only be calculated once. + // The state is manpulated in such a way that there is one event created per unique + // input annotation file. In turn, the featureData file can joined into the original input + // channel which allows it to be shared across events if required. + f_data_ch = input_ch + | toSortedList() + | flatMap {ids_and_states -> + def annotation_files = ids_and_states.inject([:]){ old_state, id_and_state -> + def (id, state) = id_and_state + def annotation_file = state.annotation + def new_state = old_state + [(annotation_file): (old_state.getOrDefault(annotation_file, []) + [id])] + return new_state + } + def file_names = annotation_files.keySet().collect{it.name} + assert (file_names.toSet().size() == file_names.size()), + "Please make sure that the annotation files have unique file names." + def new_states = annotation_files.collect{annotation_file, value -> + def new_state = [annotation_file.name , ["annotation": annotation_file, "event_ids": value]] + return new_state + } + return new_states + } + | create_fdata.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "gtf": "annotation", + "output": "f_data" + ], + toState: ["f_data": "output"] + ) + | flatMap {_, state -> + def new_states = state.event_ids.collect{event_id -> + [event_id, ["f_data": state.f_data]] + } + return new_states + } + + // Perform mapping of each well. + demultiplex_ch = input_ch + | well_demultiplex.run( + fromState: [ + "input_r1": "input_r1", + "input_r2": "input_r2", + "barcodesFasta": "barcodesFasta", + ], + toState: {id, result, state -> + def all_fastq = result.output_r1 + result.output_r2 + def output_dir = all_fastq.collect{it.parent}.unique() + assert output_dir.size() == 1, "Expected output from well demultiplexing (id $id) to reside into one directory. Found: $output_dir" + def new_state = state + [ + "input_r1": result.output_r1, + "input_r2": result.output_r2, + "fastq_output_directory": output_dir[0], + ] + return new_state + } + ) + + fastq_output_directory_ch = demultiplex_ch + | map {id, state -> + def new_event = [state.sample_id, state] + return new_event + } + | groupTuple(by: 0, sort: "hash") + | map {id, states -> + def fastq_output_dirs = states.collect{it.fastq_output_directory} + def new_state = ["fastq_output_directory": fastq_output_dirs] + def new_event = [id, new_state] + return [id, new_state] + } + + + concat_samples_ch = demultiplex_ch.join(f_data_ch) + | map {id, demultiplex_state, f_data_state -> + def newState = demultiplex_state + ["f_data": f_data_state["f_data"]] + [id, newState] + } + | concatRuns.run( + fromState: [ + "input_r1": "input_r1", + "input_r2": "input_r2", + "sample_id": "sample_id", + ], + toState: {id, result, state -> + def state_overwite = [ + "input_r1": result.output_r1, + "input_r2": result.output_r2, + "_meta": ["join_id": state.run_id] + ] + return state + state_overwite + } + ) + + pool_ch = concat_samples_ch.join(fastq_output_directory_ch) + | map {id, concat_state, fastq_output_directory_state -> + def new_state = concat_state + fastq_output_directory_state + return [id, new_state] + } + | parallel_map.run( + directives: ["label": ["highmem", "lowcpu"]], + fromState: {id, state -> + [ + "input_r1": state.input_r1, + "input_r2": state.input_r2, + "barcodesFasta": state.barcodesFasta, + "umiLength": state.umi_length, + "output": state.star_output[0], + "genomeDir": state.genomeDir, + ] + }, + toState: [ + "star_output": "output", + ] + ) + // Split the events from 1 event per pool into events per well + // and add extra metadata about the wells to the state. + | well_metadata.run( + fromState: [ + "barcodesFasta": "barcodesFasta", + "input_r1": "input_r1", + "input_r2": "input_r2", + "star_mapping": "star_output" + ], + toState: [ + "input_r1": "output_r1", + "input_r2": "output_r2", + "pool": "pool", + "well_id": "well_id", + "barcode": "barcode", + "lane": "lane", + "n_wells": "n_wells", + "star_mapping": "well_star_mapping", + ] + ) + // Use the bam file to generate statistics + | generate_well_statistics.run( + directives: [label: ["verylowmem", "verylowcpu"]], + fromState: { id, state -> + [ + "input": state.star_mapping.resolve('Aligned.sortedByCoord.out.bam'), + "barcode": state.barcode, + "well_id": state.well_id, + ] + }, + toState: [ + "nrReadsNrGenesPerChromWell": "nrReadsNrGenesPerChrom", + ] + ) + // Join the events back to pool-level + | map {id, state -> + // Create a special groupKey, such that groupTuple + // knows when all the barcodes have been grouped into 1 event. + // This way the processing is as distributed as possible. + def key = groupKey(state.pool, state.n_wells) + def newEvent = [key, state] + return newEvent + } + // Use a custom sorting function because sort: 'hash' + // requires a hash to be calculated on every entry of the state + // This is inefficient when the number of events is large + // (i.e large number or barcodes). + // Sorting on lexographical order of the barcode is sufficient here. + | groupTuple(sort: {a, b -> a.barcode <=> b.barcode}) + | map {id, states -> + // Gather the keys from all states. for some state items, + // we need gather all the different items from across the states + def barcodes = states.collect{it.barcode} + assert barcodes.clone().unique().size() == barcodes.size(), \ + "Error when gathering information for pool ${id}, barcodes are not unique!" + def well_ids = states.collect{it.well_id} + assert well_ids.clone().unique().size() == well_ids.size(), \ + "Error when gathering information for pool ${id}, well IDs are not unique!" + def custom_state = [ + "input_r1": states.collect{it.input_r1}, + "input_r2": states.collect{it.input_r2}, + "barcode": barcodes, + "well_id": well_ids, + "star_mapping": states.collect{it.star_mapping}, + // Well and pool stats should be carefully kept separate. + // The workflow argument points to the name for the pool statistics: + "nrReadsNrGenesPerChromWell": states.collect{it.nrReadsNrGenesPerChromWell}, + "nrReadsNrGenesPerChromPool": states[0].nrReadsNrGenesPerChrom + ] + //For many state items, the value is the same across states. + def other_state_keys = states.inject([].toSet()){ current_keys, state -> + def new_keys = current_keys + state.keySet() + return new_keys + }.minus(custom_state.keySet()) + // All other state should have a unique value + def old_state_items = other_state_keys.inject([:]){ old_state, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "Arguments should be the same across modalities. Please report this \ + as a bug. Argument name: $argument_name, \ + argument value: $argument_values" + def argument_value + argument_values.each { argument_value = it } + def current_state = old_state + [(argument_name): argument_value] + return current_state + } + + def new_state = custom_state + old_state_items + [id.getGroupTarget(), new_state] + } + + pool_statistics_ch = pool_ch + | generate_pool_statistics.run( + directives: ["label": ["lowmem", "verylowcpu"]], + fromState: [ + "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChromWell", + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool" + ], + toState: [ + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool" + ] + ) + + // The statistics from the STAR logs of different wells are joined + // on pool level + star_logs_ch = pool_ch + | combine_star_logs.run( + directives: ["label": ["lowmem", "verylowcpu"]], + fromState: {id, state -> [ + "star_logs": state.star_output.collect{it.resolve("Log.final.out")}, + "gene_summary_logs": state.star_output.collect{it.resolve("Solo.out/Gene/Summary.csv")}, + "reads_per_gene_logs": state.star_output.collect{it.resolve("ReadsPerGene.out.tab")}, + "barcodes": state.barcode, + "output": state.star_qc_metrics + ] + }, + toState: [ + "star_qc_metrics": "output", + ] + ) + + eset_ch = star_logs_ch.join(pool_statistics_ch, remainder: true) + | map {id, star_logs_state, pool_statistics_state -> + def newState = star_logs_state + ["nrReadsNrGenesPerChromPool": pool_statistics_state.nrReadsNrGenesPerChromPool] + return [id, newState] + } + | create_pdata.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "star_stats_file": "star_qc_metrics", + "nrReadsNrGenesPerChromPool": "nrReadsNrGenesPerChromPool", + "output": "p_data" + ], + toState: ["p_data": "output"], + ) + | create_eset.run( + directives: [label: ["lowmem", "lowcpu"]], + fromState: [ + "pDataFile": "p_data", + "fDataFile": "f_data", + "mappingDir": "star_output", + "output": "eset", + "barcodes": "barcode", + "poolName": "pool", + ], + toState: [ + "eset": "output", + ] + ) + + report_channel = eset_ch + | toSortedList() + | map {ids_and_states -> + def states = ids_and_states.collect{it[1]} + def html_report = states[0].html_report + def ids = ids_and_states.collect{it[0]} + def esets = states.collect{it.eset} + ["report", ["esets": esets, "html_report": html_report, "original_ids": ids]] + } + | create_report.run( + fromState: [ + "eset": "esets", + "output_report": "html_report", + ], + toState: [ + "html_report": "output_report" + ] + ) + | flatMap {id, state -> + state.original_ids.collect{original_id -> + [original_id, ["html_report": state.html_report]] + } + } + + output_ch = eset_ch.join(report_channel) + | map {id, state_eset, state_report -> + def new_state = state_eset + [ + "html_report": state_report.html_report, + ] + [id, new_state] + } + | setState([ + "star_output": "star_output", + "fastq_output": "fastq_output_directory", + "nrReadsNrGenesPerChrom": "nrReadsNrGenesPerChromPool", + "star_qc_metrics": "star_qc_metrics", + "eset": "eset", + "f_data": "f_data", + "p_data": "p_data", + "html_report": "html_report", + "run_params": "run_params", + "_meta": "_meta", + ]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/htrnaseq/nextflow.config b/target/nextflow/workflows/htrnaseq/nextflow.config new file mode 100644 index 00000000..99641c32 --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'workflows/htrnaseq' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/htrnaseq/nextflow_labels.config b/target/nextflow/workflows/htrnaseq/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/workflows/htrnaseq/nextflow_schema.json b/target/nextflow/workflows/htrnaseq/nextflow_schema.json new file mode 100644 index 00000000..9fabc233 --- /dev/null +++ b/target/nextflow/workflows/htrnaseq/nextflow_schema.json @@ -0,0 +1,170 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "htrnaseq", + "description": "No description", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Forward reads in FASTQ format", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Reverse reads in FASTQ format", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "barcodesFasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "umi_length": { + "type": "integer", + "description": "Length of the UMI sequences\n", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "genomeDir": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "annotation": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "sample_id": { + "type": "string", + "description": "Sample ID for the provided input files", + "help_text": "Type: `string`, multiple: `False`. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "fastq_output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Directory containing output fastq files", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"fastq/*\"]`, direction: `output`. ", + "default": [ + "fastq/*" + ] + }, + "star_output": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "Output from mapping with STAR", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"star.$id/*\"]`, direction: `output`. ", + "default": [ + "star.$id/*" + ] + }, + "nrReadsNrGenesPerChrom": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"nrReadsNrGenesPerChrom.$id.txt\"`, direction: `output`. ", + "default": "nrReadsNrGenesPerChrom.$id.txt" + }, + "star_qc_metrics": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"starLogs.$id.txt\"`, direction: `output`. ", + "default": "starLogs.$id.txt" + }, + "eset": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"eset.$id.rds\"`, direction: `output`. ", + "default": "eset.$id.rds" + }, + "f_data": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"fData.$id.tsv\"`, direction: `output`. ", + "default": "fData.$id.tsv" + }, + "p_data": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"pData.$id.tsv\"`, direction: `output`. ", + "default": "pData.$id.tsv" + }, + "html_report": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"report.$id.html\"`, direction: `output`. ", + "default": "report.$id.html" + }, + "run_params": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"params.$id.yaml\"`, direction: `output`. ", + "default": "params.$id.yaml" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/runner/.config.vsh.yaml b/target/nextflow/workflows/runner/.config.vsh.yaml new file mode 100644 index 00000000..9c39bd18 --- /dev/null +++ b/target/nextflow/workflows/runner/.config.vsh.yaml @@ -0,0 +1,371 @@ +name: "runner" +namespace: "workflows" +version: "updatecraftbox" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input" + description: "Base directory of the form `s3://Sequencing///`.\n\ + Must contains FASTQ files in the format `PoolName_S*_L*_R1_001.fastq.gz` where\n\ + \ * PoolName is a unique ID for the microwell plates or combination thereof.\n\ + \ * S followed by a running number: the sample number based on the order\n\ + \ that samples are listed in the sample sheet (that was used to demultiplex\ + \ the pools)\n starting with 1 (e.g. S1)\n * (Optional) the lane number\ + \ (e.g. L001)\n * _001 fixed suffix.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--genomeDir" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--annotation" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pools" + description: "Filter the FASTQ files in the input directory to only include pools\ + \ from the provided list.\nPool names are inferred from the FASTQ file names\ + \ (see input argument for more information).\nBy default all pools are included.\n" + info: null + required: false + direction: "input" + multiple: true + multiple_sep: ";" + - type: "integer" + name: "--umi_length" + description: "Length of the UMI sequences\n" + info: null + default: + - 10 + required: false + min: 1 + direction: "input" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--run_params" + info: null + default: + - "params.yaml" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +- name: "Metadata arguments" + arguments: + - type: "string" + name: "--id" + description: "Unique identifier for the run" + info: null + required: false + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--project_id" + description: "Project ID" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--experiment_id" + description: "Experiment ID" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Publish arguments" + arguments: + - type: "string" + name: "--fastq_publish_dir" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--results_publish_dir" + info: null + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--star_output_dir" + info: null + default: + - "star_output" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--nrReadsNrGenesPerChrom_dir" + info: null + default: + - "nrReadsNrGenesPerChrom" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--star_qc_metrics_dir" + info: null + default: + - "starLogs" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--eset_dir" + info: null + default: + - "esets" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--f_data_dir" + info: null + default: + - "fData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--p_data_dir" + info: null + default: + - "pData" + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "disable_publishfiles_process.config" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Runner for HT RNA-seq pipeline" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +dependencies: +- name: "utils/listInputDir" + repository: + type: "local" +- name: "workflows/htrnaseq" + repository: + type: "local" +- name: "io/publish_fastqs" + repository: + type: "local" +- name: "io/publish_results" + repository: + type: "local" +- name: "utils/save_params" + repository: + type: "local" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"disable_publishfiles_process.config\")" + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/workflows/runner/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/workflows/runner" + executable: "target/nextflow/workflows/runner/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" + dependencies: + - "target/nextflow/utils/listInputDir" + - "target/nextflow/workflows/htrnaseq" + - "target/nextflow/io/publish_fastqs" + - "target/nextflow/io/publish_results" + - "target/nextflow/utils/save_params" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/workflows/runner/_viash.yaml b/target/nextflow/workflows/runner/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/workflows/runner/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/workflows/runner/disable_publishfiles_process.config b/target/nextflow/workflows/runner/disable_publishfiles_process.config new file mode 100644 index 00000000..6cff5e26 --- /dev/null +++ b/target/nextflow/workflows/runner/disable_publishfiles_process.config @@ -0,0 +1,5 @@ +process { + withName: publishFilesProc { + publishDir = [ enabled: false ] + } +} \ No newline at end of file diff --git a/target/nextflow/workflows/runner/main.nf b/target/nextflow/workflows/runner/main.nf new file mode 100644 index 00000000..7055763d --- /dev/null +++ b/target/nextflow/workflows/runner/main.nf @@ -0,0 +1,3904 @@ +// runner updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "runner", + "namespace" : "workflows", + "version" : "updatecraftbox", + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input", + "description" : "Base directory of the form `s3://Sequencing///`.\nMust contains FASTQ files in the format `PoolName_S*_L*_R1_001.fastq.gz` where\n * PoolName is a unique ID for the microwell plates or combination thereof.\n * S followed by a running number: the sample number based on the order\n that samples are listed in the sample sheet (that was used to demultiplex the pools)\n starting with 1 (e.g. S1)\n * (Optional) the lane number (e.g. L001)\n * _001 fixed suffix.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodesFasta", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--genomeDir", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--annotation", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pools", + "description" : "Filter the FASTQ files in the input directory to only include pools from the provided list.\nPool names are inferred from the FASTQ file names (see input argument for more information).\nBy default all pools are included.\n", + "required" : false, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--umi_length", + "description" : "Length of the UMI sequences\n", + "default" : [ + 10 + ], + "required" : false, + "min" : 1, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--run_params", + "default" : [ + "params.yaml" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Metadata arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--id", + "description" : "Unique identifier for the run", + "required" : false, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--project_id", + "description" : "Project ID", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--experiment_id", + "description" : "Experiment ID", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Publish arguments", + "arguments" : [ + { + "type" : "string", + "name" : "--fastq_publish_dir", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--results_publish_dir", + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--star_output_dir", + "default" : [ + "star_output" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--nrReadsNrGenesPerChrom_dir", + "default" : [ + "nrReadsNrGenesPerChrom" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_qc_metrics_dir", + "default" : [ + "starLogs" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--eset_dir", + "default" : [ + "esets" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--f_data_dir", + "default" : [ + "fData" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--p_data_dir", + "default" : [ + "pData" + ], + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "disable_publishfiles_process.config" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Runner for HT RNA-seq pipeline", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "dependencies" : [ + { + "name" : "utils/listInputDir", + "repository" : { + "type" : "local" + } + }, + { + "name" : "workflows/htrnaseq", + "repository" : { + "type" : "local" + } + }, + { + "name" : "io/publish_fastqs", + "repository" : { + "type" : "local" + } + }, + { + "name" : "io/publish_results", + "repository" : { + "type" : "local" + } + }, + { + "name" : "utils/save_params", + "repository" : { + "type" : "local" + } + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"disable_publishfiles_process.config\\")", + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/runner/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/workflows/runner", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { listInputDir } from "${meta.resources_dir}/../../../nextflow/utils/listInputDir/main.nf" +include { htrnaseq } from "${meta.resources_dir}/../../../nextflow/workflows/htrnaseq/main.nf" +include { publish_fastqs } from "${meta.resources_dir}/../../../nextflow/io/publish_fastqs/main.nf" +include { publish_results } from "${meta.resources_dir}/../../../nextflow/io/publish_results/main.nf" +include { save_params } from "${meta.resources_dir}/../../../nextflow/utils/save_params/main.nf" + +// inner workflow +// user-provided Nextflow code +def date = new Date().format('yyyyMMdd_hhmmss') + +def viash_config = java.nio.file.Paths.get("${moduleDir}/_viash.yaml") +def version = get_version(viash_config) + +workflow run_wf { + take: + raw_ch + + main: + input_ch = raw_ch + // List the FASTQ files per input directory + // Be careful: an event per lane is created! + | map {id, state -> + def new_state = state + ["run_id": id] + return [id, new_state] + } + + save_params_ch = input_ch + | toSortedList() + | map { states -> + def new_id = "save_params" + def all_states = states.collect{it[1]} + def run_params_output_templates = all_states.collect{it.run_params} + assert run_params_output_templates.unique().size() == 1: "The value for the 'run_params' parameter is not the same across runs." + def new_state = ["run_params": run_params_output_templates[0], "all_states": all_states] + return [new_id, new_state] + } + | save_params.run( + key: "save_params_runner", + fromState: {id, state -> + + def convertPaths + convertPaths = { value -> + if (value instanceof java.nio.file.Path) + return value.toUriString() + else if (value instanceof List) + return value.collect { convertPaths(it) } + else if (value instanceof Collection) + throw new UnsupportedOperationException("Collections other than Lists are not supported") + else + return value + } + + // Apply conversion to all state values + def convertedState = state.all_states.collect{it.collectEntries { k, v -> [(k): convertPaths(v)] }} + + def yaml = new org.yaml.snakeyaml.Yaml() + def yamlString = yaml.dump(convertedState) + def encodedYaml = yamlString.bytes.encodeBase64().toString() + + return [ + "id": id, + "params_yaml": encodedYaml, + "output": state.run_params + ] + }, + toState: ["run_params": "output"] + ) + + htrnaseq_ch = input_ch + | map { id, state -> + // The argument names for this workflow and the htrnaseq workflow may overlap + // here, we store a copy in order to make sure to not accidentally overwrite the state. + def new_state = state + [ + "star_output_dir_workflow": state.star_output_dir, + "nrReadsNrGenesPerChrom_dir_workflow": state.nrReadsNrGenesPerChrom_dir, + "star_qc_metrics_dir_workflow": state.star_qc_metrics_dir, + "eset_dir_workflow": state.eset_dir, + "f_data_dir_workflow": state.f_data_dir, + "p_data_dir_workflow": state.p_data_dir + ] + return [id, new_state] + } + | listInputDir.run( + fromState: [ + "input": "input", + "pools": "pools", + ], + toState: { id, state, result -> + def clean_state = state.findAll{ it.key != "input" } + clean_state + result + } + ) + // ListInputDir puts the sample_id as the event ID (slot 0 from the tuple). + // The sample_id was inferred from the start of the file name, + // and it can be used to group the FASTQ files, because an input folder + // can contain input files from multiple samples (pools). Additionally, + // there might be multiple FASTQs for a single sample that correspond to the + // lanes. So the fastq files must be gathered across lanes and input folders + // in order to create an input lists for R1 and R2. + // The ID of the event here is important! It determines the name of the output + // folders for the FASTQ files and these folders are published as-is later. + // The folder where the FASTQ files are stored in should be named after the run ID. + | map {id, state -> ["${state.sample_id}/${state.run_id}".toString(), state]} + | groupTuple(by: 0, sort: "hash") + | map {id, states -> + def new_r1 = states.collect{it.r1_output} + def new_r2 = states.collect{it.r2_output} + // This assumes that, except for r1 and r2, + // the keys across the grouped states are the same. + // TODO: this can be asserted. + def new_state = states[0] + [ + "r1": new_r1, + "r2": new_r2, + ] + return [id, new_state] + } + | view {"Pool inputs after listing directory: $it"} + | htrnaseq.run( + args: [ + f_data: 'fData/$id.txt', + p_data: 'pData/$id.txt', + star_output: 'star_output/$id/*', + fastq_output: 'fastq/*', + eset: 'esets/$id.rds', + nrReadsNrGenesPerChrom: 'nrReadsNrGenesPerChrom/$id.txt', + star_qc_metrics: 'starLogs/$id.txt', + html_report: "report.html", + run_params: null + ], + fromState: [ + input_r1: "r1", + input_r2: "r2", + barcodesFasta: "barcodesFasta", + genomeDir: "genomeDir", + annotation: "annotation", + umi_length: "umi_length", + sample_id: "sample_id", + ], + toState: { id, result, state -> state + result } + ) + + // The HT-RNAseq workflow outputs multiple events, one per 'pool' (usually a plate) + // but for publishing the results, this is not handy because we want to use the $id + // variable as a pointer to the target data. + // So, we should combine everything together + results_publish_ch = htrnaseq_ch + | combine(save_params_ch) + | map {new_id, grouped_ch_state, save_params_id, save_params_state -> + def new_state = grouped_ch_state + ["run_params": save_params_state.run_params] + return [new_id, new_state] + } + | toSortedList + | map{ vs -> + def states = vs.collect{it[1]} + + // The STAR output is a directory for each well in a plate (or pool of plates). + // The wells are grouped into a directory per sample. The name of this directory should + // match the sample_id. + def star_output_samples = states.collectMany{state -> + state.star_output.collect{ + def star_sample_dir = it.parent + assert star_sample_dir.name == state.sample_id: "Unexpected state: the parent directory of STAR output \ + path '${it}' should match with the sample ID ${sample_id}" + star_sample_dir + } + } + def new_state = [ + "star_output": star_output_samples, + ] + + // Keys for which the values should be the same across samples + def state_keys_unique = [ + "html_report", + "project_id", + "experiment_id", + "star_output_dir_workflow", + "nrReadsNrGenesPerChrom_dir_workflow", + "star_qc_metrics_dir_workflow", + "eset_dir_workflow", + "f_data_dir_workflow", + "p_data_dir_workflow", + "f_data", + "run_params" + ] + def state_unique_keys = state_keys_unique.inject([:]) { state_to_update, argument_name -> + argument_values = states.collect{it.get(argument_name)}.unique() + assert argument_values.size() == 1, "State error: values for argument $argument_name should be the same across states. \ + Argument values: $argument_values" + // take the unique value from the set (there is only one) + def argument_value + argument_values.each { argument_value = it } + state_to_update + [(argument_name): argument_value] + } + + // Keys that just require gathering of values across samples + def state_keys_collect = [ + "nrReadsNrGenesPerChrom", + "star_qc_metrics", + "eset", + "p_data", + ] + def state_collect = state_keys_collect.collectEntries{ key_ -> + [key_, states.collect{it.get(key_)}] + } + + new_state = new_state + state_unique_keys + state_collect + [states[0].run_id, new_state] + } + | publish_results.run( + fromState: { id, state -> + def prefix = "${state.project_id}/${state.experiment_id}/data_processed/${date}_htrnaseq_${version}" + + println("Publising results to ${params.results_publish_dir}/${prefix}") + + [ + // Inputs + star_output: state.star_output, + nrReadsNrGenesPerChrom: state.nrReadsNrGenesPerChrom, + star_qc_metrics: state.star_qc_metrics, + eset: state.eset, + f_data: state.f_data, + p_data: state.p_data, + html_report: state.html_report, + run_params: state.run_params, + // Output locations + run_params_output: "${prefix}/${state.run_params.name}", + html_report_output: "${prefix}/${state.html_report.name}", + star_output_dir: "${prefix}/${state.star_output_dir_workflow}", + nrReadsNrGenesPerChrom_dir: "${prefix}/${state.nrReadsNrGenesPerChrom_dir_workflow}", + star_qc_metrics_dir: "${prefix}/${state.star_qc_metrics_dir_workflow}", + eset_dir: "${prefix}/${state.eset_dir_workflow}", + f_data_dir: "${prefix}/${state.f_data_dir_workflow}", + p_data_dir: "${prefix}/${state.p_data_dir_workflow}" + ] + }, + toState: { id, result, state -> result }, + directives: [ + publishDir: [ + path: "${params.results_publish_dir}", + overwrite: false, + mode: "copy" + ] + ] + ) + | setState([ + "star_output_dir", + "nrReadsNrGenesPerChrom_dir", + "star_qc_metrics_dir", + "eset_dir", + "f_data_dir", + "p_data_dir", + ] + ) + + fastq_publish_ch = htrnaseq_ch + // The output from the htrnaseq workflow is on sample (i.e. pool) level + // Multiple sequencing runs may have contributes to the FASTQ files from this pool. + // So the fastq_output is a list of directories, one for each run. + // We assume that the names of the folders containing the FASTQ files are equal to the pool names. + | flatMap {id, state -> + state.fastq_output.collect{fastq_dir -> + def run_id = fastq_dir.name + def new_id = "${run_id}/${state.sample_id}" + def new_state = [ + "fastq_output": fastq_dir.listFiles(), + "sample_id": state.sample_id, + "run_id": run_id, + "output": "${run_id}/${date}_htrnaseq_${version}/${state.sample_id}".toString() + ] + [new_id, new_state] + } + } + // A folder containing the FASTQ files from a certain pool may be present in the state from + // multiple samples; if that pool contributed to the data from those samples. + // Those FASTQ files will only be published once by filtering out the duplicate events here. + | unique{it[0]} + | publish_fastqs.run( + fromState: [ + "input": "fastq_output", + "output": "output", + ], + toState: { id, result, state -> state }, + directives: [ + publishDir: [ + path: "${params.fastq_publish_dir}", + overwrite: false, + mode: "copy" + ] + ] + ) + + emit: + results_publish_ch + +} + +def get_version(inputFile) { + def yamlSlurper = new groovy.yaml.YamlSlurper() + def loaded_viash_config = yamlSlurper.parse(file(inputFile)) + def version = (loaded_viash_config.version) ? loaded_viash_config.version : "unknown_version" + println("HT-RNAseq version to be used: ${version}") + return version +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/runner/nextflow.config b/target/nextflow/workflows/runner/nextflow.config new file mode 100644 index 00000000..9162930c --- /dev/null +++ b/target/nextflow/workflows/runner/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/runner' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Runner for HT RNA-seq pipeline' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("disable_publishfiles_process.config") +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/runner/nextflow_labels.config b/target/nextflow/workflows/runner/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/workflows/runner/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/workflows/runner/nextflow_schema.json b/target/nextflow/workflows/runner/nextflow_schema.json new file mode 100644 index 00000000..bbd4c521 --- /dev/null +++ b/target/nextflow/workflows/runner/nextflow_schema.json @@ -0,0 +1,181 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "runner", + "description": "Runner for HT RNA-seq pipeline", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input": { + "type": "string", + "format": "path", + "exists": true, + "description": "Base directory of the form `s3://Sequencing///`.\nMust contains FASTQ files in the format `PoolName_S*_L*_R1_001.fastq.gz` where\n * PoolName is a unique ID for the microwell plates or combination thereof.\n * S followed by a running number: the sample number based on the order\n that samples are listed in the sample sheet (that was used to demultiplex the pools)\n starting with 1 (e.g", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "barcodesFasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "genomeDir": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "annotation": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + }, + "pools": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Filter the FASTQ files in the input directory to only include pools from the provided list.\nPool names are inferred from the FASTQ file names (see input argument for more information).\nBy default all pools are included.\n", + "help_text": "Type: `string`, multiple: `True`. " + }, + "umi_length": { + "type": "integer", + "description": "Length of the UMI sequences\n", + "help_text": "Type: `integer`, multiple: `False`, default: `10`. ", + "default": 10 + }, + "run_params": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"params.yaml\"`, direction: `output`. ", + "default": "params.yaml" + } + } + }, + "metadata arguments": { + "title": "Metadata arguments", + "type": "object", + "description": "No description", + "properties": { + "id": { + "type": "string", + "description": "Unique identifier for the run", + "help_text": "Type: `string`, multiple: `False`. " + }, + "project_id": { + "type": "string", + "description": "Project ID", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "experiment_id": { + "type": "string", + "description": "Experiment ID", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "publish arguments": { + "title": "Publish arguments", + "type": "object", + "description": "No description", + "properties": { + "fastq_publish_dir": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, required. " + }, + "results_publish_dir": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`, required. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "star_output_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"star_output\"`, direction: `output`. ", + "default": "star_output" + }, + "nrReadsNrGenesPerChrom_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"nrReadsNrGenesPerChrom\"`, direction: `output`. ", + "default": "nrReadsNrGenesPerChrom" + }, + "star_qc_metrics_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"starLogs\"`, direction: `output`. ", + "default": "starLogs" + }, + "eset_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"esets\"`, direction: `output`. ", + "default": "esets" + }, + "f_data_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"fData\"`, direction: `output`. ", + "default": "fData" + }, + "p_data_dir": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"pData\"`, direction: `output`. ", + "default": "pData" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/metadata arguments" + }, + { + "$ref": "#/$defs/publish arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/well_demultiplex/.config.vsh.yaml b/target/nextflow/workflows/well_demultiplex/.config.vsh.yaml new file mode 100644 index 00000000..2be14a2e --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/.config.vsh.yaml @@ -0,0 +1,279 @@ +name: "well_demultiplex" +namespace: "workflows" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +- name: "Marijke Van Moerbeke" + roles: + - "contributor" + info: + links: + github: "mvanmoerbeke" + orcid: "0000-0002-3097-5621" + linkedin: "marijke-van-moerbeke-84303a34" + organizations: + - name: "OpenAnalytics" + href: "https://www.openanalytics.eu" + role: "Statistical Consultant" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input_r1" + description: "Forward reads in FASTQ format. Multiple files can be provided which\ + \ will\nbe demultiplexed separately before joining the results for each individual\ + \ well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "Reverse reads in FASTQ format. Multiple files can be provided which\ + \ will\nbe demultiplexed separately before joining the results for each individual\ + \ well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output_r1" + description: "List of demultiplexed fastq files" + info: null + default: + - "fastq/*_R1_001.fastq" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--output_r2" + description: "List of demultiplexed fastq files" + info: null + default: + - "fastq/*_R2_001.fastq" + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: true + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +description: "Demultiplexing on well level" +test_resources: +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf" +- type: "nextflow_script" + path: "test.nf" + is_executable: true + entrypoint: "test_wf2" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +dependencies: +- name: "cutadapt" + repository: + type: "vsh" + repo: "biobox" + tag: "v0.3.1" +- name: "concat_text" + repository: + type: "vsh" + repo: "craftbox" + tag: "v0.3.0" +- name: "move_files_to_directory" + repository: + type: "vsh" + repo: "craftbox" + tag: "v0.3.0" +repositories: +- type: "vsh" + name: "bb" + repo: "biobox" + tag: "v0.3.1" +- type: "vsh" + name: "cb" + repo: "craftbox" + tag: "v0.3.0" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/workflows/well_demultiplex/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/workflows/well_demultiplex" + executable: "target/nextflow/workflows/well_demultiplex/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" + dependencies: + - "target/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt" + - "target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text" + - "target/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/workflows/well_demultiplex/_viash.yaml b/target/nextflow/workflows/well_demultiplex/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/workflows/well_demultiplex/main.nf b/target/nextflow/workflows/well_demultiplex/main.nf new file mode 100644 index 00000000..459d71e3 --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/main.nf @@ -0,0 +1,3791 @@ +// well_demultiplex updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) +// * Marijke Van Moerbeke (contributor) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "well_demultiplex", + "namespace" : "workflows", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + }, + { + "name" : "Marijke Van Moerbeke", + "roles" : [ + "contributor" + ], + "info" : { + "links" : { + "github" : "mvanmoerbeke", + "orcid" : "0000-0002-3097-5621", + "linkedin" : "marijke-van-moerbeke-84303a34" + }, + "organizations" : [ + { + "name" : "OpenAnalytics", + "href" : "https://www.openanalytics.eu", + "role" : "Statistical Consultant" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_r1", + "description" : "Forward reads in FASTQ format. Multiple files can be provided which will\nbe demultiplexed separately before joining the results for each individual well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "Reverse reads in FASTQ format. Multiple files can be provided which will\nbe demultiplexed separately before joining the results for each individual well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodesFasta", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--output_r1", + "description" : "List of demultiplexed fastq files", + "default" : [ + "fastq/*_R1_001.fastq" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_r2", + "description" : "List of demultiplexed fastq files", + "default" : [ + "fastq/*_R2_001.fastq" + ], + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : true, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "description" : "Demultiplexing on well level", + "test_resources" : [ + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf" + }, + { + "type" : "nextflow_script", + "path" : "test.nf", + "is_executable" : true, + "entrypoint" : "test_wf2" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "dependencies" : [ + { + "name" : "cutadapt", + "repository" : { + "type" : "vsh", + "repo" : "biobox", + "tag" : "v0.3.1" + } + }, + { + "name" : "concat_text", + "repository" : { + "type" : "vsh", + "repo" : "craftbox", + "tag" : "v0.3.0" + } + }, + { + "name" : "move_files_to_directory", + "repository" : { + "type" : "vsh", + "repo" : "craftbox", + "tag" : "v0.3.0" + } + } + ], + "repositories" : [ + { + "type" : "vsh", + "name" : "bb", + "repo" : "biobox", + "tag" : "v0.3.1" + }, + { + "type" : "vsh", + "name" : "cb", + "repo" : "craftbox", + "tag" : "v0.3.0" + } + ], + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/well_demultiplex/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/workflows/well_demultiplex", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) +meta["root_dir"] = getRootDir() +include { cutadapt } from "${meta.root_dir}/dependencies/vsh/vsh/biobox/v0.3.1/nextflow/cutadapt/main.nf" +include { concat_text } from "${meta.root_dir}/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/concat_text/main.nf" +include { move_files_to_directory } from "${meta.root_dir}/dependencies/vsh/vsh/craftbox/v0.3.0/nextflow/move_files_to_directory/main.nf" + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + /* + For each pool (i.e. event) in the channel, a list of R1 and R2 input + reads is provided which correspond to the lanes. If there are multiple lanes, + we can demultiplex into the wells for each lane in parallel. Therefore, cutadapt + must be started multiple times and we need an event per lane. The events are + created by taking the R1 and R2 pairs from the input lists. The index of the elements + in these lists are added to the ID in order to make them unique. + + The same pools may be present in multiple sequencing runs. Here, the events must be unique + across boths runs and samples. When called from the htrnaseq workflow; the events have the + format '{pool_id}/{run_id}' + */ + | flatMap {id, state -> + assert state.input_r1.size() == state.input_r2.size(), \ + "Expected equal number of inputs for R1 and R2" + if (state.input_r1.size() == 1) { + // special case where we do not want to adjust the ID to add an index. + // If we do add an index, the file paths will contain "_0", which + // will not be removed. For the scenarios where we do have multiple lanes, + // the files will be concatenated later and a new file path without the index + // is created at that point. + def newState = state + [ + "input_r1": state.input_r1[0], + "input_r2": state.input_r2[0], + "pool_and_run_id": id, + "n_lanes": 1, + "lane_sorting": 1, + ] + return [[id, newState]] + } + // Store the number of lanes that were encountered here in order to + // group them together in an asynchronous manner later by providing + // the expected number of events to be grouped to groupTuple. + // see https://www.nextflow.io/docs/latest/reference/operator.html#grouptuple + [state.input_r1, state.input_r2].transpose().withIndex().collect{ input_pair, index -> + def single_input_r1 = input_pair[0] + def single_input_r2 = input_pair[1] + def newState = state + ["input_r1": single_input_r1, + "input_r2": single_input_r2, + "pool_and_run_id": id, + "n_lanes": state.input_r1.size(), + "lane_sorting": index] + def newId = id + "_" + index + [newId, newState] + } + } + | cutadapt.run( + directives: [label: ["highmem", "midcpu"]], + fromState: { id, state -> + [ + input: state.input_r1, + input_r2: state.input_r2, + no_indels: true, + action: "none", + front_fasta: state.barcodesFasta, + output: "*_001.fastq", + error_rate: 0.10, + demultiplex_mode: "single", + output_r1: state.output_r1, + output_r2: state.output_r2, + ] + }, + toState: { id, result, state -> + def newState = [ + "pool_and_run_id": state.pool_and_run_id, + "n_lanes": state.n_lanes, + "output": result.output, + "lane_sorting": state.lane_sorting, + ] + return newState + } + ) + | flatMap{ id, state -> + // The output from cutadapt should be in the format {name}_R(1|2)_001.fastq + // See https://github.com/viash-hub/biobox/blob/952ff0843093b538cbfd6fefdecf2e7a0bc9e70b/src/cutadapt/script.sh#L226 + // Here, {name} is the name of the sequence in the barcode fasta: https://cutadapt.readthedocs.io/en/v5.0/guide.html#named-adapters + state.output.collect{ p -> + def path_as_string = p.name + // Check for correct output file name format + assert (path_as_string.endsWith("_R1_001.fastq") || path_as_string.endsWith("_R2_001.fastq")), \ + "Expected cutadapt output to contain files ending in '_R1_001.fastq' or _R1_001.fastq' only. Found: ${p}." + // Detect read orientation from file name + def pair_end = path_as_string.endsWith("_R1_001.fastq") ? "R1" : "R2" + // Use the start of the file + def barcode_id = p.name - ~/_R(1|2)_001\.fastq$/ + def new_id = state.pool_and_run_id + "__" + barcode_id + [ + new_id, + [ + "pool_and_run_id": state.pool_and_run_id, + "barcode_id": barcode_id, + "output": p, + "pair_end": pair_end, + "n_lanes": state.n_lanes, + "lane_sorting": state.lane_sorting, + ] + ] + } + } + /* + At this point, the events are provided on the smallest possible level, + as each event represents the reads for a certain orientation from a + particular lane and a single well. Here, we join these events back together + on well level, gathering FASTQS across the lanes and read orientations. + In order to make this joining as efficient as possible, the number of + lanes which are expected to be gathered were stored in the state earlier. + This way, the processing of a well can continue as as soon as all of + the lanes have been gathered. The number of lanes times 2 (forward + and reverse orientation) represents the total number of FASTQS (events) + to be included for a certain well. + */ + | map {id, state -> + def group_key = groupKey(id, state.n_lanes * 2) + return [group_key, state] + } + | groupTuple(by: 0, remainder: true, sort: {a, b -> + // Make sure that the grouped states are in order, + // meaning forward and reverse FASTQs are paired and the FASTQ + // for the forward reads comes before the reverse reads FASTQ. + if (a.lane_sorting == b.lane_sorting) { + return a.pair_end <=> b.pair_end + } + return a.lane_sorting <=> b.lane_sorting + }) + | map {group_key, states -> + // The states are in one long flat list, group them into pairs + // This assumes that the FASTQ files are already in order! + // (See the 'sort' argument of groupTuple above) + def output_pairs = states.collate(2) + + // Sanity check the state + output_pairs.each{ pair -> + assert pair.size() == 2, \ + "State error: expected FASTQ pairs as output from cutadapt, " + + "found output state: $pair" + def (first, second) = pair + def should_be_the_same = [ + "barcode_id", + "pool_and_run_id", + "lane_sorting", + ] + should_be_the_same.each { attr_to_check -> + first_attr = first.get(attr_to_check) + second_attr = second.get(attr_to_check) + assert first_attr == second_attr, \ + "State error: expected FASTQ pairs from cutadapt to have " + + "the same detected ${attr_to_check}. Found: " + + "$first_attr and $second_attr" + } + } + // Forward and reverse reads should be designated + // by 'R1' and 'R2', and sorted lexographically. + def r1_output = output_pairs.collect{ + def forward_output = it[0].output + assert forward_output.name.endsWith("R1_001.fastq"), \ + "State error: expected first item from FASTQ pair to have " + + "orientation 'R1', found ${forward_output.name}." + return it[0].output + } + def r2_output = output_pairs.collect{ + def forward_output = it[1].output + assert forward_output.name.endsWith("R2_001.fastq"), \ + "State error: expected first item from FASTQ pair to have " + + "orientation 'R2', found ${forward_output.name}." + return it[1].output + } + assert r1_output.size() == r2_output.size() + + /* The lane sorting represents the order of the FASTQ files + as provided by the input. The order of the FASTQ files should + remain the same in the well output. This is because the result of STAR + can differ based on the order of the reads in the FASTQ file. + Even when the same reads are provided, the order of them matters. + */ + def lane_sorting = output_pairs.it[0].lane_sorting + def sorting_is_monotonically_increasing = lane_sorting.withIndex().every { i, idx -> + idx == 0 || lane_sorting[idx - 1] <= i + } + assert sorting_is_monotonically_increasing, \ + "State error: expected the order of the FASTQ files after grouping " + + "the cutadapt output to be the same as the order in the input. " + + "Found sorting ${lane_sorting}, R1 output: ${r1_output}, R2 output: ${r2_output}." + + // Here we pick the state from the first item in the list of states + // and overwrite the keys which are different across states + def first_state = states[0] + // The id is the sequence name for the barcode (from the FASTA file). + def new_state = first_state + ["output_r1": r1_output, "output_r2": r2_output] + // group_key.target is an attribute from an object created with nextflow's groupKey() + // It is the Id by which the events were joined using groupTuple + return [group_key.target, new_state] + } + | view {"State after running cutadapt: $it"} + // TODO: Expand this into matching a whitelist/blacklist of barcodes + // ... and turn into separate component + + // This is contatenation of the FASTQ files from different lanes + // Concatenation of FASTQ files from the different runs is done later. + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_txt_r1", + runIf: {id, state -> state.output_r1.size() > 1}, + fromState: { id, state -> + [ + input: state.output_r1, + gzip_output: false, + output: "${state.barcode_id}_R1_001.fastq" + ] + }, + toState: { id, result, state -> + def newState = state + [ output_r1: [ result.output ] ] + return newState + } + ) + | concat_text.run( + directives: [label: ["lowmem", "lowcpu"]], + key: "concat_text_r2", + runIf: {id, state -> state.output_r2.size() > 1}, + fromState: { id, state -> + [ + input: state.output_r2, + gzip_output: false, + output: "${state.barcode_id}_R2_001.fastq", + ] + }, + toState: { id, result, state -> + def newState = state + [ output_r2: [ result.output ] ] + return newState + } + ) + // Group the concatenated files back on pool level + | map {id, state -> + def new_event = [state.pool_and_run_id, state] + return new_event + } + | groupTuple(by: 0, sort: {a, b -> a.barcode_id <=> b.barcode_id}) + | map {id, states -> + def output_r1 = states.collect{it.output_r1}.flatten() + def output_r2 = states.collect{it.output_r2}.flatten() + def pools = states.collect{it.pool_and_run_id} + assert pools.toSet().size() == 1, "Unexpected state: pool ID to be unique. Found: ${pools}." + def output_state = ["output_r1": output_r1, "output_r2": output_r2, "pool_and_run_id": pools[0]] + return [id, output_state] + } + // The concatenation of lanes happens in different work directories (each well is processed a different + // concat_text process). Here we make sure that the FASTQ files are gathered in a single directory. + // This could be skipped when no concatenation was done since cutadapt will output in a directory already. + // But since we are copying symlinks most of the time there is almost no performance penalty here. + | move_files_to_directory.run( + fromState: { id, state -> + [ + "input": state.output_r1 + state.output_r2, + // Remark: the fastq path part may seem superfluous but is necessary for publising later + "output": "fastq/${state.pool_and_run_id}/", + "keep_symbolic_links": true + ] + }, + toState: {id, result, state -> + def new_state = [ + "output_r1": state.output_r1.collect{result.output.resolve(it.name)}, + "output_r2": state.output_r2.collect{result.output.resolve(it.name)}, + ] + new_state + } + ) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/well_demultiplex/nextflow.config b/target/nextflow/workflows/well_demultiplex/nextflow.config new file mode 100644 index 00000000..1ea1c1e1 --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/nextflow.config @@ -0,0 +1,126 @@ +manifest { + name = 'workflows/well_demultiplex' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + description = 'Demultiplexing on well level' + author = 'Dries Schaumont, Marijke Van Moerbeke' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/well_demultiplex/nextflow_labels.config b/target/nextflow/workflows/well_demultiplex/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/workflows/well_demultiplex/nextflow_schema.json b/target/nextflow/workflows/well_demultiplex/nextflow_schema.json new file mode 100644 index 00000000..b703bedc --- /dev/null +++ b/target/nextflow/workflows/well_demultiplex/nextflow_schema.json @@ -0,0 +1,96 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "well_demultiplex", + "description": "Demultiplexing on well level", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Forward reads in FASTQ format", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "Reverse reads in FASTQ format", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "barcodesFasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "output_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "List of demultiplexed fastq files", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"fastq/*_R1_001.fastq\"]`, direction: `output`. ", + "default": [ + "fastq/*_R1_001.fastq" + ] + }, + "output_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "description": "List of demultiplexed fastq files", + "help_text": "Type: `file`, multiple: `True`, required, default: `[\"fastq/*_R2_001.fastq\"]`, direction: `output`. ", + "default": [ + "fastq/*_R2_001.fastq" + ] + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +} diff --git a/target/nextflow/workflows/well_metadata/.config.vsh.yaml b/target/nextflow/workflows/well_metadata/.config.vsh.yaml new file mode 100644 index 00000000..f97f501b --- /dev/null +++ b/target/nextflow/workflows/well_metadata/.config.vsh.yaml @@ -0,0 +1,268 @@ +name: "well_metadata" +namespace: "workflows" +version: "updatecraftbox" +authors: +- name: "Dries Schaumont" + roles: + - "maintainer" + info: + links: + email: "dries@data-intuitive.com" + github: "DriesSchaumont" + orcid: "0000-0002-4389-0440" + linkedin: "dries-schaumont" + organizations: + - name: "Data Intuitive" + href: "https://www.data-intuitive.com" + role: "Data Scientist" +argument_groups: +- name: "Input arguments" + arguments: + - type: "file" + name: "--input_r1" + description: "List of FASTQ files for the forward reads, 1 per well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--input_r2" + description: "List of FASTQ files for the forward reads, 1 per well.\n" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--star_mapping" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: true + multiple_sep: ";" + - type: "file" + name: "--barcodesFasta" + info: null + must_exist: true + create_parent: true + required: true + direction: "input" + multiple: false + multiple_sep: ";" +- name: "Output arguments" + arguments: + - type: "file" + name: "--output_r1" + description: "Output fastq file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--output_r2" + description: "Output fastq file." + info: null + must_exist: true + create_parent: true + required: true + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--pool" + description: "The original pool / sample name" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--well_id" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "string" + name: "--barcode" + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "integer" + name: "--n_wells" + description: "The number of wells in the pool is well is a part of." + info: null + required: false + direction: "output" + multiple: false + multiple_sep: ";" + - type: "file" + name: "--well_star_mapping" + info: null + must_exist: true + create_parent: true + required: false + direction: "output" + multiple: false + multiple_sep: ";" +resources: +- type: "nextflow_script" + path: "main.nf" + is_executable: true + entrypoint: "run_wf" +- type: "file" + path: "nextflow_labels.config" + dest: "nextflow_labels.config" +- type: "file" + path: "_viash.yaml" + dest: "_viash.yaml" +info: null +status: "enabled" +scope: + image: "public" + target: "public" +requirements: + commands: + - "ps" +license: "MIT" +links: + repository: "https://github.com/viash-hub/htrnaseq" +runners: +- type: "nextflow" + id: "nextflow" + directives: + tag: "$id" + auto: + simplifyInput: true + simplifyOutput: false + transcript: false + publish: false + config: + labels: + mem1gb: "memory = 1000000000.B" + mem2gb: "memory = 2000000000.B" + mem5gb: "memory = 5000000000.B" + mem10gb: "memory = 10000000000.B" + mem20gb: "memory = 20000000000.B" + mem50gb: "memory = 50000000000.B" + mem100gb: "memory = 100000000000.B" + mem200gb: "memory = 200000000000.B" + mem500gb: "memory = 500000000000.B" + mem1tb: "memory = 1000000000000.B" + mem2tb: "memory = 2000000000000.B" + mem5tb: "memory = 5000000000000.B" + mem10tb: "memory = 10000000000000.B" + mem20tb: "memory = 20000000000000.B" + mem50tb: "memory = 50000000000000.B" + mem100tb: "memory = 100000000000000.B" + mem200tb: "memory = 200000000000000.B" + mem500tb: "memory = 500000000000000.B" + mem1gib: "memory = 1073741824.B" + mem2gib: "memory = 2147483648.B" + mem4gib: "memory = 4294967296.B" + mem8gib: "memory = 8589934592.B" + mem16gib: "memory = 17179869184.B" + mem32gib: "memory = 34359738368.B" + mem64gib: "memory = 68719476736.B" + mem128gib: "memory = 137438953472.B" + mem256gib: "memory = 274877906944.B" + mem512gib: "memory = 549755813888.B" + mem1tib: "memory = 1099511627776.B" + mem2tib: "memory = 2199023255552.B" + mem4tib: "memory = 4398046511104.B" + mem8tib: "memory = 8796093022208.B" + mem16tib: "memory = 17592186044416.B" + mem32tib: "memory = 35184372088832.B" + mem64tib: "memory = 70368744177664.B" + mem128tib: "memory = 140737488355328.B" + mem256tib: "memory = 281474976710656.B" + mem512tib: "memory = 562949953421312.B" + cpu1: "cpus = 1" + cpu2: "cpus = 2" + cpu5: "cpus = 5" + cpu10: "cpus = 10" + cpu20: "cpus = 20" + cpu50: "cpus = 50" + cpu100: "cpus = 100" + cpu200: "cpus = 200" + cpu500: "cpus = 500" + cpu1000: "cpus = 1000" + script: + - "includeConfig(\"nextflow_labels.config\")" + debug: false + container: "docker" +engines: +- type: "native" + id: "native" +- type: "native" + id: "native" +build_info: + config: "src/workflows/well_metadata/config.vsh.yaml" + runner: "nextflow" + engine: "native|native" + output: "target/nextflow/workflows/well_metadata" + executable: "target/nextflow/workflows/well_metadata/main.nf" + viash_version: "0.9.4" + git_commit: "e6da525fc57aaec74f348eb974b68faa647bf800" + git_remote: "https://github.com/viash-hub/htrnaseq" +package_config: + name: "htrnaseq" + version: "updatecraftbox" + summary: "A workflow for high-throughput RNA-seq data analyses.\n" + description: "This workflow is designed to process high-throughput RNA-seq data,\ + \ where every\nwell of a microarray plate is a sample. A fasta file provided as\ + \ input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow\ + \ is built in a modular fashion, where most of the base functionality\nis provided\ + \ by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\n\ + supplemented by custom base components and workflow components in this package.\n\ + \nThe full workflow is split in two major subworkflows that can be run independently:\n\ + \n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per\ + \ well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate\ + \ QC reports.\n\nEach of those can be started individually, or the full workflow\ + \ can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq)\ + \ \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner)\ + \ where a\nnumber of choices (input/output structure and location) have been made.\n\ + \nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other\ + \ formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex)\ + \ first.\n" + info: + test_resources: + - path: "gs://viash-hub-resources/htrnaseq/v2" + dest: "resources_test" + viash_version: "0.9.4" + source: "src" + target: "target" + config_mods: + - ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script\ + \ += 'includeConfig(\"nextflow_labels.config\")'\n.resources += {path: '/src/config/labels.config',\ + \ dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest:\ + \ '_viash.yaml'}\n" + - ".engines += { type: \"native\" }" + - ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'" + - ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + keywords: + - "bioinformatics" + - "sequencing" + - "high-throughput" + - "RNAseq" + - "mapping" + - "counting" + - "pipeline" + - "workflow" + license: "MIT" + organization: "vsh" + links: + repository: "https://github.com/viash-hub/htrnaseq" + issue_tracker: "https://github.com/viash-hub/htrnaseq/issues" diff --git a/target/nextflow/workflows/well_metadata/_viash.yaml b/target/nextflow/workflows/well_metadata/_viash.yaml new file mode 100644 index 00000000..d08de7b1 --- /dev/null +++ b/target/nextflow/workflows/well_metadata/_viash.yaml @@ -0,0 +1,21 @@ +name: htrnaseq +summary: | + A workflow for high-throughput RNA-seq data analyses. +description: "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n" +license: MIT +keywords: [bioinformatics, sequencing, high-throughput, RNAseq, mapping, counting, pipeline, workflow] +links: + issue_tracker: https://github.com/viash-hub/htrnaseq/issues + repository: https://github.com/viash-hub/htrnaseq +viash_version: 0.9.4 +info: + test_resources: + - path: gs://viash-hub-resources/htrnaseq/v2 + dest: resources_test +config_mods: | + .requirements.commands := ['ps'] + .runners[.type == 'nextflow'].config.script += 'includeConfig("nextflow_labels.config")' + .resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'} + .resources += {path: '/_viash.yaml', dest: '_viash.yaml'} +version: updatecraftbox +organization: vsh diff --git a/target/nextflow/workflows/well_metadata/main.nf b/target/nextflow/workflows/well_metadata/main.nf new file mode 100644 index 00000000..46b4001e --- /dev/null +++ b/target/nextflow/workflows/well_metadata/main.nf @@ -0,0 +1,3608 @@ +// well_metadata updatecraftbox +// +// This wrapper script is auto-generated by viash 0.9.4 and is thus a derivative +// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data +// Intuitive. +// +// The component may contain files which fall under a different license. The +// authors of this component should specify the license in the header of such +// files, or include a separate license file detailing the licenses of all included +// files. +// +// Component authors: +// * Dries Schaumont (maintainer) + +//////////////////////////// +// VDSL3 helper functions // +//////////////////////////// + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_checkArgumentType.nf' +class UnexpectedArgumentTypeException extends Exception { + String errorIdentifier + String stage + String plainName + String expectedClass + String foundClass + + // ${key ? " in module '$key'" : ""}${id ? " id '$id'" : ""} + UnexpectedArgumentTypeException(String errorIdentifier, String stage, String plainName, String expectedClass, String foundClass) { + super("Error${errorIdentifier ? " $errorIdentifier" : ""}:${stage ? " $stage" : "" } argument '${plainName}' has the wrong type. " + + "Expected type: ${expectedClass}. Found type: ${foundClass}") + this.errorIdentifier = errorIdentifier + this.stage = stage + this.plainName = plainName + this.expectedClass = expectedClass + this.foundClass = foundClass + } +} + +/** + * Checks if the given value is of the expected type. If not, an exception is thrown. + * + * @param stage The stage of the argument (input or output) + * @param par The parameter definition + * @param value The value to check + * @param errorIdentifier The identifier to use in the error message + * @return The value, if it is of the expected type + * @throws UnexpectedArgumentTypeException If the value is not of the expected type +*/ +def _checkArgumentType(String stage, Map par, Object value, String errorIdentifier) { + // expectedClass will only be != null if value is not of the expected type + def expectedClass = null + def foundClass = null + + // todo: split if need be + + if (!par.required && value == null) { + expectedClass = null + } else if (par.multiple) { + if (value !instanceof Collection) { + value = [value] + } + + // split strings + value = value.collectMany{ val -> + if (val instanceof String) { + // collect() to ensure that the result is a List and not simply an array + val.split(par.multiple_sep).collect() + } else { + [val] + } + } + + // process globs + if (par.type == "file" && par.direction == "input") { + value = value.collect{ it instanceof String ? file(it, hidden: true) : it }.flatten() + } + + // check types of elements in list + try { + value = value.collect { listVal -> + _checkArgumentType(stage, par + [multiple: false], listVal, errorIdentifier) + } + } catch (UnexpectedArgumentTypeException e) { + expectedClass = "List[${e.expectedClass}]" + foundClass = "List[${e.foundClass}]" + } + } else if (par.type == "string") { + // cast to string if need be. only cast if the value is a GString + if (value instanceof GString) { + value = value as String + } + expectedClass = value instanceof String ? null : "String" + } else if (par.type == "integer") { + // cast to integer if need be + if (value !instanceof Integer) { + try { + value = value as Integer + } catch (NumberFormatException e) { + expectedClass = "Integer" + } + } + } else if (par.type == "long") { + // cast to long if need be + if (value !instanceof Long) { + try { + value = value as Long + } catch (NumberFormatException e) { + expectedClass = "Long" + } + } + } else if (par.type == "double") { + // cast to double if need be + if (value !instanceof Double) { + try { + value = value as Double + } catch (NumberFormatException e) { + expectedClass = "Double" + } + } + } else if (par.type == "float") { + // cast to float if need be + if (value !instanceof Float) { + try { + value = value as Float + } catch (NumberFormatException e) { + expectedClass = "Float" + } + } + } else if (par.type == "boolean" | par.type == "boolean_true" | par.type == "boolean_false") { + // cast to boolean if need be + if (value !instanceof Boolean) { + try { + value = value as Boolean + } catch (Exception e) { + expectedClass = "Boolean" + } + } + } else if (par.type == "file" && (par.direction == "input" || stage == "output")) { + // cast to path if need be + if (value instanceof String) { + value = file(value, hidden: true) + } + if (value instanceof File) { + value = value.toPath() + } + expectedClass = value instanceof Path ? null : "Path" + } else if (par.type == "file" && stage == "input" && par.direction == "output") { + // cast to string if need be + if (value !instanceof String) { + try { + value = value as String + } catch (Exception e) { + expectedClass = "String" + } + } + } else { + // didn't find a match for par.type + expectedClass = par.type + } + + if (expectedClass != null) { + if (foundClass == null) { + foundClass = value.getClass().getName() + } + throw new UnexpectedArgumentTypeException(errorIdentifier, stage, par.plainName, expectedClass, foundClass) + } + + return value +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processInputValues.nf' +Map _processInputValues(Map inputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.required && arg.direction == "input") { + assert inputs.containsKey(arg.plainName) && inputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required input argument '${arg.plainName}' is missing" + } + } + + inputs = inputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid input argument" + + value = _checkArgumentType("input", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return inputs +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/arguments/_processOutputValues.nf' +Map _checkValidOutputArgument(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + outputs = outputs.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && it.direction == "output" } + assert par != null : "Error in module '${key}' id '${id}': '${name}' is not a valid output argument" + + value = _checkArgumentType("output", par, value, "in module '$key' id '$id'") + + [ name, value ] + } + } + return outputs +} + +void _checkAllRequiredOuputsPresent(Map outputs, Map config, String id, String key) { + if (!workflow.stubRun) { + config.allArguments.each { arg -> + if (arg.direction == "output" && arg.required) { + assert outputs.containsKey(arg.plainName) && outputs.get(arg.plainName) != null : + "Error in module '${key}' id '${id}': required output argument '${arg.plainName}' is missing" + } + } + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/IDChecker.nf' +class IDChecker { + final def items = [] as Set + + @groovy.transform.WithWriteLock + boolean observe(String item) { + if (items.contains(item)) { + return false + } else { + items << item + return true + } + } + + @groovy.transform.WithReadLock + boolean contains(String item) { + return items.contains(item) + } + + @groovy.transform.WithReadLock + Set getItems() { + return items.clone() + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_checkUniqueIds.nf' + +/** + * Check if the ids are unique across parameter sets + * + * @param parameterSets a list of parameter sets. + */ +private void _checkUniqueIds(List>> parameterSets) { + def ppIds = parameterSets.collect{it[0]} + assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds" +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_getChild.nf' + +// helper functions for reading params from file // +def _getChild(parent, child) { + if (child.contains("://") || java.nio.file.Paths.get(child).isAbsolute()) { + child + } else { + def parentAbsolute = java.nio.file.Paths.get(parent).toAbsolutePath().toString() + parentAbsolute.replaceAll('/[^/]*$', "/") + child + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_parseParamList.nf' +/** + * Figure out the param list format based on the file extension + * + * @param param_list A String containing the path to the parameter list file. + * + * @return A String containing the format of the parameter list file. + */ +def _paramListGuessFormat(param_list) { + if (param_list !instanceof String) { + "asis" + } else if (param_list.endsWith(".csv")) { + "csv" + } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) { + "json" + } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) { + "yaml" + } else { + "yaml_blob" + } +} + + +/** + * Read the param list + * + * @param param_list One of the following: + * - A String containing the path to the parameter list file (csv, json or yaml), + * - A yaml blob of a list of maps (yaml_blob), + * - Or a groovy list of maps (asis). + * @param config A Map of the Viash configuration. + * + * @return A List of Maps containing the parameters. + */ +def _parseParamList(param_list, Map config) { + // first determine format by extension + def paramListFormat = _paramListGuessFormat(param_list) + + def paramListPath = (paramListFormat != "asis" && paramListFormat != "yaml_blob") ? + file(param_list, hidden: true) : + null + + // get the correct parser function for the detected params_list format + def paramSets = [] + if (paramListFormat == "asis") { + paramSets = param_list + } else if (paramListFormat == "yaml_blob") { + paramSets = readYamlBlob(param_list) + } else if (paramListFormat == "yaml") { + paramSets = readYaml(paramListPath) + } else if (paramListFormat == "json") { + paramSets = readJson(paramListPath) + } else if (paramListFormat == "csv") { + paramSets = readCsv(paramListPath) + } else { + error "Format of provided --param_list not recognised.\n" + + "Found: '$paramListFormat'.\n" + + "Expected: a csv file, a json file, a yaml file,\n" + + "a yaml blob or a groovy list of maps." + } + + // data checks + assert paramSets instanceof List: "--param_list should contain a list of maps" + for (value in paramSets) { + assert value instanceof Map: "--param_list should contain a list of maps" + } + + // id is argument + def idIsArgument = config.allArguments.any{it.plainName == "id"} + + // Reformat from List to List> by adding the ID as first element of a Tuple2 + paramSets = paramSets.collect({ data -> + def id = data.id + if (!idIsArgument) { + data = data.findAll{k, v -> k != "id"} + } + [id, data] + }) + + // Split parameters with 'multiple: true' + paramSets = paramSets.collect({ id, data -> + data = _splitParams(data, config) + [id, data] + }) + + // The paths of input files inside a param_list file may have been specified relatively to the + // location of the param_list file. These paths must be made absolute. + if (paramListPath) { + paramSets = paramSets.collect({ id, data -> + def new_data = data.collectEntries{ parName, parValue -> + def par = config.allArguments.find{it.plainName == parName} + if (par && par.type == "file" && par.direction == "input") { + if (parValue instanceof Collection) { + parValue = parValue.collectMany{path -> + def x = _resolveSiblingIfNotAbsolute(path, paramListPath) + x instanceof Collection ? x : [x] + } + } else { + parValue = _resolveSiblingIfNotAbsolute(parValue, paramListPath) + } + } + [parName, parValue] + } + [id, new_data] + }) + } + + return paramSets +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/_splitParams.nf' +/** + * Split parameters for arguments that accept multiple values using their separator + * + * @param paramList A Map containing parameters to split. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A Map of parameters where the parameter values have been split into a list using + * their seperator. + */ +Map _splitParams(Map parValues, Map config){ + def parsedParamValues = parValues.collectEntries { parName, parValue -> + def parameterSettings = config.allArguments.find({it.plainName == parName}) + + if (!parameterSettings) { + // if argument is not found, do not alter + return [parName, parValue] + } + if (parameterSettings.multiple) { // Check if parameter can accept multiple values + if (parValue instanceof Collection) { + parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it } + } else if (parValue instanceof String) { + parValue = parValue.split(parameterSettings.multiple_sep) + } else if (parValue == null) { + parValue = [] + } else { + parValue = [ parValue ] + } + parValue = parValue.flatten() + } + // For all parameters check if multiple values are only passed for + // arguments that allow it. Quietly simplify lists of length 1. + if (!parameterSettings.multiple && parValue instanceof Collection) { + assert parValue.size() == 1 : + "Error: argument ${parName} has too many values.\n" + + " Expected amount: 1. Found: ${parValue.size()}" + parValue = parValue[0] + } + [parName, parValue] + } + return parsedParamValues +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/channelFromParams.nf' +/** + * Parse nextflow parameters based on settings defined in a viash config. + * Return a list of parameter sets, each parameter set corresponding to + * an event in a nextflow channel. The output from this function can be used + * with Channel.fromList to create a nextflow channel with Vdsl3 formatted + * events. + * + * This function performs: + * - A filtering of the params which can be found in the config file. + * - Process the params_list argument which allows a user to to initialise + * a Vsdl3 channel with multiple parameter sets. Possible formats are + * csv, json, yaml, or simply a yaml_blob. A csv should have column names + * which correspond to the different arguments of this pipeline. A json or a yaml + * file should be a list of maps, each of which has keys corresponding to the + * arguments of the pipeline. A yaml blob can also be passed directly as a parameter. + * When passing a csv, json or yaml, relative path names are relativized to the + * location of the parameter file. + * - Combine the parameter sets into a vdsl3 Channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A list of parameters with the first element of the event being + * the event ID and the second element containing a map of the parsed parameters. + */ + +private List>> _paramsToParamSets(Map params, Map config){ + // todo: fetch key from run args + def key_ = config.name + + /* parse regular parameters (not in param_list) */ + /*************************************************/ + def globalParams = config.allArguments + .findAll { params.containsKey(it.plainName) } + .collectEntries { [ it.plainName, params[it.plainName] ] } + def globalID = params.get("id", null) + + /* process params_list arguments */ + /*********************************/ + def paramList = params.containsKey("param_list") && params.param_list != null ? + params.param_list : [] + // if (paramList instanceof String) { + // paramList = [paramList] + // } + // def paramSets = paramList.collectMany{ _parseParamList(it, config) } + // TODO: be able to process param_list when it is a list of strings + def paramSets = _parseParamList(paramList, config) + if (paramSets.isEmpty()) { + paramSets = [[null, [:]]] + } + + /* combine arguments into channel */ + /**********************************/ + def processedParams = paramSets.indexed().collect{ index, tup -> + // Process ID + def id = tup[0] ?: globalID + + if (workflow.stubRun && !id) { + // if stub run, explicitly add an id if missing + id = "stub${index}" + } + assert id != null: "Each parameter set should have at least an 'id'" + + // Process params + def parValues = globalParams + tup[1] + // // Remove parameters which are null, if the default is also null + // parValues = parValues.collectEntries{paramName, paramValue -> + // parameterSettings = config.functionality.allArguments.find({it.plainName == paramName}) + // if ( paramValue != null || parameterSettings.get("default", null) != null ) { + // [paramName, paramValue] + // } + // } + parValues = parValues.collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + assert par != null : "Error in module '${key_}' id '${id}': '${name}' is not a valid input argument" + + if (par == null) { + return [:] + } + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + + [ name, value ] + } + + [id, parValues] + } + + // Check if ids (first element of each list) is unique + _checkUniqueIds(processedParams) + return processedParams +} + +/** + * Parse nextflow parameters based on settings defined in a viash config + * and return a nextflow channel. + * + * @param params Input parameters. Can optionaly contain a 'param_list' key that + * provides a list of arguments that can be split up into multiple events + * in the output channel possible formats of param_lists are: a csv file, + * json file, a yaml file or a yaml blob. Each parameters set (event) must + * have a unique ID. + * @param config A Map of the Viash configuration. This Map can be generated from the config file + * using the readConfig() function. + * + * @return A nextflow Channel with events. Events are formatted as a tuple that contains + * first contains the ID of the event and as second element holds a parameter map. + * + * + */ +def channelFromParams(Map params, Map config) { + def processedParams = _paramsToParamSets(params, config) + return Channel.fromList(processedParams) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/checkUniqueIds.nf' +def checkUniqueIds(Map args) { + def stopOnError = args.stopOnError == null ? args.stopOnError : true + + def idChecker = new IDChecker() + + return filter { tup -> + if (!idChecker.observe(tup[0])) { + if (stopOnError) { + error "Duplicate id: ${tup[0]}" + } else { + log.warn "Duplicate id: ${tup[0]}, removing duplicate entry" + return false + } + } + return true + } +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/preprocessInputs.nf' +// This helper file will be deprecated soon +preprocessInputsDeprecationWarningPrinted = false + +def preprocessInputsDeprecationWarning() { + if (!preprocessInputsDeprecationWarningPrinted) { + preprocessInputsDeprecationWarningPrinted = true + System.err.println("Warning: preprocessInputs() is deprecated and will be removed in Viash 0.9.0.") + } +} + +/** + * Generate a nextflow Workflow that allows processing a channel of + * Vdsl3 formatted events and apply a Viash config to them: + * - Gather default parameters from the Viash config and make + * sure that they are correctly formatted (see applyConfig method). + * - Format the input parameters (also using the applyConfig method). + * - Apply the default parameter to the input parameters. + * - Do some assertions: + * ~ Check if the event IDs in the channel are unique. + * + * The events in the channel are formatted as tuples, with the + * first element of the tuples being a unique id of the parameter set, + * and the second element containg the the parameters themselves. + * Optional extra elements of the tuples will be passed to the output as is. + * + * @param args A map that must contain a 'config' key that points + * to a parsed config (see readConfig()). Optionally, a + * 'key' key can be provided which can be used to create a unique + * name for the workflow process. + * + * @return A workflow that allows processing a channel of Vdsl3 formatted events + * and apply a Viash config to them. + */ +def preprocessInputs(Map args) { + preprocessInputsDeprecationWarning() + + def config = args.config + assert config instanceof Map : + "Error in preprocessInputs: config must be a map. " + + "Expected class: Map. Found: config.getClass() is ${config.getClass()}" + def key_ = args.key ?: config.name + + // Get different parameter types (used throughout this function) + def defaultArgs = config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + map { tup -> + def id = tup[0] + def data = tup[1] + def passthrough = tup.drop(2) + + def new_data = (defaultArgs + data).collectEntries { name, value -> + def par = config.allArguments.find { it.plainName == name && (it.direction == "input" || it.type == "file") } + + if (par != null) { + value = _checkArgumentType("input", par, value, "in module '$key_' id '$id'") + } + + [ name, value ] + } + + [ id, new_data ] + passthrough + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runComponents.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component config. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component config. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component config. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component config. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runComponents(Map args) { + log.warn("runComponents is deprecated, use runEach instead") + assert args.components: "runComponents should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runComponents" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def id_ = args.id + + workflow runComponentsWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def comp_config = comp_.config + + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_config) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + // def new_id = id_(tup[0], tup[1], comp_config) + def new_id = tup[0] + if (id_ instanceof String) { + new_id = id_ + } else if (id_ instanceof Closure) { + new_id = id_(new_id, tup[1], comp_config) + } + [new_id] + tup.drop(1) + } + : filter_ch + def data_ch = id_ch | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_config) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_config) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + post_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runComponentsWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/runEach.nf' +/** + * Run a list of components on a stream of data. + * + * @param components: list of Viash VDSL3 modules to run + * @param fromState: a closure, a map or a list of keys to extract from the input data. + * If a closure, it will be called with the id, the data and the component itself. + * @param toState: a closure, a map or a list of keys to extract from the output data + * If a closure, it will be called with the id, the output data, the old state and the component itself. + * @param filter: filter function to apply to the input. + * It will be called with the id, the data and the component itself. + * @param id: id to use for the output data + * If a closure, it will be called with the id, the data and the component itself. + * @param auto: auto options to pass to the components + * + * @return: a workflow that runs the components + **/ +def runEach(Map args) { + assert args.components: "runEach should be passed a list of components to run" + + def components_ = args.components + if (components_ !instanceof List) { + components_ = [ components_ ] + } + assert components_.size() > 0: "pass at least one component to runEach" + + def fromState_ = args.fromState + def toState_ = args.toState + def filter_ = args.filter + def runIf_ = args.runIf + def id_ = args.id + + assert !runIf_ || runIf_ instanceof Closure: "runEach: must pass a Closure to runIf." + + workflow runEachWf { + take: input_ch + main: + + // generate one channel per method + out_chs = components_.collect{ comp_ -> + def filter_ch = filter_ + ? input_ch | filter{tup -> + filter_(tup[0], tup[1], comp_) + } + : input_ch + def id_ch = id_ + ? filter_ch | map{tup -> + def new_id = id_ + if (new_id instanceof Closure) { + new_id = new_id(tup[0], tup[1], comp_) + } + assert new_id instanceof String : "Error in runEach: id should be a String or a Closure that returns a String. Expected: id instanceof String. Found: ${new_id.getClass()}" + [new_id] + tup.drop(1) + } + : filter_ch + def chPassthrough = null + def chRun = null + if (runIf_) { + def idRunIfBranch = id_ch.branch{ tup -> + run: runIf_(tup[0], tup[1], comp_) + passthrough: true + } + chPassthrough = idRunIfBranch.passthrough + chRun = idRunIfBranch.run + } else { + chRun = id_ch + chPassthrough = Channel.empty() + } + def data_ch = chRun | map{tup -> + def new_data = tup[1] + if (fromState_ instanceof Map) { + new_data = fromState_.collectEntries{ key0, key1 -> + [key0, new_data[key1]] + } + } else if (fromState_ instanceof List) { + new_data = fromState_.collectEntries{ key -> + [key, new_data[key]] + } + } else if (fromState_ instanceof Closure) { + new_data = fromState_(tup[0], new_data, comp_) + } + tup.take(1) + [new_data] + tup.drop(1) + } + def out_ch = data_ch + | comp_.run( + auto: (args.auto ?: [:]) + [simplifyInput: false, simplifyOutput: false] + ) + def post_ch = toState_ + ? out_ch | map{tup -> + def output = tup[1] + def old_state = tup[2] + def new_state = null + if (toState_ instanceof Map) { + new_state = old_state + toState_.collectEntries{ key0, key1 -> + [key0, output[key1]] + } + } else if (toState_ instanceof List) { + new_state = old_state + toState_.collectEntries{ key -> + [key, output[key]] + } + } else if (toState_ instanceof Closure) { + new_state = toState_(tup[0], output, old_state, comp_) + } + [tup[0], new_state] + tup.drop(3) + } + : out_ch + + def return_ch = post_ch + | concat(chPassthrough) + + return_ch + } + + // mix all results + output_ch = + (out_chs.size == 1) + ? out_chs[0] + : out_chs[0].mix(*out_chs.drop(1)) + + emit: output_ch + } + + return runEachWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/channel/safeJoin.nf' +/** + * Join sourceChannel to targetChannel + * + * This function joins the sourceChannel to the targetChannel. + * However, each id in the targetChannel must be present in the + * sourceChannel. If _meta.join_id exists in the targetChannel, that is + * used as an id instead. If the id doesn't match any id in the sourceChannel, + * an error is thrown. + */ + +def safeJoin(targetChannel, sourceChannel, key) { + def sourceIDs = new IDChecker() + + def sourceCheck = sourceChannel + | map { tup -> + sourceIDs.observe(tup[0]) + tup + } + def targetCheck = targetChannel + | map { tup -> + def id = tup[0] + + if (!sourceIDs.contains(id)) { + error ( + "Error in module '${key}' when merging output with original state.\n" + + " Reason: output with id '${id}' could not be joined with source channel.\n" + + " If the IDs in the output channel differ from the input channel,\n" + + " please set `tup[1]._meta.join_id to the original ID.\n" + + " Original IDs in input channel: ['${sourceIDs.getItems().join("', '")}'].\n" + + " Unexpected ID in the output channel: '${id}'.\n" + + " Example input event: [\"id\", [input: file(...)]],\n" + + " Example output event: [\"newid\", [output: file(...), _meta: [join_id: \"id\"]]]" + ) + } + // TODO: add link to our documentation on how to fix this + + tup + } + + sourceCheck.cross(targetChannel) + | map{ left, right -> + right + left.drop(1) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/_processArgument.nf' +def _processArgument(arg) { + arg.multiple = arg.multiple != null ? arg.multiple : false + arg.required = arg.required != null ? arg.required : false + arg.direction = arg.direction != null ? arg.direction : "input" + arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ";" + arg.plainName = arg.name.replaceAll("^-*", "") + + if (arg.type == "file") { + arg.must_exist = arg.must_exist != null ? arg.must_exist : true + arg.create_parent = arg.create_parent != null ? arg.create_parent : true + } + + // add default values to output files which haven't already got a default + if (arg.type == "file" && arg.direction == "output" && arg.default == null) { + def mult = arg.multiple ? "_*" : "" + def extSearch = "" + if (arg.default != null) { + extSearch = arg.default + } else if (arg.example != null) { + extSearch = arg.example + } + if (extSearch instanceof List) { + extSearch = extSearch[0] + } + def extSearchResult = extSearch.find("\\.[^\\.]+\$") + def ext = extSearchResult != null ? extSearchResult : "" + arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}" + if (arg.multiple) { + arg.default = [arg.default] + } + } + + if (!arg.multiple) { + if (arg.default != null && arg.default instanceof List) { + arg.default = arg.default[0] + } + if (arg.example != null && arg.example instanceof List) { + arg.example = arg.example[0] + } + } + + if (arg.type == "boolean_true") { + arg.default = false + } + if (arg.type == "boolean_false") { + arg.default = true + } + + arg +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/addGlobalParams.nf' +def addGlobalArguments(config) { + def localConfig = [ + "argument_groups": [ + [ + "name": "Nextflow input-output arguments", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "arguments" : [ + [ + 'name': '--publish_dir', + 'required': true, + 'type': 'string', + 'description': 'Path to an output directory.', + 'example': 'output/', + 'multiple': false + ], + [ + 'name': '--param_list', + 'required': false, + 'type': 'string', + 'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob. + | + |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`. + |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`. + |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`. + |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`. + | + |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(), + 'example': 'my_params.yaml', + 'multiple': false, + 'hidden': true + ] + // TODO: allow multiple: true in param_list? + // TODO: allow to specify a --param_list_regex to filter the param_list? + // TODO: allow to specify a --param_list_from_state to remap entries in the param_list? + ] + ] + ] + ] + + return processConfig(_mergeMap(config, localConfig)) +} + +def _mergeMap(Map lhs, Map rhs) { + return rhs.inject(lhs.clone()) { map, entry -> + if (map[entry.key] instanceof Map && entry.value instanceof Map) { + map[entry.key] = _mergeMap(map[entry.key], entry.value) + } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) { + map[entry.key] += entry.value + } else { + map[entry.key] = entry.value + } + return map + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/generateHelp.nf' +def _generateArgumentHelp(param) { + // alternatives are not supported + // def names = param.alternatives ::: List(param.name) + + def unnamedProps = [ + ["required parameter", param.required], + ["multiple values allowed", param.multiple], + ["output", param.direction.toLowerCase() == "output"], + ["file must exist", param.type == "file" && param.must_exist] + ].findAll{it[1]}.collect{it[0]} + + def dflt = null + if (param.default != null) { + if (param.default instanceof List) { + dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + dflt = param.default.toString() + } + } + def example = null + if (param.example != null) { + if (param.example instanceof List) { + example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ") + } else { + example = param.example.toString() + } + } + def min = param.min?.toString() + def max = param.max?.toString() + + def escapeChoice = { choice -> + def s1 = choice.replaceAll("\\n", "\\\\n") + def s2 = s1.replaceAll("\"", """\\\"""") + s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2 + } + def choices = param.choices == null ? + null : + "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]" + + def namedPropsStr = [ + ["type", ([param.type] + unnamedProps).join(", ")], + ["default", dflt], + ["example", example], + ["choices", choices], + ["min", min], + ["max", max] + ] + .findAll{it[1]} + .collect{"\n " + it[0] + ": " + it[1].replaceAll("\n", "\\n")} + .join("") + + def descStr = param.description == null ? + "" : + _paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n ") + + "\n --" + param.plainName + + namedPropsStr + + descStr +} + +// Based on Helper.generateHelp() in Helper.scala +def _generateHelp(config) { + def fun = config + + // PART 1: NAME AND VERSION + def nameStr = fun.name + + (fun.version == null ? "" : " " + fun.version) + + // PART 2: DESCRIPTION + def descrStr = fun.description == null ? + "" : + "\n\n" + _paragraphWrap(fun.description.trim(), 80).join("\n") + + // PART 3: Usage + def usageStr = fun.usage == null ? + "" : + "\n\nUsage:\n" + fun.usage.trim() + + // PART 4: Options + def argGroupStrs = fun.allArgumentGroups.collect{argGroup -> + def name = argGroup.name + def descriptionStr = argGroup.description == null ? + "" : + "\n " + _paragraphWrap(argGroup.description.trim(), 80-4).join("\n ") + "\n" + def arguments = argGroup.arguments.collect{arg -> + arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg + }.findAll{it != null} + def argumentStrs = arguments.collect{param -> _generateArgumentHelp(param)} + + "\n\n$name:" + + descriptionStr + + argumentStrs.join("\n") + } + + // FINAL: combine + def out = nameStr + + descrStr + + usageStr + + argGroupStrs.join("") + + return out +} + +// based on Format._paragraphWrap +def _paragraphWrap(str, maxLength) { + def outLines = [] + str.split("\n").each{par -> + def words = par.split("\\s").toList() + + def word = null + def line = words.pop() + while(!words.isEmpty()) { + word = words.pop() + if (line.length() + word.length() + 1 <= maxLength) { + line = line + " " + word + } else { + outLines.add(line) + line = word + } + } + if (words.isEmpty()) { + outLines.add(line) + } + } + return outLines +} + +def helpMessage(config) { + if (params.containsKey("help") && params.help) { + def mergedConfig = addGlobalArguments(config) + def helpStr = _generateHelp(mergedConfig) + println(helpStr) + exit 0 + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/processConfig.nf' +def processConfig(config) { + // set defaults for arguments + config.arguments = + (config.arguments ?: []).collect{_processArgument(it)} + + // set defaults for argument_group arguments + config.argument_groups = + (config.argument_groups ?: []).collect{grp -> + grp.arguments = (grp.arguments ?: []).collect{_processArgument(it)} + grp + } + + // create combined arguments list + config.allArguments = + config.arguments + + config.argument_groups.collectMany{it.arguments} + + // add missing argument groups (based on Functionality::allArgumentGroups()) + def argGroups = config.argument_groups + if (argGroups.any{it.name.toLowerCase() == "arguments"}) { + argGroups = argGroups.collect{ grp -> + if (grp.name.toLowerCase() == "arguments") { + grp = grp + [ + arguments: grp.arguments + config.arguments + ] + } + grp + } + } else { + argGroups = argGroups + [ + name: "Arguments", + arguments: config.arguments + ] + } + config.allArgumentGroups = argGroups + + config +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/config/readConfig.nf' + +def readConfig(file) { + def config = readYaml(file ?: moduleDir.resolve("config.vsh.yaml")) + processConfig(config) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_resolveSiblingIfNotAbsolute.nf' +/** + * Resolve a path relative to the current file. + * + * @param str The path to resolve, as a String. + * @param parentPath The path to resolve relative to, as a Path. + * + * @return The path that may have been resovled, as a Path. + */ +def _resolveSiblingIfNotAbsolute(str, parentPath) { + if (str !instanceof String) { + return str + } + if (!_stringIsAbsolutePath(str)) { + return parentPath.resolveSibling(str) + } else { + return file(str, hidden: true) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/_stringIsAbsolutePath.nf' +/** + * Check whether a path as a string is absolute. + * + * In the past, we tried using `file(., relative: true).isAbsolute()`, + * but the 'relative' option was added in 22.10.0. + * + * @param path The path to check, as a String. + * + * @return Whether the path is absolute, as a boolean. + */ +def _stringIsAbsolutePath(path) { + def _resolve_URL_PROTOCOL = ~/^([a-zA-Z][a-zA-Z0-9]*:)?\\/.+/ + + assert path instanceof String + return _resolve_URL_PROTOCOL.matcher(path).matches() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/collectTraces.nf' +class CustomTraceObserver implements nextflow.trace.TraceObserver { + List traces + + CustomTraceObserver(List traces) { + this.traces = traces + } + + @Override + void onProcessComplete(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } + + @Override + void onProcessCached(nextflow.processor.TaskHandler handler, nextflow.trace.TraceRecord trace) { + def trace2 = trace.store.clone() + trace2.script = null + traces.add(trace2) + } +} + +def collectTraces() { + def traces = Collections.synchronizedList([]) + + // add custom trace observer which stores traces in the traces object + session.observers.add(new CustomTraceObserver(traces)) + + traces +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/deepClone.nf' +/** + * Performs a deep clone of the given object. + * @param x an object + */ +def deepClone(x) { + iterateMap(x, {it instanceof Cloneable ? it.clone() : it}) +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getPublishDir.nf' +def getPublishDir() { + return params.containsKey("publish_dir") ? params.publish_dir : + params.containsKey("publishDir") ? params.publishDir : + null +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/getRootDir.nf' + +// Recurse upwards until we find a '.build.yaml' file +def _findBuildYamlFile(pathPossiblySymlink) { + def path = pathPossiblySymlink.toRealPath() + def child = path.resolve(".build.yaml") + if (java.nio.file.Files.isDirectory(path) && java.nio.file.Files.exists(child)) { + return child + } else { + def parent = path.getParent() + if (parent == null) { + return null + } else { + return _findBuildYamlFile(parent) + } + } +} + +// get the root of the target folder +def getRootDir() { + def dir = _findBuildYamlFile(meta.resources_dir) + assert dir != null: "Could not find .build.yaml in the folder structure" + dir.getParent() +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/iterateMap.nf' +/** + * Recursively apply a function over the leaves of an object. + * @param obj The object to iterate over. + * @param fun The function to apply to each value. + * @return The object with the function applied to each value. + */ +def iterateMap(obj, fun) { + if (obj instanceof List && obj !instanceof String) { + return obj.collect{item -> + iterateMap(item, fun) + } + } else if (obj instanceof Map) { + return obj.collectEntries{key, item -> + [key.toString(), iterateMap(item, fun)] + } + } else { + return fun(obj) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/functions/niceView.nf' +/** + * A view for printing the event of each channel as a YAML blob. + * This is useful for debugging. + */ +def niceView() { + workflow niceViewWf { + take: input + main: + output = input + | view{toYamlBlob(it)} + emit: output + } + return niceViewWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readCsv.nf' + +def readCsv(file_path) { + def output = [] + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + + // todo: allow escaped quotes in string + // todo: allow single quotes? + def splitRegex = java.util.regex.Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''') + def removeQuote = java.util.regex.Pattern.compile('''"(.*)"''') + + def br = java.nio.file.Files.newBufferedReader(inputFile) + + def row = -1 + def header = null + while (br.ready() && header == null) { + def line = br.readLine() + row++ + if (!line.startsWith("#")) { + header = splitRegex.split(line, -1).collect{field -> + m = removeQuote.matcher(field) + m.find() ? m.replaceFirst('$1') : field + } + } + } + assert header != null: "CSV file should contain a header" + + while (br.ready()) { + def line = br.readLine() + row++ + if (line == null) { + br.close() + break + } + + if (!line.startsWith("#")) { + def predata = splitRegex.split(line, -1) + def data = predata.collect{field -> + if (field == "") { + return null + } + def m = removeQuote.matcher(field) + if (m.find()) { + return m.replaceFirst('$1') + } else { + return field + } + } + assert header.size() == data.size(): "Row $row should contain the same number as fields as the header" + + def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null} + output.add(dataMap) + } + } + + output +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJson.nf' +def readJson(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parse(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readJsonBlob.nf' +def readJsonBlob(str) { + def jsonSlurper = new groovy.json.JsonSlurper() + jsonSlurper.parseText(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readTaggedYaml.nf' +// Custom constructor to modify how certain objects are parsed from YAML +class CustomConstructor extends org.yaml.snakeyaml.constructor.Constructor { + Path root + + class ConstructPath extends org.yaml.snakeyaml.constructor.AbstractConstruct { + public Object construct(org.yaml.snakeyaml.nodes.Node node) { + String filename = (String) constructScalar(node); + if (root != null) { + return root.resolve(filename); + } + return java.nio.file.Paths.get(filename); + } + } + + CustomConstructor(org.yaml.snakeyaml.LoaderOptions options, Path root) { + super(options) + this.root = root + // Handling !file tag and parse it back to a File type + this.yamlConstructors.put(new org.yaml.snakeyaml.nodes.Tag("!file"), new ConstructPath()) + } +} + +def readTaggedYaml(Path path) { + def options = new org.yaml.snakeyaml.LoaderOptions() + def constructor = new CustomConstructor(options, path.getParent()) + def yaml = new org.yaml.snakeyaml.Yaml(constructor) + return yaml.load(path.text) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYaml.nf' +def readYaml(file_path) { + def inputFile = file_path !instanceof Path ? file(file_path, hidden: true) : file_path + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(inputFile) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/readYamlBlob.nf' +def readYamlBlob(str) { + def yamlSlurper = new org.yaml.snakeyaml.Yaml() + yamlSlurper.load(str) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toJsonBlob.nf' +String toJsonBlob(data) { + return groovy.json.JsonOutput.toJson(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toTaggedYamlBlob.nf' +// Custom representer to modify how certain objects are represented in YAML +class CustomRepresenter extends org.yaml.snakeyaml.representer.Representer { + Path relativizer + + class RepresentPath implements org.yaml.snakeyaml.representer.Represent { + public String getFileName(Object obj) { + if (obj instanceof File) { + obj = ((File) obj).toPath(); + } + if (obj !instanceof Path) { + throw new IllegalArgumentException("Object: " + obj + " is not a Path or File"); + } + def path = (Path) obj; + + if (relativizer != null) { + return relativizer.relativize(path).toString() + } else { + return path.toString() + } + } + + public org.yaml.snakeyaml.nodes.Node representData(Object data) { + String filename = getFileName(data); + def tag = new org.yaml.snakeyaml.nodes.Tag("!file"); + return representScalar(tag, filename); + } + } + CustomRepresenter(org.yaml.snakeyaml.DumperOptions options, Path relativizer) { + super(options) + this.relativizer = relativizer + this.representers.put(sun.nio.fs.UnixPath, new RepresentPath()) + this.representers.put(Path, new RepresentPath()) + this.representers.put(File, new RepresentPath()) + } +} + +String toTaggedYamlBlob(data) { + return toRelativeTaggedYamlBlob(data, null) +} +String toRelativeTaggedYamlBlob(data, Path relativizer) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + def representer = new CustomRepresenter(options, relativizer) + def yaml = new org.yaml.snakeyaml.Yaml(representer, options) + return yaml.dump(data) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/toYamlBlob.nf' +String toYamlBlob(data) { + def options = new org.yaml.snakeyaml.DumperOptions() + options.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK) + options.setPrettyFlow(true) + def yaml = new org.yaml.snakeyaml.Yaml(options) + def cleanData = iterateMap(data, { it instanceof Path ? it.toString() : it }) + return yaml.dump(cleanData) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeJson.nf' +void writeJson(data, file) { + assert data: "writeJson: data should not be null" + assert file: "writeJson: file should not be null" + file.write(toJsonBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/readwrite/writeYaml.nf' +void writeYaml(data, file) { + assert data: "writeYaml: data should not be null" + assert file: "writeYaml: file should not be null" + file.write(toYamlBlob(data)) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/findStates.nf' +def findStates(Map params, Map config) { + def auto_config = deepClone(config) + def auto_params = deepClone(params) + + auto_config = auto_config.clone() + // override arguments + auto_config.argument_groups = [] + auto_config.arguments = [ + [ + type: "string", + name: "--id", + description: "A dummy identifier", + required: false + ], + [ + type: "file", + name: "--input_states", + example: "/path/to/input/directory/**/state.yaml", + description: "Path to input directory containing the datasets to be integrated.", + required: true, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--filter", + example: "foo/.*/state.yaml", + description: "Regex to filter state files by path.", + required: false + ], + // to do: make this a yaml blob? + [ + type: "string", + name: "--rename_keys", + example: ["newKey1:oldKey1", "newKey2:oldKey2"], + description: "Rename keys in the detected input files. This is useful if the input files do not match the set of input arguments of the workflow.", + required: false, + multiple: true, + multiple_sep: ";" + ], + [ + type: "string", + name: "--settings", + example: '{"output_dataset": "dataset.h5ad", "k": 10}', + description: "Global arguments as a JSON glob to be passed to all components.", + required: false + ] + ] + if (!(auto_params.containsKey("id"))) { + auto_params["id"] = "auto" + } + + // run auto config through processConfig once more + auto_config = processConfig(auto_config) + + workflow findStatesWf { + helpMessage(auto_config) + + output_ch = + channelFromParams(auto_params, auto_config) + | flatMap { autoId, args -> + + def globalSettings = args.settings ? readYamlBlob(args.settings) : [:] + + // look for state files in input dir + def stateFiles = args.input_states + + // filter state files by regex + if (args.filter) { + stateFiles = stateFiles.findAll{ stateFile -> + def stateFileStr = stateFile.toString() + def matcher = stateFileStr =~ args.filter + matcher.matches()} + } + + // read in states + def states = stateFiles.collect { stateFile -> + def state_ = readTaggedYaml(stateFile) + [state_.id, state_] + } + + // construct renameMap + if (args.rename_keys) { + def renameMap = args.rename_keys.collectEntries{renameString -> + def split = renameString.split(":") + assert split.size() == 2: "Argument 'rename_keys' should be of the form 'newKey:oldKey', or 'newKey:oldKey;newKey:oldKey' in case of multiple values" + split + } + + // rename keys in state, only let states through which have all keys + // also add global settings + states = states.collectMany{id, state -> + def newState = [:] + + for (key in renameMap.keySet()) { + def origKey = renameMap[key] + if (!(state.containsKey(origKey))) { + return [] + } + newState[key] = state[origKey] + } + + [[id, globalSettings + newState]] + } + } + + states + } + emit: + output_ch + } + + return findStatesWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/joinStates.nf' +def joinStates(Closure apply_) { + workflow joinStatesWf { + take: input_ch + main: + output_ch = input_ch + | toSortedList + | filter{ it.size() > 0 } + | map{ tups -> + def ids = tups.collect{it[0]} + def states = tups.collect{it[1]} + apply_(ids, states) + } + + emit: output_ch + } + return joinStatesWf +} +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishFiles.nf' +def publishFiles(Map args) { + def key_ = args.get("key") + + assert key_ != null : "publishFiles: key must be specified" + + workflow publishFilesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + def inputFiles_ = inputoutputFilenames_[0] + def outputFilenames_ = inputoutputFilenames_[1] + + [id_, inputFiles_, outputFilenames_] + } + | publishFilesProc + emit: input_ch + } + return publishFilesWf +} + +process publishFilesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), path(inputFiles, stageAs: "_inputfile?/*"), val(outputFiles) + output: + tuple val(id), path{outputFiles} + script: + def copyCommands = [ + inputFiles instanceof List ? inputFiles : [inputFiles], + outputFiles instanceof List ? outputFiles : [outputFiles] + ] + .transpose() + .collectMany{infile, outfile -> + if (infile.toString() != outfile.toString()) { + [ + "[ -d \"\$(dirname '${outfile.toString()}')\" ] || mkdir -p \"\$(dirname '${outfile.toString()}')\"", + "cp -r '${infile.toString()}' '${outfile.toString()}'" + ] + } else { + // no need to copy if infile is the same as outfile + [] + } + } + """ + echo "Copying output files to destination folder" + ${copyCommands.join("\n ")} + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishFilesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishFilesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishFilesByConfig: key must be specified" + + workflow publishFilesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + + // the processed state is a list of [key, value, inputPath, outputFilename] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - inputPath is a List[Path] + // - outputFilename is a List[String] + // - (inputPath, outputFilename) are the files that will be copied from src to dest (relative to the state.yaml) + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output OR multiple channels were emitted + // and the output was just not added to using the channel + // that is now being parsed + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[inputPath: [], outputFilename: []]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def inputPath = val instanceof File ? val.toPath() : val + [inputPath: inputPath, outputFilename: filename_ix] + } + def transposedOutputs = ["inputPath", "outputFilename"].collectEntries{ key -> + [key, outputPerFile.collect{dic -> dic[key]}] + } + return [[key: plainName_] + transposedOutputs] + } else { + def value_ = java.nio.file.Paths.get(filename) + def inputPath = value instanceof File ? value.toPath() : value + return [[inputPath: [inputPath], outputFilename: [filename]]] + } + } + + def inputPaths = processedState.collectMany{it.inputPath} + def outputFilenames = processedState.collectMany{it.outputFilename} + + + [id_, inputPaths, outputFilenames] + } + | publishFilesProc + emit: input_ch + } + return publishFilesSimpleWf +} + + + + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/publishStates.nf' +def collectFiles(obj) { + if (obj instanceof java.io.File || obj instanceof Path) { + return [obj] + } else if (obj instanceof List && obj !instanceof String) { + return obj.collectMany{item -> + collectFiles(item) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectFiles(item) + } + } else { + return [] + } +} + +/** + * Recurse through a state and collect all input files and their target output filenames. + * @param obj The state to recurse through. + * @param prefix The prefix to prepend to the output filenames. + */ +def collectInputOutputPaths(obj, prefix) { + if (obj instanceof File || obj instanceof Path) { + def path = obj instanceof Path ? obj : obj.toPath() + def ext = path.getFileName().toString().find("\\.[^\\.]+\$") ?: "" + def newFilename = prefix + ext + return [[obj, newFilename]] + } else if (obj instanceof List && obj !instanceof String) { + return obj.withIndex().collectMany{item, ix -> + collectInputOutputPaths(item, prefix + "_" + ix) + } + } else if (obj instanceof Map) { + return obj.collectMany{key, item -> + collectInputOutputPaths(item, prefix + "." + key) + } + } else { + return [] + } +} + +def publishStates(Map args) { + def key_ = args.get("key") + def yamlTemplate_ = args.get("output_state", args.get("outputState", '$id.$key.state.yaml')) + + assert key_ != null : "publishStates: key must be specified" + + workflow publishStatesWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] + + // the input files and the target output filenames + def inputoutputFilenames_ = collectInputOutputPaths(state_, id_ + "." + key_).transpose() + + def yamlFilename = yamlTemplate_ + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + + // TODO: do the pathnames in state_ match up with the outputFilenames_? + + // convert state to yaml blob + def yamlBlob_ = toRelativeTaggedYamlBlob([id: id_] + state_, java.nio.file.Paths.get(yamlFilename)) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesWf +} +process publishStatesProc { + // todo: check publishpath? + publishDir path: "${getPublishDir()}/", mode: "copy" + tag "$id" + input: + tuple val(id), val(yamlBlob), val(yamlFile) + output: + tuple val(id), path{[yamlFile]} + script: + """ + mkdir -p "\$(dirname '${yamlFile}')" + echo "Storing state as yaml" + cat > '${yamlFile}' << HERE +${yamlBlob} +HERE + """ +} + + +// this assumes that the state contains no other values other than those specified in the config +def publishStatesByConfig(Map args) { + def config = args.get("config") + assert config != null : "publishStatesByConfig: config must be specified" + + def key_ = args.get("key", config.name) + assert key_ != null : "publishStatesByConfig: key must be specified" + + workflow publishStatesSimpleWf { + take: input_ch + main: + input_ch + | map { tup -> + def id_ = tup[0] + def state_ = tup[1] // e.g. [output: new File("myoutput.h5ad"), k: 10] + def origState_ = tup[2] // e.g. [output: '$id.$key.foo.h5ad'] + + // TODO: allow overriding the state.yaml template + // TODO TODO: if auto.publish == "state", add output_state as an argument + def yamlTemplate = params.containsKey("output_state") ? params.output_state : '$id.$key.state.yaml' + def yamlFilename = yamlTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + def yamlDir = java.nio.file.Paths.get(yamlFilename).getParent() + + // the processed state is a list of [key, value] tuples, where + // - key is a String + // - value is any object that can be serialized to a Yaml (so a String/Integer/Long/Double/Boolean, a List, a Map, or a Path) + // - (key, value) are the tuples that will be saved to the state.yaml file + def processedState = + config.allArguments + .findAll { it.direction == "output" } + .collectMany { par -> + def plainName_ = par.plainName + // if the state does not contain the key, it's an + // optional argument for which the component did + // not generate any output + if (!state_.containsKey(plainName_)) { + return [] + } + def value = state_[plainName_] + // if the parameter is not a file, it should be stored + // in the state as-is, but is not something that needs + // to be copied from the source path to the dest path + if (par.type != "file") { + return [[key: plainName_, value: value]] + } + // if the orig state does not contain this filename, + // it's an optional argument for which the user specified + // that it should not be returned as a state + if (!origState_.containsKey(plainName_)) { + return [] + } + def filenameTemplate = origState_[plainName_] + // if the pararameter is multiple: true, fetch the template + if (par.multiple && filenameTemplate instanceof List) { + filenameTemplate = filenameTemplate[0] + } + // instantiate the template + def filename = filenameTemplate + .replaceAll('\\$id', id_) + .replaceAll('\\$\\{id\\}', id_) + .replaceAll('\\$key', key_) + .replaceAll('\\$\\{key\\}', key_) + if (par.multiple) { + // if the parameter is multiple: true, the filename + // should contain a wildcard '*' that is replaced with + // the index of the file + assert filename.contains("*") : "Module '${key_}' id '${id_}': Multiple output files specified, but no wildcard '*' in the filename: ${filename}" + def outputPerFile = value.withIndex().collect{ val, ix -> + def filename_ix = filename.replace("*", ix.toString()) + def value_ = java.nio.file.Paths.get(filename_ix) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + return value_ + } + return [["key": plainName_, "value": outputPerFile]] + } else { + def value_ = java.nio.file.Paths.get(filename) + // if id contains a slash + if (yamlDir != null) { + value_ = yamlDir.relativize(value_) + } + def inputPath = value instanceof File ? value.toPath() : value + return [["key": plainName_, value: value_]] + } + } + + + def updatedState_ = processedState.collectEntries{[it.key, it.value]} + + // convert state to yaml blob + def yamlBlob_ = toTaggedYamlBlob([id: id_] + updatedState_) + + [id_, yamlBlob_, yamlFilename] + } + | publishStatesProc + emit: input_ch + } + return publishStatesSimpleWf +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/states/setState.nf' +def setState(fun) { + assert fun instanceof Closure || fun instanceof Map || fun instanceof List : + "Error in setState: Expected process argument to be a Closure, a Map, or a List. Found: class ${fun.getClass()}" + + // if fun is a List, convert to map + if (fun instanceof List) { + // check whether fun is a list[string] + assert fun.every{it instanceof CharSequence} : "Error in setState: argument is a List, but not all elements are Strings" + fun = fun.collectEntries{[it, it]} + } + + // if fun is a map, convert to closure + if (fun instanceof Map) { + // check whether fun is a map[string, string] + assert fun.values().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all values are Strings" + assert fun.keySet().every{it instanceof CharSequence} : "Error in setState: argument is a Map, but not all keys are Strings" + def funMap = fun.clone() + // turn the map into a closure to be used later on + fun = { id_, state_ -> + assert state_ instanceof Map : "Error in setState: the state is not a Map" + funMap.collectMany{newkey, origkey -> + if (state_.containsKey(origkey)) { + [[newkey, state_[origkey]]] + } else { + [] + } + }.collectEntries() + } + } + + map { tup -> + def id = tup[0] + def state = tup[1] + def unfilteredState = fun(id, state) + def newState = unfilteredState.findAll{key, val -> val != null} + [id, newState] + tup.drop(2) + } +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processAuto.nf' +// TODO: unit test processAuto +def processAuto(Map auto) { + // remove null values + auto = auto.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"] + def unexpectedKeys = auto.keySet() - expectedKeys + assert unexpectedKeys.isEmpty(), "unexpected keys in auto: '${unexpectedKeys.join("', '")}'" + + // check auto.simplifyInput + assert auto.simplifyInput instanceof Boolean, "auto.simplifyInput must be a boolean" + + // check auto.simplifyOutput + assert auto.simplifyOutput instanceof Boolean, "auto.simplifyOutput must be a boolean" + + // check auto.transcript + assert auto.transcript instanceof Boolean, "auto.transcript must be a boolean" + + // check auto.publish + assert auto.publish instanceof Boolean || auto.publish == "state", "auto.publish must be a boolean or 'state'" + + return auto.subMap(expectedKeys) +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processDirectives.nf' +def assertMapKeys(map, expectedKeys, requiredKeys, mapName) { + assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}" + map.forEach { key, val -> + assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map" + } + requiredKeys.forEach { requiredKey -> + assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map" + } +} + +// TODO: unit test processDirectives +def processDirectives(Map drctv) { + // remove null values + drctv = drctv.findAll{k, v -> v != null} + + // check for unexpected keys + def expectedKeys = [ + "accelerator", "afterScript", "beforeScript", "cache", "conda", "container", "containerOptions", "cpus", "disk", "echo", "errorStrategy", "executor", "machineType", "maxErrors", "maxForks", "maxRetries", "memory", "module", "penv", "pod", "publishDir", "queue", "label", "scratch", "storeDir", "stageInMode", "stageOutMode", "tag", "time" + ] + def unexpectedKeys = drctv.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Unexpected keys in process directive: '${unexpectedKeys.join("', '")}'" + + /* DIRECTIVE accelerator + accepted examples: + - [ limit: 4, type: "nvidia-tesla-k80" ] + */ + if (drctv.containsKey("accelerator")) { + assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator") + } + + /* DIRECTIVE afterScript + accepted examples: + - "source /cluster/bin/cleanup" + */ + if (drctv.containsKey("afterScript")) { + assert drctv["afterScript"] instanceof CharSequence + } + + /* DIRECTIVE beforeScript + accepted examples: + - "source /cluster/bin/setup" + */ + if (drctv.containsKey("beforeScript")) { + assert drctv["beforeScript"] instanceof CharSequence + } + + /* DIRECTIVE cache + accepted examples: + - true + - false + - "deep" + - "lenient" + */ + if (drctv.containsKey("cache")) { + assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean + if (drctv["cache"] instanceof CharSequence) { + assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache" + } + } + + /* DIRECTIVE conda + accepted examples: + - "bwa=0.7.15" + - "bwa=0.7.15 fastqc=0.11.5" + - ["bwa=0.7.15", "fastqc=0.11.5"] + */ + if (drctv.containsKey("conda")) { + if (drctv["conda"] instanceof List) { + drctv["conda"] = drctv["conda"].join(" ") + } + assert drctv["conda"] instanceof CharSequence + } + + /* DIRECTIVE container + accepted examples: + - "foo/bar:tag" + - [ registry: "reg", image: "im", tag: "ta" ] + is transformed to "reg/im:ta" + - [ image: "im" ] + is transformed to "im:latest" + */ + if (drctv.containsKey("container")) { + assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence + if (drctv["container"] instanceof Map) { + def m = drctv["container"] + assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container") + def part1 = + System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : + params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove? + m.registry ? m.registry + "/" : + "" + def part2 = m.image + def part3 = m.tag ? ":" + m.tag : ":latest" + drctv["container"] = part1 + part2 + part3 + } + } + + /* DIRECTIVE containerOptions + accepted examples: + - "--foo bar" + - ["--foo bar", "-f b"] + */ + if (drctv.containsKey("containerOptions")) { + if (drctv["containerOptions"] instanceof List) { + drctv["containerOptions"] = drctv["containerOptions"].join(" ") + } + assert drctv["containerOptions"] instanceof CharSequence + } + + /* DIRECTIVE cpus + accepted examples: + - 1 + - 10 + */ + if (drctv.containsKey("cpus")) { + assert drctv["cpus"] instanceof Integer + } + + /* DIRECTIVE disk + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("disk")) { + assert drctv["disk"] instanceof CharSequence + // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE echo + accepted examples: + - true + - false + */ + if (drctv.containsKey("echo")) { + assert drctv["echo"] instanceof Boolean + } + + /* DIRECTIVE errorStrategy + accepted examples: + - "terminate" + - "finish" + */ + if (drctv.containsKey("errorStrategy")) { + assert drctv["errorStrategy"] instanceof CharSequence + assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy" + } + + /* DIRECTIVE executor + accepted examples: + - "local" + - "sge" + */ + if (drctv.containsKey("executor")) { + assert drctv["executor"] instanceof CharSequence + assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor" + } + + /* DIRECTIVE machineType + accepted examples: + - "n1-highmem-8" + */ + if (drctv.containsKey("machineType")) { + assert drctv["machineType"] instanceof CharSequence + } + + /* DIRECTIVE maxErrors + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxErrors")) { + assert drctv["maxErrors"] instanceof Integer + } + + /* DIRECTIVE maxForks + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxForks")) { + assert drctv["maxForks"] instanceof Integer + } + + /* DIRECTIVE maxRetries + accepted examples: + - 1 + - 3 + */ + if (drctv.containsKey("maxRetries")) { + assert drctv["maxRetries"] instanceof Integer + } + + /* DIRECTIVE memory + accepted examples: + - "1 GB" + - "2TB" + - "3.2KB" + - "10.B" + */ + if (drctv.containsKey("memory")) { + assert drctv["memory"] instanceof CharSequence + // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B") + // ^ does not allow closures + } + + /* DIRECTIVE module + accepted examples: + - "ncbi-blast/2.2.27" + - "ncbi-blast/2.2.27:t_coffee/10.0" + - ["ncbi-blast/2.2.27", "t_coffee/10.0"] + */ + if (drctv.containsKey("module")) { + if (drctv["module"] instanceof List) { + drctv["module"] = drctv["module"].join(":") + } + assert drctv["module"] instanceof CharSequence + } + + /* DIRECTIVE penv + accepted examples: + - "smp" + */ + if (drctv.containsKey("penv")) { + assert drctv["penv"] instanceof CharSequence + } + + /* DIRECTIVE pod + accepted examples: + - [ label: "key", value: "val" ] + - [ annotation: "key", value: "val" ] + - [ env: "key", value: "val" ] + - [ [label: "l", value: "v"], [env: "e", value: "v"]] + */ + if (drctv.containsKey("pod")) { + if (drctv["pod"] instanceof Map) { + drctv["pod"] = [ drctv["pod"] ] + } + assert drctv["pod"] instanceof List + drctv["pod"].forEach { pod -> + assert pod instanceof Map + // TODO: should more checks be added? + // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod + // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...? + } + } + + /* DIRECTIVE publishDir + accepted examples: + - [] + - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ] + - "/path/to/dir" + is transformed to [[ path: "/path/to/dir" ]] + - [ path: "/path/to/dir", mode: "cache" ] + is transformed to [[ path: "/path/to/dir", mode: "cache" ]] + */ + // TODO: should we also look at params["publishDir"]? + if (drctv.containsKey("publishDir")) { + def pblsh = drctv["publishDir"] + + // check different options + assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence + + // turn into list if not already so + // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work. + pblsh = pblsh instanceof List ? pblsh : [ pblsh ] + + // check elements of publishDir + pblsh = pblsh.collect{ elem -> + // turn into map if not already so + elem = elem instanceof CharSequence ? [ path: elem ] : elem + + // check types and keys + assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}" + assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir") + + // check elements in map + assert elem.containsKey("path") + assert elem["path"] instanceof CharSequence + if (elem.containsKey("mode")) { + assert elem["mode"] instanceof CharSequence + assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ] + } + if (elem.containsKey("overwrite")) { + assert elem["overwrite"] instanceof Boolean + } + if (elem.containsKey("pattern")) { + assert elem["pattern"] instanceof CharSequence + } + if (elem.containsKey("saveAs")) { + assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'" + } + if (elem.containsKey("enabled")) { + assert elem["enabled"] instanceof Boolean + } + + // return final result + elem + } + // store final directive + drctv["publishDir"] = pblsh + } + + /* DIRECTIVE queue + accepted examples: + - "long" + - "short,long" + - ["short", "long"] + */ + if (drctv.containsKey("queue")) { + if (drctv["queue"] instanceof List) { + drctv["queue"] = drctv["queue"].join(",") + } + assert drctv["queue"] instanceof CharSequence + } + + /* DIRECTIVE label + accepted examples: + - "big_mem" + - "big_cpu" + - ["big_mem", "big_cpu"] + */ + if (drctv.containsKey("label")) { + if (drctv["label"] instanceof CharSequence) { + drctv["label"] = [ drctv["label"] ] + } + assert drctv["label"] instanceof List + drctv["label"].forEach { label -> + assert label instanceof CharSequence + // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?") + // ^ does not allow closures + } + } + + /* DIRECTIVE scratch + accepted examples: + - true + - "/path/to/scratch" + - '$MY_PATH_TO_SCRATCH' + - "ram-disk" + */ + if (drctv.containsKey("scratch")) { + assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence + } + + /* DIRECTIVE storeDir + accepted examples: + - "/path/to/storeDir" + */ + if (drctv.containsKey("storeDir")) { + assert drctv["storeDir"] instanceof CharSequence + } + + /* DIRECTIVE stageInMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageInMode")) { + assert drctv["stageInMode"] instanceof CharSequence + assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"] + } + + /* DIRECTIVE stageOutMode + accepted examples: + - "copy" + - "link" + */ + if (drctv.containsKey("stageOutMode")) { + assert drctv["stageOutMode"] instanceof CharSequence + assert drctv["stageOutMode"] in ["copy", "move", "rsync"] + } + + /* DIRECTIVE tag + accepted examples: + - "foo" + - '$id' + */ + if (drctv.containsKey("tag")) { + assert drctv["tag"] instanceof CharSequence + } + + /* DIRECTIVE time + accepted examples: + - "1h" + - "2days" + - "1day 6hours 3minutes 30seconds" + */ + if (drctv.containsKey("time")) { + assert drctv["time"] instanceof CharSequence + // todo: validation regex? + } + + return drctv +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/processWorkflowArgs.nf' +def processWorkflowArgs(Map args, Map defaultWfArgs, Map meta) { + // override defaults with args + def workflowArgs = defaultWfArgs + args + + // check whether 'key' exists + assert workflowArgs.containsKey("key") : "Error in module '${meta.config.name}': key is a required argument" + + // if 'key' is a closure, apply it to the original key + if (workflowArgs["key"] instanceof Closure) { + workflowArgs["key"] = workflowArgs["key"](meta.config.name) + } + def key = workflowArgs["key"] + assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}" + assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}" + + // check for any unexpected keys + def expectedKeys = ["key", "directives", "auto", "map", "mapId", "mapData", "mapPassthrough", "filter", "runIf", "fromState", "toState", "args", "renameKeys", "debug"] + def unexpectedKeys = workflowArgs.keySet() - expectedKeys + assert unexpectedKeys.isEmpty() : "Error in module '$key': unexpected arguments to the '.run()' function: '${unexpectedKeys.join("', '")}'" + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("directives") : "Error in module '$key': directives is a required argument" + assert workflowArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${workflowArgs['directives'].getClass()}" + workflowArgs["directives"] = processDirectives(defaultWfArgs.directives + workflowArgs["directives"]) + + // check whether directives exists and apply defaults + assert workflowArgs.containsKey("auto") : "Error in module '$key': auto is a required argument" + assert workflowArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${workflowArgs['auto'].getClass()}" + workflowArgs["auto"] = processAuto(defaultWfArgs.auto + workflowArgs["auto"]) + + // auto define publish, if so desired + if (workflowArgs.auto.publish == true && (workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : [:]).isEmpty()) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" + + // " Example: params.publish_dir = \"./output/\"" + def publishDir = getPublishDir() + + if (publishDir != null) { + workflowArgs.directives.publishDir = [[ + path: publishDir, + saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default + mode: "copy" + ]] + } + } + + // auto define transcript, if so desired + if (workflowArgs.auto.transcript == true) { + // can't assert at this level thanks to the no_publish profile + // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : + // "Error in module '${workflowArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" + + // " Example: params.transcripts_dir = \"./transcripts/\"" + def transcriptsDir = + params.containsKey("transcripts_dir") ? params.transcripts_dir : + params.containsKey("transcriptsDir") ? params.transcriptsDir : + params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" : + params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : + null + if (transcriptsDir != null) { + def timestamp = nextflow.Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss') + def transcriptsPublishDir = [ + path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/", + saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", + mode: "copy" + ] + def publishDirs = workflowArgs.directives.publishDir != null ? workflowArgs.directives.publishDir : null ? workflowArgs.directives.publishDir : [] + workflowArgs.directives.publishDir = publishDirs + transcriptsPublishDir + } + } + + // if this is a stubrun, remove certain directives? + if (workflow.stubRun) { + workflowArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"]) + } + + for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter", "runIf"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam]) { + assert workflowArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${workflowArgs[nam].getClass()}" + } + } + + // TODO: should functions like 'map', 'mapId', 'mapData', 'mapPassthrough' be deprecated as well? + for (nam in ["map", "mapData", "mapPassthrough", "renameKeys"]) { + if (workflowArgs.containsKey(nam) && workflowArgs[nam] != null) { + log.warn "module '$key': workflow argument '$nam' is deprecated and will be removed in Viash 0.9.0. Please use 'fromState' and 'toState' instead." + } + } + + // check fromState + workflowArgs["fromState"] = _processFromState(workflowArgs.get("fromState"), key, meta.config) + + // check toState + workflowArgs["toState"] = _processToState(workflowArgs.get("toState"), key, meta.config) + + // return output + return workflowArgs +} + +def _processFromState(fromState, key_, config_) { + assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List : + "Error in module '$key_': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}" + if (fromState == null) { + return null + } + + // if fromState is a List, convert to map + if (fromState instanceof List) { + // check whether fromstate is a list[string] + assert fromState.every{it instanceof CharSequence} : "Error in module '$key_': fromState is a List, but not all elements are Strings" + fromState = fromState.collectEntries{[it, it]} + } + + // if fromState is a map, convert to closure + if (fromState instanceof Map) { + // check whether fromstate is a map[string, string] + assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all values are Strings" + assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': fromState is a Map, but not all keys are Strings" + def fromStateMap = fromState.clone() + def requiredInputNames = meta.config.allArguments.findAll{it.required && it.direction == "Input"}.collect{it.plainName} + // turn the map into a closure to be used later on + fromState = { it -> + def state = it[1] + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def data = fromStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (state.containsKey(origkey)) { + [[newkey, state[origkey]]] + } else if (!requiredInputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': fromState key '$origkey' not found in current state") + } + }.collectEntries() + data + } + } + + return fromState +} + +def _processToState(toState, key_, config_) { + if (toState == null) { + toState = { tup -> tup[1] } + } + + // toState should be a closure, map[string, string], or list[string] + assert toState instanceof Closure || toState instanceof Map || toState instanceof List : + "Error in module '$key_': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}" + + // if toState is a List, convert to map + if (toState instanceof List) { + // check whether toState is a list[string] + assert toState.every{it instanceof CharSequence} : "Error in module '$key_': toState is a List, but not all elements are Strings" + toState = toState.collectEntries{[it, it]} + } + + // if toState is a map, convert to closure + if (toState instanceof Map) { + // check whether toState is a map[string, string] + assert toState.values().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all values are Strings" + assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key_': toState is a Map, but not all keys are Strings" + def toStateMap = toState.clone() + def requiredOutputNames = config_.allArguments.findAll{it.required && it.direction == "Output"}.collect{it.plainName} + // turn the map into a closure to be used later on + toState = { it -> + def output = it[1] + def state = it[2] + assert output instanceof Map : "Error in module '$key_': the output is not a Map" + assert state instanceof Map : "Error in module '$key_': the state is not a Map" + def extraEntries = toStateMap.collectMany{newkey, origkey -> + // check whether newkey corresponds to a required argument + if (output.containsKey(origkey)) { + [[newkey, output[origkey]]] + } else if (!requiredOutputNames.contains(origkey)) { + [] + } else { + throw new Exception("Error in module '$key_': toState key '$origkey' not found in current output") + } + }.collectEntries() + state + extraEntries + } + } + + return toState +} + +// helper file: 'src/main/resources/io/viash/runners/nextflow/workflowFactory/workflowFactory.nf' +def _debug(workflowArgs, debugKey) { + if (workflowArgs.debug) { + view { "process '${workflowArgs.key}' $debugKey tuple: $it" } + } else { + map { it } + } +} + +// depends on: innerWorkflowFactory +def workflowFactory(Map args, Map defaultWfArgs, Map meta) { + def workflowArgs = processWorkflowArgs(args, defaultWfArgs, meta) + def key_ = workflowArgs["key"] + def multipleArgs = meta.config.allArguments.findAll{ it.multiple }.collect{it.plainName} + + workflow workflowInstance { + take: input_ + + main: + def chModified = input_ + | checkUniqueIds([:]) + | _debug(workflowArgs, "input") + | map { tuple -> + tuple = deepClone(tuple) + + if (workflowArgs.map) { + tuple = workflowArgs.map(tuple) + } + if (workflowArgs.mapId) { + tuple[0] = workflowArgs.mapId(tuple[0]) + } + if (workflowArgs.mapData) { + tuple[1] = workflowArgs.mapData(tuple[1]) + } + if (workflowArgs.mapPassthrough) { + tuple = tuple.take(2) + workflowArgs.mapPassthrough(tuple.drop(2)) + } + + // check tuple + assert tuple instanceof List : + "Error in module '${key_}': element in channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + assert tuple.size() >= 2 : + "Error in module '${key_}': expected length of tuple in input channel to be two or greater.\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: tuple.size() == ${tuple.size()}" + + // check id field + if (tuple[0] instanceof GString) { + tuple[0] = tuple[0].toString() + } + assert tuple[0] instanceof CharSequence : + "Error in module '${key_}': first element of tuple in channel should be a String\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Found: ${tuple[0]}" + + // match file to input file + if (workflowArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) { + def inputFiles = meta.config.allArguments + .findAll { it.type == "file" && it.direction == "input" } + + assert inputFiles.size() == 1 : + "Error in module '${key_}' id '${tuple[0]}'.\n" + + " Anonymous file inputs are only allowed when the process has exactly one file input.\n" + + " Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}" + + tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries() + } + + // check data field + assert tuple[1] instanceof Map : + "Error in module '${key_}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // rename keys of data field in tuple + if (workflowArgs.renameKeys) { + assert workflowArgs.renameKeys instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class: Map. Found: renameKeys.getClass() is ${workflowArgs.renameKeys.getClass()}" + assert tuple[1] instanceof Map : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}" + + // TODO: allow renameKeys to be a function? + workflowArgs.renameKeys.each { newKey, oldKey -> + assert newKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}" + assert oldKey instanceof CharSequence : + "Error renaming data keys in module '${key_}' id '${tuple[0]}'.\n" + + " Example: renameKeys: ['new_key': 'old_key'].\n" + + " Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}" + assert tuple[1].containsKey(oldKey) : + "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" + + " Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'" + tuple[1].put(newKey, tuple[1][oldKey]) + } + tuple[1].keySet().removeAll(workflowArgs.renameKeys.collect{ newKey, oldKey -> oldKey }) + } + tuple + } + + + def chRun = null + def chPassthrough = null + if (workflowArgs.runIf) { + def runIfBranch = chModified.branch{ tup -> + run: workflowArgs.runIf(tup[0], tup[1]) + passthrough: true + } + chRun = runIfBranch.run + chPassthrough = runIfBranch.passthrough + } else { + chRun = chModified + chPassthrough = Channel.empty() + } + + def chRunFiltered = workflowArgs.filter ? + chRun | filter{workflowArgs.filter(it)} : + chRun + + def chArgs = workflowArgs.fromState ? + chRunFiltered | map{ + def new_data = workflowArgs.fromState(it.take(2)) + [it[0], new_data] + } : + chRunFiltered | map {tup -> tup.take(2)} + + // fill in defaults + def chArgsWithDefaults = chArgs + | map { tuple -> + def id_ = tuple[0] + def data_ = tuple[1] + + // TODO: could move fromState to here + + // fetch default params from functionality + def defaultArgs = meta.config.allArguments + .findAll { it.containsKey("default") } + .collectEntries { [ it.plainName, it.default ] } + + // fetch overrides in params + def paramArgs = meta.config.allArguments + .findAll { par -> + def argKey = key_ + "__" + par.plainName + params.containsKey(argKey) + } + .collectEntries { [ it.plainName, params[key_ + "__" + it.plainName] ] } + + // fetch overrides in data + def dataArgs = meta.config.allArguments + .findAll { data_.containsKey(it.plainName) } + .collectEntries { [ it.plainName, data_[it.plainName] ] } + + // combine params + def combinedArgs = defaultArgs + paramArgs + workflowArgs.args + dataArgs + + // remove arguments with explicit null values + combinedArgs + .removeAll{_, val -> val == null || val == "viash_no_value" || val == "force_null"} + + combinedArgs = _processInputValues(combinedArgs, meta.config, id_, key_) + + [id_, combinedArgs] + tuple.drop(2) + } + + // TODO: move some of the _meta.join_id wrangling to the safeJoin() function. + def chInitialOutputMulti = chArgsWithDefaults + | _debug(workflowArgs, "processed") + // run workflow + | innerWorkflowFactory(workflowArgs) + def chInitialOutputList = chInitialOutputMulti instanceof List ? chInitialOutputMulti : [chInitialOutputMulti] + assert chInitialOutputList.size() > 0: "should have emitted at least one output channel" + // Add a channel ID to the events, which designates the channel the event was emitted from as a running number + // This number is used to sort the events later when the events are gathered from across the channels. + def chInitialOutputListWithIndexedEvents = chInitialOutputList.withIndex().collect{channel, channelIndex -> + def newChannel = channel + | map {tuple -> + assert tuple instanceof List : + "Error in module '${key_}': element in output channel should be a tuple [id, data, ...otherargs...]\n" + + " Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" + + " Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}" + + def newEvent = [channelIndex] + tuple + return newEvent + } + return newChannel + } + // Put the events into 1 channel, cover case where there is only one channel is emitted + def chInitialOutput = chInitialOutputList.size() > 1 ? \ + chInitialOutputListWithIndexedEvents[0].mix(*chInitialOutputListWithIndexedEvents.tail()) : \ + chInitialOutputListWithIndexedEvents[0] + def chInitialOutputProcessed = chInitialOutput + | map { tuple -> + def channelId = tuple[0] + def id_ = tuple[1] + def output_ = tuple[2] + + // see if output map contains metadata + def meta_ = + output_ instanceof Map && output_.containsKey("_meta") ? + output_["_meta"] : + [:] + def join_id = meta_.join_id ?: id_ + + // remove metadata + output_ = output_.findAll{k, v -> k != "_meta"} + + // check value types + output_ = _checkValidOutputArgument(output_, meta.config, id_, key_) + + [join_id, channelId, id_, output_] + } + // | view{"chInitialOutput: ${it.take(3)}"} + + // join the output [prev_id, channel_id, new_id, output] with the previous state [prev_id, state, ...] + def chPublishWithPreviousState = safeJoin(chInitialOutputProcessed, chRunFiltered, key_) + // input tuple format: [join_id, channel_id, id, output, prev_state, ...] + // output tuple format: [join_id, channel_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(2).take(3)) + tup.take(3) + [new_state] + tup.drop(5) + } + if (workflowArgs.auto.publish == "state") { + def chPublishFiles = chPublishWithPreviousState + // input tuple format: [join_id, channel_id, id, new_state, ...] + // output tuple format: [join_id, channel_id, id, new_state] + | map{ tup -> + tup.take(4) + } + + safeJoin(chPublishFiles, chArgsWithDefaults, key_) + // input tuple format: [join_id, channel_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(2).take(3) + } + | publishFilesByConfig(key: key_, config: meta.config) + } + // Join the state from the events that were emitted from different channels + def chJoined = chInitialOutputProcessed + | map {tuple -> + def join_id = tuple[0] + def channel_id = tuple[1] + def id = tuple[2] + def other = tuple.drop(3) + // Below, groupTuple is used to join the events. To make sure resuming a workflow + // keeps working, the output state must be deterministic. This means the state needs to be + // sorted with groupTuple's has a 'sort' argument. This argument can be set to 'hash', + // but hashing the state when it is large can be problematic in terms of performance. + // Therefore, a custom comparator function is provided. We add the channel ID to the + // states so that we can use the channel ID to sort the items. + def stateWithChannelID = [[channel_id] * other.size(), other].transpose() + // A comparator that is provided to groupTuple's 'sort' argument is applied + // to all elements of the event tuple (that is not the 'id'). The comparator + // closure that is used below expects the input to be List. So the join_id and + // channel_id must also be wrapped in a list. + [[join_id], [channel_id], id] + stateWithChannelID + } + | groupTuple(by: 2, sort: {a, b -> a[0] <=> b[0]}, size: chInitialOutputList.size(), remainder: true) + | map {join_ids, _, id, statesWithChannelID -> + // Remove the channel IDs from the states + def states = statesWithChannelID.collect{it[1]} + def newJoinId = join_ids.flatten().unique{a, b -> a <=> b} + assert newJoinId.size() == 1: "Multiple events were emitted for '$id'." + def newJoinIdUnique = newJoinId[0] + + // Merge the states from the different channels + def newState = states.inject([:]){ old_state, state_to_add -> + return old_state + state_to_add.collectEntries{k, v -> + if (!multipleArgs.contains(k)) { + // if the key is not a multiple argument, we expect only one value + if (old_state.containsKey(k)) { + assert old_state[k] == v : "ID $id: multiple entries for argument $k were emitted." + } + [k, v] + } else { + // if the key is a multiple argument, append the different values into one list + def prevValue = old_state.getOrDefault(k, []) + def prevValueAsList = prevValue instanceof List ? prevValue : [prevValue] + [k, prevValueAsList + v] + } + } + } + + _checkAllRequiredOuputsPresent(newState, meta.config, id, key_) + + // simplify output if need be + if (workflowArgs.auto.simplifyOutput && newState.size() == 1) { + newState = newState.values()[0] + } + + return [newJoinIdUnique, id, newState] + } + + // join the output [prev_id, new_id, output] with the previous state [prev_id, state, ...] + def chNewState = safeJoin(chJoined, chRunFiltered, key_) + // input tuple format: [join_id, id, output, prev_state, ...] + // output tuple format: [join_id, id, new_state, ...] + | map{ tup -> + def new_state = workflowArgs.toState(tup.drop(1).take(3)) + tup.take(2) + [new_state] + tup.drop(4) + } + + if (workflowArgs.auto.publish == "state") { + def chPublishStates = chNewState + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [join_id, id, new_state] + | map{ tup -> + tup.take(3) + } + + safeJoin(chPublishStates, chArgsWithDefaults, key_) + // input tuple format: [join_id, id, new_state, orig_state, ...] + // output tuple format: [id, new_state, orig_state] + | map { tup -> + tup.drop(1).take(3) + } + | publishStatesByConfig(key: key_, config: meta.config) + } + chReturn = chNewState + | map { tup -> + // input tuple format: [join_id, id, new_state, ...] + // output tuple format: [id, new_state, ...] + tup.drop(1) + } + | _debug(workflowArgs, "output") + | concat(chPassthrough) + + emit: chReturn + } + + def wf = workflowInstance.cloneWithName(key_) + + // add factory function + wf.metaClass.run = { runArgs -> + workflowFactory(runArgs, workflowArgs, meta) + } + // add config to module for later introspection + wf.metaClass.config = meta.config + + return wf +} + +nextflow.enable.dsl=2 + +// START COMPONENT-SPECIFIC CODE + +// create meta object +meta = [ + "resources_dir": moduleDir.toRealPath().normalize(), + "config": processConfig(readJsonBlob('''{ + "name" : "well_metadata", + "namespace" : "workflows", + "version" : "updatecraftbox", + "authors" : [ + { + "name" : "Dries Schaumont", + "roles" : [ + "maintainer" + ], + "info" : { + "links" : { + "email" : "dries@data-intuitive.com", + "github" : "DriesSchaumont", + "orcid" : "0000-0002-4389-0440", + "linkedin" : "dries-schaumont" + }, + "organizations" : [ + { + "name" : "Data Intuitive", + "href" : "https://www.data-intuitive.com", + "role" : "Data Scientist" + } + ] + } + } + ], + "argument_groups" : [ + { + "name" : "Input arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--input_r1", + "description" : "List of FASTQ files for the forward reads, 1 per well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--input_r2", + "description" : "List of FASTQ files for the forward reads, 1 per well.\n", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--star_mapping", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : true, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--barcodesFasta", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "input", + "multiple" : false, + "multiple_sep" : ";" + } + ] + }, + { + "name" : "Output arguments", + "arguments" : [ + { + "type" : "file", + "name" : "--output_r1", + "description" : "Output fastq file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--output_r2", + "description" : "Output fastq file.", + "must_exist" : true, + "create_parent" : true, + "required" : true, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--pool", + "description" : "The original pool / sample name", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--well_id", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "string", + "name" : "--barcode", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "integer", + "name" : "--n_wells", + "description" : "The number of wells in the pool is well is a part of.", + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + }, + { + "type" : "file", + "name" : "--well_star_mapping", + "must_exist" : true, + "create_parent" : true, + "required" : false, + "direction" : "output", + "multiple" : false, + "multiple_sep" : ";" + } + ] + } + ], + "resources" : [ + { + "type" : "nextflow_script", + "path" : "main.nf", + "is_executable" : true, + "entrypoint" : "run_wf" + }, + { + "type" : "file", + "path" : "/src/config/labels.config", + "dest" : "nextflow_labels.config" + }, + { + "type" : "file", + "path" : "/_viash.yaml", + "dest" : "_viash.yaml" + } + ], + "status" : "enabled", + "scope" : { + "image" : "public", + "target" : "public" + }, + "requirements" : { + "commands" : [ + "ps" + ] + }, + "license" : "MIT", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq" + }, + "runners" : [ + { + "type" : "nextflow", + "id" : "nextflow", + "directives" : { + "tag" : "$id" + }, + "auto" : { + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false + }, + "config" : { + "labels" : { + "mem1gb" : "memory = 1000000000.B", + "mem2gb" : "memory = 2000000000.B", + "mem5gb" : "memory = 5000000000.B", + "mem10gb" : "memory = 10000000000.B", + "mem20gb" : "memory = 20000000000.B", + "mem50gb" : "memory = 50000000000.B", + "mem100gb" : "memory = 100000000000.B", + "mem200gb" : "memory = 200000000000.B", + "mem500gb" : "memory = 500000000000.B", + "mem1tb" : "memory = 1000000000000.B", + "mem2tb" : "memory = 2000000000000.B", + "mem5tb" : "memory = 5000000000000.B", + "mem10tb" : "memory = 10000000000000.B", + "mem20tb" : "memory = 20000000000000.B", + "mem50tb" : "memory = 50000000000000.B", + "mem100tb" : "memory = 100000000000000.B", + "mem200tb" : "memory = 200000000000000.B", + "mem500tb" : "memory = 500000000000000.B", + "mem1gib" : "memory = 1073741824.B", + "mem2gib" : "memory = 2147483648.B", + "mem4gib" : "memory = 4294967296.B", + "mem8gib" : "memory = 8589934592.B", + "mem16gib" : "memory = 17179869184.B", + "mem32gib" : "memory = 34359738368.B", + "mem64gib" : "memory = 68719476736.B", + "mem128gib" : "memory = 137438953472.B", + "mem256gib" : "memory = 274877906944.B", + "mem512gib" : "memory = 549755813888.B", + "mem1tib" : "memory = 1099511627776.B", + "mem2tib" : "memory = 2199023255552.B", + "mem4tib" : "memory = 4398046511104.B", + "mem8tib" : "memory = 8796093022208.B", + "mem16tib" : "memory = 17592186044416.B", + "mem32tib" : "memory = 35184372088832.B", + "mem64tib" : "memory = 70368744177664.B", + "mem128tib" : "memory = 140737488355328.B", + "mem256tib" : "memory = 281474976710656.B", + "mem512tib" : "memory = 562949953421312.B", + "cpu1" : "cpus = 1", + "cpu2" : "cpus = 2", + "cpu5" : "cpus = 5", + "cpu10" : "cpus = 10", + "cpu20" : "cpus = 20", + "cpu50" : "cpus = 50", + "cpu100" : "cpus = 100", + "cpu200" : "cpus = 200", + "cpu500" : "cpus = 500", + "cpu1000" : "cpus = 1000" + }, + "script" : [ + "includeConfig(\\"nextflow_labels.config\\")" + ] + }, + "debug" : false, + "container" : "docker" + } + ], + "engines" : [ + { + "type" : "native", + "id" : "native" + }, + { + "type" : "native", + "id" : "native" + } + ], + "build_info" : { + "config" : "/workdir/root/repo/src/workflows/well_metadata/config.vsh.yaml", + "runner" : "nextflow", + "engine" : "native|native", + "output" : "target/nextflow/workflows/well_metadata", + "viash_version" : "0.9.4", + "git_commit" : "e6da525fc57aaec74f348eb974b68faa647bf800", + "git_remote" : "https://github.com/viash-hub/htrnaseq" + }, + "package_config" : { + "name" : "htrnaseq", + "version" : "updatecraftbox", + "summary" : "A workflow for high-throughput RNA-seq data analyses.\n", + "description" : "This workflow is designed to process high-throughput RNA-seq data, where every\nwell of a microarray plate is a sample. A fasta file provided as input\ndefines the mapping between sample barcodes and wells.\n\nThe workflow is built in a modular fashion, where most of the base functionality\nis provided by components from [`biobox`](https://www.viash-hub.com/packages/biobox/latest)\nsupplemented by custom base components and workflow components in this package.\n\nThe full workflow is split in two major subworkflows that can be run independently:\n\n* **Well-demultiplexing:** Split the input (plate/pool level) fastq files per well.\n* **Mapping, counting and QC:** Run per-well mapping, counting and generate QC reports.\n\nEach of those can be started individually, or the full workflow can be run in two ways:\n\n1. Run the [main workflow](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/htrnaseq) \ncontaining the main functionality.\n2. Run the [(opinionated) `runner`](https://www.viash-hub.com/packages/htrnaseq/v0.3.0/components/workflows/runner) where a\nnumber of choices (input/output structure and location) have been made.\n\nInput for the workflow has to be `fastq` files (zipped or not). For bcl or other formats, please consider running\n[demultiplex](https://www.viash-hub.com/packages/demultiplex) first.\n", + "info" : { + "test_resources" : [ + { + "path" : "gs://viash-hub-resources/htrnaseq/v2", + "dest" : "resources_test" + } + ] + }, + "viash_version" : "0.9.4", + "source" : "src", + "target" : "target", + "config_mods" : [ + ".requirements.commands := ['ps']\n.runners[.type == 'nextflow'].config.script += 'includeConfig(\\"nextflow_labels.config\\")'\n.resources += {path: '/src/config/labels.config', dest: 'nextflow_labels.config'}\n.resources += {path: '/_viash.yaml', dest: '_viash.yaml'}\n", + ".engines += { type: \\"native\\" }", + ".engines[.type == 'docker'].target_registry := 'images.viash-hub.com'", + ".engines[.type == 'docker'].target_tag := 'updatecraftbox'" + ], + "keywords" : [ + "bioinformatics", + "sequencing", + "high-throughput", + "RNAseq", + "mapping", + "counting", + "pipeline", + "workflow" + ], + "license" : "MIT", + "organization" : "vsh", + "links" : { + "repository" : "https://github.com/viash-hub/htrnaseq", + "issue_tracker" : "https://github.com/viash-hub/htrnaseq/issues" + } + } +}''')) +] + +// resolve dependencies dependencies (if any) + + +// inner workflow +// user-provided Nextflow code +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + /* + Parse the fasta file containing the barcodes and do the following: + - The sequence headers must not contain any whitespaces + - The headers (Well IDs) must be unique + - The barcodes must be unique + - Store the number of barcodes in the state + - Add a barcode to well ID (header) mapping to the state, + in order to be able to retreive the well ID based on the FASTQ name after well demultiplexing + */ + | map {id, state -> + def n_wells = state.barcodesFasta.countFasta() as int + // The header is the full header, the id is the part header up to the first whitespace character + // We do not allow whitespace in the header of the fasta file, so assert this. + def fasta_entries = state.barcodesFasta.splitFasta( + record: ["id": true, "header": true, "seqString": true] + ) + assert fasta_entries.every{it.id == it.header}, \ + "The barcodes FASTA headers must not contain any whitespace!" + // Check if the fasta headers are unique + def fasta_ids = fasta_entries.collect{it.id} + assert fasta_ids.clone().unique() == fasta_ids, \ + "The barcodes FASTA entries must have a unique name!" + // Check if the sequences are unique + def fasta_sequences = fasta_entries.collect{it.seqString} + assert fasta_sequences.clone().unique() == fasta_sequences, \ + "The barcodes FASTA sequences must be unique!" + def well_id_matcher = /^([A-Za-z]+)0*([1-9]?[0-9]+)$/ + def entries_corrected_id = fasta_entries.collectEntries { it -> + def unformatted_id = it.header + def id_matched_to_format = unformatted_id =~ well_id_matcher + assert (id_matched_to_format && id_matched_to_format.getCount() == 1), \ + "The FASTA headers must match the coordinate system of a well plate (e.g. A01, B01, ... or AA1, AB1, ...). Found: ${unformatted_id}" + def id_letters = id_matched_to_format[0][1].toUpperCase() + def id_numbers = id_matched_to_format[0][2] + [it.seqString.replaceAll("[^ACGTacgt]", ""), "${id_letters}${id_numbers}".toString()] + } + def newState = state + [ + "n_wells": n_wells, + "barcode_well_id_mapping": entries_corrected_id, + ] + [id, newState] + } + | flatMap{ id, state -> + def new_events = state.star_mapping.collect{ star_output_dir -> + def pool = id + // Get the barcode from the STAR file. + // One STAR output contains the results for one + // well barcode. We can look for the barcode in + // the 'Solo.out/Gene/raw/barcode.tsv' file. + def barcodes_files = files("${star_output_dir}/Solo.out/Gene/raw/barcodes.tsv") + assert barcodes_files.size() == 1, \ + "Exactly one file should have matched the barcodes files (found: $barcodes_files)." + def barcode + barcodes_files.each{ it -> + assert it.countLines() == 1, + "Expected only one barcode in a single STAR output." + barcode = it.text.trim() + } + def well_id = state.barcode_well_id_mapping[barcode] + assert well_id, "Could not find Well ID in FASTA file for barcode ${barcode}." + def return_state = [ + "${pool}__${well_id}".toString(), + [ + "barcode": barcode, + "well_id": well_id, + "pool": pool, + "n_wells": state.n_wells, + "output_r1": state.input_r1, + "output_r2": state.input_r2, + "well_star_mapping": star_output_dir, + "_meta": ["join_id": pool] + ] + ] + } + return new_events + } + // Parse the file names to obtain metadata about the output + | map{ id, state -> + // Populate the new state + def fastq_files = [state.output_r1, state.output_r2].transpose().findResult{ fastq_pair -> + def (forward_fastq, reverse_fastq) = fastq_pair + def fastq_r1_name = forward_fastq.name + def fastq_r2_name = reverse_fastq.name + // Get the well ID, and also check if it matches between the forward and reverse FASTQ + def well_id = null + [fastq_r1_name, fastq_r2_name].each { file_name -> + def well_id_matcher = file_name =~ /^([A-Za-z0-9]*|unknown)_R?.*/ + assert well_id_matcher, \ + "Could not find Well ID in the name of FASTQ file ($file_name) output from cutadapt." + def current_well_id = well_id_matcher[0][1] + if (!well_id) { + well_id = current_well_id + } else { + assert well_id == current_well_id, + "Well ID for forward and reverse fastq file did not match! File names: ${fastq_r1_name} and ${fastq_r2_name}" + } + } + assert (well_id != null), \ + "No Well ID could be deduced from files ${fastq_r1_name} and ${fastq_r2_name}." + + if (well_id == "unknown" || well_id != state.well_id) { + return null + } + return fastq_pair + } + + def new_state = state + [ + "output_r1": fastq_files[0], + "output_r2": fastq_files[1] + ] + return [id, new_state] + } + | setState(["output_r1", "output_r2", "pool", "well_id", "n_wells", "barcode", "well_star_mapping", "_meta"]) + + emit: + output_ch +} + +// inner workflow hook +def innerWorkflowFactory(args) { + return run_wf +} + +// defaults +meta["defaults"] = [ + // key to be used to trace the process and determine output names + key: null, + + // fixed arguments to be passed to script + args: [:], + + // default directives + directives: readJsonBlob('''{ + "tag" : "$id" +}'''), + + // auto settings + auto: readJsonBlob('''{ + "simplifyInput" : true, + "simplifyOutput" : false, + "transcript" : false, + "publish" : false +}'''), + + // Apply a map over the incoming tuple + // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }` + map: null, + + // Apply a map over the ID element of a tuple (i.e. the first element) + // Example: `{ id -> id + "_foo" }` + mapId: null, + + // Apply a map over the data element of a tuple (i.e. the second element) + // Example: `{ data -> [ input: data.output ] }` + mapData: null, + + // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements) + // Example: `{ pt -> pt.drop(1) }` + mapPassthrough: null, + + // Filter the channel + // Example: `{ tup -> tup[0] == "foo" }` + filter: null, + + // Choose whether or not to run the component on the tuple if the condition is true. + // Otherwise, the tuple will be passed through. + // Example: `{ tup -> tup[0] != "skip_this" }` + runIf: null, + + // Rename keys in the data field of the tuple (i.e. the second element) + // Will likely be deprecated in favour of `fromState`. + // Example: `[ "new_key": "old_key" ]` + renameKeys: null, + + // Fetch data from the state and pass it to the module without altering the current state. + // + // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be passed to the module as is. + // - If it is a `List[String]`, the data will be the values of the state at the given keys. + // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map. + // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data. + // + // Example: `{ id, state -> [input: state.fastq_file] }` + // Default: `null` + fromState: null, + + // Determine how the state should be updated after the module has been run. + // + // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function. + // + // - If it is `null`, the state will be replaced with the output of the module. + // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys. + // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map. + // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state. + // + // Example: `{ id, output, state -> state + [counts: state.output] }` + // Default: `{ id, output, state -> output }` + toState: null, + + // Whether or not to print debug messages + // Default: `false` + debug: false +] + +// initialise default workflow +meta["workflow"] = workflowFactory([key: meta.config.name], meta.defaults, meta) + +// add workflow to environment +nextflow.script.ScriptMeta.current().addDefinition(meta.workflow) + +// anonymous workflow for running this module as a standalone +workflow { + // add id argument if it's not already in the config + // TODO: deep copy + def newConfig = deepClone(meta.config) + def newParams = deepClone(params) + + def argsContainsId = newConfig.allArguments.any{it.plainName == "id"} + if (!argsContainsId) { + def idArg = [ + 'name': '--id', + 'required': false, + 'type': 'string', + 'description': 'A unique id for every entry.', + 'multiple': false + ] + newConfig.arguments.add(0, idArg) + newConfig = processConfig(newConfig) + } + if (!newParams.containsKey("id")) { + newParams.id = "run" + } + + helpMessage(newConfig) + + channelFromParams(newParams, newConfig) + // make sure id is not in the state if id is not in the args + | map {id, state -> + if (!argsContainsId) { + [id, state.findAll{k, v -> k != "id"}] + } else { + [id, state] + } + } + | meta.workflow.run( + auto: [ publish: "state" ] + ) +} + +// END COMPONENT-SPECIFIC CODE diff --git a/target/nextflow/workflows/well_metadata/nextflow.config b/target/nextflow/workflows/well_metadata/nextflow.config new file mode 100644 index 00000000..3878bb52 --- /dev/null +++ b/target/nextflow/workflows/well_metadata/nextflow.config @@ -0,0 +1,125 @@ +manifest { + name = 'workflows/well_metadata' + mainScript = 'main.nf' + nextflowVersion = '!>=20.12.1-edge' + version = 'updatecraftbox' + author = 'Dries Schaumont' +} + +process.container = 'nextflow/bash:latest' + +// detect tempdir +tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' +).toAbsolutePath() + +profiles { + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + docker { + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } +} + +process{ + withLabel: mem1gb { memory = 1000000000.B } + withLabel: mem2gb { memory = 2000000000.B } + withLabel: mem5gb { memory = 5000000000.B } + withLabel: mem10gb { memory = 10000000000.B } + withLabel: mem20gb { memory = 20000000000.B } + withLabel: mem50gb { memory = 50000000000.B } + withLabel: mem100gb { memory = 100000000000.B } + withLabel: mem200gb { memory = 200000000000.B } + withLabel: mem500gb { memory = 500000000000.B } + withLabel: mem1tb { memory = 1000000000000.B } + withLabel: mem2tb { memory = 2000000000000.B } + withLabel: mem5tb { memory = 5000000000000.B } + withLabel: mem10tb { memory = 10000000000000.B } + withLabel: mem20tb { memory = 20000000000000.B } + withLabel: mem50tb { memory = 50000000000000.B } + withLabel: mem100tb { memory = 100000000000000.B } + withLabel: mem200tb { memory = 200000000000000.B } + withLabel: mem500tb { memory = 500000000000000.B } + withLabel: mem1gib { memory = 1073741824.B } + withLabel: mem2gib { memory = 2147483648.B } + withLabel: mem4gib { memory = 4294967296.B } + withLabel: mem8gib { memory = 8589934592.B } + withLabel: mem16gib { memory = 17179869184.B } + withLabel: mem32gib { memory = 34359738368.B } + withLabel: mem64gib { memory = 68719476736.B } + withLabel: mem128gib { memory = 137438953472.B } + withLabel: mem256gib { memory = 274877906944.B } + withLabel: mem512gib { memory = 549755813888.B } + withLabel: mem1tib { memory = 1099511627776.B } + withLabel: mem2tib { memory = 2199023255552.B } + withLabel: mem4tib { memory = 4398046511104.B } + withLabel: mem8tib { memory = 8796093022208.B } + withLabel: mem16tib { memory = 17592186044416.B } + withLabel: mem32tib { memory = 35184372088832.B } + withLabel: mem64tib { memory = 70368744177664.B } + withLabel: mem128tib { memory = 140737488355328.B } + withLabel: mem256tib { memory = 281474976710656.B } + withLabel: mem512tib { memory = 562949953421312.B } + withLabel: cpu1 { cpus = 1 } + withLabel: cpu2 { cpus = 2 } + withLabel: cpu5 { cpus = 5 } + withLabel: cpu10 { cpus = 10 } + withLabel: cpu20 { cpus = 20 } + withLabel: cpu50 { cpus = 50 } + withLabel: cpu100 { cpus = 100 } + withLabel: cpu200 { cpus = 200 } + withLabel: cpu500 { cpus = 500 } + withLabel: cpu1000 { cpus = 1000 } +} + +includeConfig("nextflow_labels.config") diff --git a/target/nextflow/workflows/well_metadata/nextflow_labels.config b/target/nextflow/workflows/well_metadata/nextflow_labels.config new file mode 100644 index 00000000..2821ec46 --- /dev/null +++ b/target/nextflow/workflows/well_metadata/nextflow_labels.config @@ -0,0 +1,108 @@ +executor { + $k8s { + submitRateLimit = '10sec' + pollInterval = '1 sec' + } +} + +process { + container = 'nextflow/bash:latest' + + // default resources + memory = { 8.Gb * task.attempt } + cpus = 8 + maxForks = 36 + + // Retry for exit codes that have something to do with memory issues + errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' } + maxRetries = 3 + maxMemory = 192.GB + + // Resource labels + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 8 } + withLabel: midcpu { cpus = 16 } + withLabel: highcpu { cpus = 32 } + + withLabel: verylowmem { memory = { get_memory( 4.GB * task.attempt ) } } + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 16.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 64.GB * task.attempt ) } } + +} + +profiles { + // detect tempdir + tempDir = java.nio.file.Paths.get( + System.getenv('NXF_TEMP') ?: + System.getenv('VIASH_TEMP') ?: + System.getenv('TEMPDIR') ?: + System.getenv('TMPDIR') ?: + '/tmp' + ).toAbsolutePath() + + mount_temp { + docker.temp = tempDir + podman.temp = tempDir + charliecloud.temp = tempDir + } + + no_publish { + process { + withName: '.*' { + publishDir = [ + enabled: false + ] + } + } + } + + docker { + docker.fixOwnership = true + docker.enabled = true + // docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + + local { + // This config is for local processing. + process { + withName: ".*parallel_map_process" { + maxForks = 1 + } + maxMemory = 25.GB + withLabel: verylowcpu { cpus = 2 } + withLabel: lowcpu { cpus = 4 } + withLabel: midcpu { cpus = 6 } + withLabel: highcpu { cpus = 8 } + + withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } } + withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } } + withLabel: highmem { memory = { get_memory( 20.GB * task.attempt ) } } + } + } +} + +def get_memory(to_compare) { + if (!process.containsKey("maxMemory") || !process.maxMemory) { + return to_compare + } + + try { + if (process.containsKey("maxRetries") && process.maxRetries && task.attempt == (process.maxRetries as int)) { + return process.maxMemory + } + else if (to_compare.compareTo(process.maxMemory as nextflow.util.MemoryUnit) == 1) { + return max_memory as nextflow.util.MemoryUnit + } + else { + return to_compare + } + } catch (all) { + println "Error processing memory resources. Please check that process.maxMemory '${process.maxMemory}' and process.maxRetries '${process.maxRetries}' are valid!" + System.exit(1) + } + } diff --git a/target/nextflow/workflows/well_metadata/nextflow_schema.json b/target/nextflow/workflows/well_metadata/nextflow_schema.json new file mode 100644 index 00000000..b15de895 --- /dev/null +++ b/target/nextflow/workflows/well_metadata/nextflow_schema.json @@ -0,0 +1,123 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "well_metadata", + "description": "No description", + "type": "object", + "$defs": { + "input arguments": { + "title": "Input arguments", + "type": "object", + "description": "No description", + "properties": { + "input_r1": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "List of FASTQ files for the forward reads, 1 per well.\n", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "input_r2": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "List of FASTQ files for the forward reads, 1 per well.\n", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "star_mapping": { + "type": "array", + "items": { + "type": "string" + }, + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `True`, required, direction: `input`. " + }, + "barcodesFasta": { + "type": "string", + "format": "path", + "exists": true, + "description": "", + "help_text": "Type: `file`, multiple: `False`, required, direction: `input`. " + } + } + }, + "output arguments": { + "title": "Output arguments", + "type": "object", + "description": "No description", + "properties": { + "output_r1": { + "type": "string", + "format": "path", + "description": "Output fastq file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_r1\"`, direction: `output`. ", + "default": "$id.$key.output_r1" + }, + "output_r2": { + "type": "string", + "format": "path", + "description": "Output fastq file.", + "help_text": "Type: `file`, multiple: `False`, required, default: `\"$id.$key.output_r2\"`, direction: `output`. ", + "default": "$id.$key.output_r2" + }, + "pool": { + "type": "string", + "description": "The original pool / sample name", + "help_text": "Type: `string`, multiple: `False`. " + }, + "well_id": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`. " + }, + "barcode": { + "type": "string", + "description": "", + "help_text": "Type: `string`, multiple: `False`. " + }, + "n_wells": { + "type": "integer", + "description": "The number of wells in the pool is well is a part of.", + "help_text": "Type: `integer`, multiple: `False`. " + }, + "well_star_mapping": { + "type": "string", + "format": "path", + "description": "", + "help_text": "Type: `file`, multiple: `False`, default: `\"$id.$key.well_star_mapping\"`, direction: `output`. ", + "default": "$id.$key.well_star_mapping" + } + } + }, + "nextflow input-output arguments": { + "title": "Nextflow input-output arguments", + "type": "object", + "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.", + "properties": { + "publish_dir": { + "type": "string", + "description": "Path to an output directory.", + "help_text": "Type: `string`, multiple: `False`, required, example: `\"output/\"`. " + } + } + } + }, + "allOf": [ + { + "$ref": "#/$defs/input arguments" + }, + { + "$ref": "#/$defs/output arguments" + }, + { + "$ref": "#/$defs/nextflow input-output arguments" + } + ] +}