Build branch main with version main (82647a4)

Build pipeline: viash-hub.htrnaseq.main-8kbhw

Source commit: 82647a421d

Source message: Assert that the Well ID matches the required format (#22)
This commit is contained in:
CI
2024-12-17 15:30:58 +00:00
parent f2ff92c6ac
commit 554d5253fe
142 changed files with 12149 additions and 384 deletions

View File

@@ -1,6 +1,20 @@
name: "htrnaseq"
namespace: "workflows"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Input arguments"
arguments:
@@ -148,6 +162,17 @@ argument_groups:
direction: "output"
multiple: false
multiple_sep: ";"
- type: "file"
name: "--html_report"
info: null
default:
- "report.html"
must_exist: true
create_parent: true
required: true
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
@@ -194,6 +219,9 @@ dependencies:
- name: "eset/create_pdata"
repository:
type: "local"
- name: "report/create_report"
repository:
type: "local"
repositories:
- type: "local"
name: "local"
@@ -280,8 +308,8 @@ build_info:
output: "target/nextflow/workflows/htrnaseq"
executable: "target/nextflow/workflows/htrnaseq/main.nf"
viash_version: "0.9.0"
git_commit: "65dd41d8b1b4a307735c72320c96c0880c75f17f"
git_remote: "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
git_commit: "82647a421dae521a9563f7f02050f13a1319eb4a"
git_remote: "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
dependencies:
- "target/nextflow/stats/combine_star_logs"
- "target/nextflow/stats/generate_pool_statistics"
@@ -292,6 +320,7 @@ build_info:
- "target/nextflow/eset/create_eset"
- "target/nextflow/eset/create_fdata"
- "target/nextflow/eset/create_pdata"
- "target/nextflow/report/create_report"
package_config:
name: "htrnaseq"
version: "main"

View File

@@ -8,6 +8,9 @@
// authors of this component should specify the license in the header of such
// files, or include a separate license file detailing the licenses of all included
// files.
//
// Component authors:
// * Dries Schaumont (maintainer)
////////////////////////////
// VDSL3 helper functions //
@@ -2807,6 +2810,29 @@ meta = [
"name" : "htrnaseq",
"namespace" : "workflows",
"version" : "main",
"authors" : [
{
"name" : "Dries Schaumont",
"roles" : [
"maintainer"
],
"info" : {
"links" : {
"email" : "dries@data-intuitive.com",
"github" : "DriesSchaumont",
"orcid" : "0000-0002-4389-0440",
"linkedin" : "dries-schaumont"
},
"organizations" : [
{
"name" : "Data Intuitive",
"href" : "https://www.data-intuitive.com",
"role" : "Data Scientist"
}
]
}
}
],
"argument_groups" : [
{
"name" : "Input arguments",
@@ -2974,6 +3000,19 @@ meta = [
"direction" : "output",
"multiple" : false,
"multiple_sep" : ";"
},
{
"type" : "file",
"name" : "--html_report",
"default" : [
"report.html"
],
"must_exist" : true,
"create_parent" : true,
"required" : true,
"direction" : "output",
"multiple" : false,
"multiple_sep" : ";"
}
]
}
@@ -3059,6 +3098,12 @@ meta = [
"repository" : {
"type" : "local"
}
},
{
"name" : "report/create_report",
"repository" : {
"type" : "local"
}
}
],
"repositories" : [
@@ -3165,8 +3210,8 @@ meta = [
"engine" : "native|native",
"output" : "target/nextflow/workflows/htrnaseq",
"viash_version" : "0.9.0",
"git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
"git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
"git_commit" : "82647a421dae521a9563f7f02050f13a1319eb4a",
"git_remote" : "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
},
"package_config" : {
"name" : "htrnaseq",
@@ -3218,6 +3263,7 @@ include { groupWells } from "${meta.resources_dir}/../../../nextflow/workflows/u
include { create_eset } from "${meta.resources_dir}/../../../nextflow/eset/create_eset/main.nf"
include { create_fdata } from "${meta.resources_dir}/../../../nextflow/eset/create_fdata/main.nf"
include { create_pdata } from "${meta.resources_dir}/../../../nextflow/eset/create_pdata/main.nf"
include { create_report } from "${meta.resources_dir}/../../../nextflow/report/create_report/main.nf"
// inner workflow
// user-provided Nextflow code
@@ -3238,21 +3284,6 @@ workflow run_wf {
// Perform mapping of each well. The input here are events per pool,
// the output channel is one event per well.
mapping_ch = input_ch
| map {id, state ->
def n_barcodes = state.barcodesFasta.countFasta() as int
def newState = state + ["n_barcodes": n_barcodes]
// The header is the full header, the id is the part header up to the first whitespace character
// We do not allow whitespace in the header of the fasta file, so assert this.
def fasta_entries = state.barcodesFasta.splitFasta(record: ["id": true, "header": true, "seqString": true])
assert fasta_entries.every{it.id == it.header}, "The barcodes FASTA headers must not contain any whitespace!"
// Check if the fasta headers are unique
def fasta_ids = fasta_entries.collect{it.id}
assert fasta_ids.clone().unique() == fasta_ids, "The barcodes FASTA entries must have a unique name!"
// Check if the sequences are unique
def fasta_sequences = fasta_entries.collect{it.seqString}
assert fasta_sequences.clone().unique() == fasta_sequences, "The barcodes FASTA sequences must be unique!"
[id, newState]
}
| well_demultiplex.run(
fromState: [
"input_r1": "input_r1",
@@ -3264,7 +3295,7 @@ workflow run_wf {
def filtered_results = result.findAll{!["output_r1", "output_r2"].contains(it.key)}
def new_state = filtered_input + filtered_results + [
"fastq_output_r1": result.output_r1,
"fastq_output_r2": result.output_r2,
"fastq_output_r2": result.output_r2,
]
return new_state
}
@@ -3292,6 +3323,7 @@ workflow run_wf {
[
"input": state.star_output.resolve('Aligned.sortedByCoord.out.bam'),
"barcode": state.barcode,
"well_id": state.well_id,
]
},
toState: [
@@ -3302,7 +3334,7 @@ workflow run_wf {
// Create a special groupKey, such that groupTuple
// knows when all the barcodes have been grouped into 1 event.
// This way the processing is as distributed as possible.
def key = groupKey(state.pool, state.n_barcodes)
def key = groupKey(state.pool, state.n_wells)
def newEvent = [key, state]
return newEvent
}
@@ -3318,10 +3350,14 @@ workflow run_wf {
def barcodes = states.collect{it.barcode}
assert barcodes.clone().unique().size() == barcodes.size(), \
"Error when gathering information for pool ${id}, barcodes are not unique!"
def well_ids = states.collect{it.well_id}
assert well_ids.clone().unique().size() == well_ids.size(), \
"Error when gathering information for pool ${id}, well IDs are not unique!"
def custom_state = [
"fastq_output_r1": states.collect{it.fastq_output_r1[0]},
"fastq_output_r2": states.collect{it.fastq_output_r2[0]},
"barcode": barcodes,
"well_id": well_ids,
"star_output": states.collect{it.star_output},
"nrReadsNrGenesPerChrom": states.collect{it.nrReadsNrGenesPerChrom},
]
@@ -3390,7 +3426,7 @@ workflow run_wf {
toState: ["p_data": "output"],
)
output_ch = p_data_ch.join(f_data_ch, remainder: true)
eset_ch = p_data_ch.join(f_data_ch, remainder: true)
| map {id, p_data_state, f_data_state ->
def newState = p_data_state + ["f_data": f_data_state["f_data"]]
[id, newState]
@@ -3409,6 +3445,36 @@ workflow run_wf {
"eset": "output",
]
)
report_channel = eset_ch
| toSortedList()
| map {ids_and_states ->
def states = ids_and_states.collect{it[1]}
def html_report = states[0].html_report
def ids = ids_and_states.collect{it[0]}
def esets = states.collect{it.eset}
["report", ["esets": esets, "html_report": html_report, "original_ids": ids]]
}
| create_report.run(
fromState: [
"eset": "esets",
"output_report": "html_report",
],
toState: [
"html_report": "output_report"
]
)
| flatMap {id, state ->
state.original_ids.collect{original_id ->
[original_id, ["html_report": state.html_report]]
}
}
output_ch = eset_ch.join(report_channel)
| map {id, state_eset, state_report ->
def new_state = state_eset + ["html_report": state_report.html_report]
[id, new_state]
}
| setState([
"star_output": "star_output",
"fastq_output_r1": "fastq_output_r1",
@@ -3418,7 +3484,8 @@ workflow run_wf {
"star_qc_metrics": "star_qc_metrics",
"eset": "eset",
"f_data": "f_data",
"p_data": "p_data"
"p_data": "p_data",
"html_report": "html_report",
])

View File

@@ -3,6 +3,7 @@ manifest {
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = 'main'
author = 'Dries Schaumont'
}
process.container = 'nextflow/bash:latest'

View File

@@ -70,11 +70,14 @@ profiles {
local {
// This config is for local processing.
process {
withName: ".*parallel_map_process" {
maxForks = 1
}
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
withLabel: highcpu { cpus = 8 }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }

View File

@@ -80,7 +80,7 @@
"description": "Type: List of `file`, required, default: `$id.$key.fastq_output_r1_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files",
"help_text": "Type: List of `file`, required, default: `$id.$key.fastq_output_r1_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files"
,
"default": "$id.$key.fastq_output_r1_*.fastq"
"default":"$id.$key.fastq_output_r1_*.fastq"
}
@@ -91,7 +91,7 @@
"description": "Type: List of `file`, required, default: `$id.$key.fastq_output_r2_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files",
"help_text": "Type: List of `file`, required, default: `$id.$key.fastq_output_r2_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files"
,
"default": "$id.$key.fastq_output_r2_*.fastq"
"default":"$id.$key.fastq_output_r2_*.fastq"
}
@@ -102,7 +102,7 @@
"description": "Type: List of `file`, required, default: `$id.$key.star_output_*.star_output_*`, multiple_sep: `\";\"`. Output from mapping with STAR",
"help_text": "Type: List of `file`, required, default: `$id.$key.star_output_*.star_output_*`, multiple_sep: `\";\"`. Output from mapping with STAR"
,
"default": "$id.$key.star_output_*.star_output_*"
"default":"$id.$key.star_output_*.star_output_*"
}
@@ -113,7 +113,7 @@
"description": "Type: `file`, required, default: `$id.$key.nrReadsNrGenesPerChrom.txt`. ",
"help_text": "Type: `file`, required, default: `$id.$key.nrReadsNrGenesPerChrom.txt`. "
,
"default": "$id.$key.nrReadsNrGenesPerChrom.txt"
"default":"$id.$key.nrReadsNrGenesPerChrom.txt"
}
@@ -124,7 +124,7 @@
"description": "Type: `file`, required, default: `$id.$key.star_qc_metrics.txt`. ",
"help_text": "Type: `file`, required, default: `$id.$key.star_qc_metrics.txt`. "
,
"default": "$id.$key.star_qc_metrics.txt"
"default":"$id.$key.star_qc_metrics.txt"
}
@@ -135,7 +135,7 @@
"description": "Type: `file`, required, default: `$id.$key.eset.rds`. ",
"help_text": "Type: `file`, required, default: `$id.$key.eset.rds`. "
,
"default": "$id.$key.eset.rds"
"default":"$id.$key.eset.rds"
}
@@ -146,7 +146,7 @@
"description": "Type: `file`, required, default: `$id.$key.f_data.tsv`. ",
"help_text": "Type: `file`, required, default: `$id.$key.f_data.tsv`. "
,
"default": "$id.$key.f_data.tsv"
"default":"$id.$key.f_data.tsv"
}
@@ -157,7 +157,18 @@
"description": "Type: `file`, required, default: `$id.$key.p_data.tsv`. ",
"help_text": "Type: `file`, required, default: `$id.$key.p_data.tsv`. "
,
"default": "$id.$key.p_data.tsv"
"default":"$id.$key.p_data.tsv"
}
,
"html_report": {
"type":
"string",
"description": "Type: `file`, required, default: `$id.$key.html_report.html`. ",
"help_text": "Type: `file`, required, default: `$id.$key.html_report.html`. "
,
"default":"$id.$key.html_report.html"
}

View File

@@ -1,6 +1,20 @@
name: "parallel_map_wf"
namespace: "workflows"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Arguments"
arguments:
@@ -66,7 +80,7 @@ description: "Map RNA sequencing data, provided as fastq files (paired-end) to a
\ genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning\
\ that a single fastq pair provides data for\none barcode (one well). Multiple wells\
\ can be mapped in parallel by providing multiple events to the \nworkflow. Output\
\ is provided as mapped output per pool, i.e. one output is provided per pool.xx\n"
\ is provided as mapped output per pool, i.e. one output is provided per pool.\n"
info: null
status: "enabled"
requirements:
@@ -161,8 +175,8 @@ build_info:
output: "target/nextflow/workflows/parallel_map_wf"
executable: "target/nextflow/workflows/parallel_map_wf/main.nf"
viash_version: "0.9.0"
git_commit: "65dd41d8b1b4a307735c72320c96c0880c75f17f"
git_remote: "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
git_commit: "82647a421dae521a9563f7f02050f13a1319eb4a"
git_remote: "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
dependencies:
- "target/nextflow/parallel_map"
- "target/nextflow/workflows/utils/groupWells"

View File

@@ -8,6 +8,9 @@
// authors of this component should specify the license in the header of such
// files, or include a separate license file detailing the licenses of all included
// files.
//
// Component authors:
// * Dries Schaumont (maintainer)
////////////////////////////
// VDSL3 helper functions //
@@ -2807,6 +2810,29 @@ meta = [
"name" : "parallel_map_wf",
"namespace" : "workflows",
"version" : "main",
"authors" : [
{
"name" : "Dries Schaumont",
"roles" : [
"maintainer"
],
"info" : {
"links" : {
"email" : "dries@data-intuitive.com",
"github" : "DriesSchaumont",
"orcid" : "0000-0002-4389-0440",
"linkedin" : "dries-schaumont"
},
"organizations" : [
{
"name" : "Data Intuitive",
"href" : "https://www.data-intuitive.com",
"role" : "Data Scientist"
}
]
}
}
],
"argument_groups" : [
{
"name" : "Arguments",
@@ -2883,7 +2909,7 @@ meta = [
"dest" : "nextflow_labels.config"
}
],
"description" : "Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.xx\n",
"description" : "Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.\n",
"status" : "enabled",
"requirements" : {
"commands" : [
@@ -3002,8 +3028,8 @@ meta = [
"engine" : "native|native",
"output" : "target/nextflow/workflows/parallel_map_wf",
"viash_version" : "0.9.0",
"git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
"git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
"git_commit" : "82647a421dae521a9563f7f02050f13a1319eb4a",
"git_remote" : "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
},
"package_config" : {
"name" : "htrnaseq",

View File

@@ -3,7 +3,8 @@ manifest {
mainScript = 'main.nf'
nextflowVersion = '!>=20.12.1-edge'
version = 'main'
description = 'Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.xx\n'
description = 'Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.\n'
author = 'Dries Schaumont'
}
process.container = 'nextflow/bash:latest'

View File

@@ -70,11 +70,14 @@ profiles {
local {
// This config is for local processing.
process {
withName: ".*parallel_map_process" {
maxForks = 1
}
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
withLabel: highcpu { cpus = 8 }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }

View File

@@ -1,7 +1,7 @@
{
"$schema": "http://json-schema.org/draft-07/schema",
"title": "parallel_map_wf",
"description": "Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.xx\n",
"description": "Map RNA sequencing data, provided as fastq files (paired-end) to a reference genome using STAR Solo.\nInput data must have been demultiplexed beforehand, meaning that a single fastq pair provides data for\none barcode (one well). Multiple wells can be mapped in parallel by providing multiple events to the \nworkflow. Output is provided as mapped output per pool, i.e. one output is provided per pool.\n",
"type": "object",
"definitions": {
@@ -70,7 +70,7 @@
"description": "Type: `file`, required, default: `$id.$key.output.output`. ",
"help_text": "Type: `file`, required, default: `$id.$key.output.output`. "
,
"default": "$id.$key.output.output"
"default":"$id.$key.output.output"
}

View File

@@ -1,6 +1,20 @@
name: "groupWells"
namespace: "workflows/utils"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
argument_groups:
- name: "Inputs"
arguments:
@@ -171,8 +185,8 @@ build_info:
output: "target/nextflow/workflows/utils/groupWells"
executable: "target/nextflow/workflows/utils/groupWells/main.nf"
viash_version: "0.9.0"
git_commit: "65dd41d8b1b4a307735c72320c96c0880c75f17f"
git_remote: "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
git_commit: "82647a421dae521a9563f7f02050f13a1319eb4a"
git_remote: "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
package_config:
name: "htrnaseq"
version: "main"

View File

@@ -8,6 +8,9 @@
// authors of this component should specify the license in the header of such
// files, or include a separate license file detailing the licenses of all included
// files.
//
// Component authors:
// * Dries Schaumont (maintainer)
////////////////////////////
// VDSL3 helper functions //
@@ -2807,6 +2810,29 @@ meta = [
"name" : "groupWells",
"namespace" : "workflows/utils",
"version" : "main",
"authors" : [
{
"name" : "Dries Schaumont",
"roles" : [
"maintainer"
],
"info" : {
"links" : {
"email" : "dries@data-intuitive.com",
"github" : "DriesSchaumont",
"orcid" : "0000-0002-4389-0440",
"linkedin" : "dries-schaumont"
},
"organizations" : [
{
"name" : "Data Intuitive",
"href" : "https://www.data-intuitive.com",
"role" : "Data Scientist"
}
]
}
}
],
"argument_groups" : [
{
"name" : "Inputs",
@@ -3013,8 +3039,8 @@ meta = [
"engine" : "native",
"output" : "target/nextflow/workflows/utils/groupWells",
"viash_version" : "0.9.0",
"git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
"git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
"git_commit" : "82647a421dae521a9563f7f02050f13a1319eb4a",
"git_remote" : "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
},
"package_config" : {
"name" : "htrnaseq",

View File

@@ -4,6 +4,7 @@ manifest {
nextflowVersion = '!>=20.12.1-edge'
version = 'main'
description = 'N/A\n'
author = 'Dries Schaumont'
}
process.container = 'nextflow/bash:latest'

View File

@@ -70,11 +70,14 @@ profiles {
local {
// This config is for local processing.
process {
withName: ".*parallel_map_process" {
maxForks = 1
}
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
withLabel: highcpu { cpus = 8 }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }

View File

@@ -80,7 +80,7 @@
"description": "Type: List of `file`, default: `$id.$key.output_r1_*.output_r1_*`, multiple_sep: `\";\"`. Path to output for R1",
"help_text": "Type: List of `file`, default: `$id.$key.output_r1_*.output_r1_*`, multiple_sep: `\";\"`. Path to output for R1"
,
"default": "$id.$key.output_r1_*.output_r1_*"
"default":"$id.$key.output_r1_*.output_r1_*"
}
@@ -91,7 +91,7 @@
"description": "Type: List of `file`, default: `$id.$key.output_r2_*.output_r2_*`, multiple_sep: `\";\"`. Path to the output for R2",
"help_text": "Type: List of `file`, default: `$id.$key.output_r2_*.output_r2_*`, multiple_sep: `\";\"`. Path to the output for R2"
,
"default": "$id.$key.output_r2_*.output_r2_*"
"default":"$id.$key.output_r2_*.output_r2_*"
}

View File

@@ -1,6 +1,32 @@
name: "well_demultiplex"
namespace: "workflows"
version: "main"
authors:
- name: "Dries Schaumont"
roles:
- "maintainer"
info:
links:
email: "dries@data-intuitive.com"
github: "DriesSchaumont"
orcid: "0000-0002-4389-0440"
linkedin: "dries-schaumont"
organizations:
- name: "Data Intuitive"
href: "https://www.data-intuitive.com"
role: "Data Scientist"
- name: "Marijke Van Moerbeke"
roles:
- "contributor"
info:
links:
github: "mvanmoerbeke"
orcid: "0000-0002-3097-5621"
linkedin: "marijke-van-moerbeke-84303a34"
organizations:
- name: "OpenAnalytics"
href: "https://www.openanalytics.eu"
role: "Statistical Consultant"
argument_groups:
- name: "Input arguments"
arguments:
@@ -71,6 +97,13 @@ argument_groups:
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--well_id"
info: null
required: false
direction: "output"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--barcode"
info: null
@@ -92,6 +125,14 @@ argument_groups:
direction: "output"
multiple: false
multiple_sep: ";"
- type: "integer"
name: "--n_wells"
description: "The number of wells in the pool this well is a part of."
info: null
required: false
direction: "output"
multiple: false
multiple_sep: ";"
resources:
- type: "nextflow_script"
path: "main.nf"
@@ -106,6 +147,10 @@ test_resources:
path: "test.nf"
is_executable: true
entrypoint: "test_wf"
- type: "nextflow_script"
path: "test.nf"
is_executable: true
entrypoint: "test_wf2"
info: null
status: "enabled"
requirements:
@@ -210,8 +255,8 @@ build_info:
output: "target/nextflow/workflows/well_demultiplex"
executable: "target/nextflow/workflows/well_demultiplex/main.nf"
viash_version: "0.9.0"
git_commit: "65dd41d8b1b4a307735c72320c96c0880c75f17f"
git_remote: "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
git_commit: "82647a421dae521a9563f7f02050f13a1319eb4a"
git_remote: "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
dependencies:
- "target/dependencies/vsh/vsh/biobox/main/nextflow/cutadapt"
- "target/dependencies/vsh/vsh/craftbox/v0.1.0/nextflow/concat_text"

View File

@@ -8,6 +8,10 @@
// authors of this component should specify the license in the header of such
// files, or include a separate license file detailing the licenses of all included
// files.
//
// Component authors:
// * Dries Schaumont (maintainer)
// * Marijke Van Moerbeke (contributor)
////////////////////////////
// VDSL3 helper functions //
@@ -2807,6 +2811,49 @@ meta = [
"name" : "well_demultiplex",
"namespace" : "workflows",
"version" : "main",
"authors" : [
{
"name" : "Dries Schaumont",
"roles" : [
"maintainer"
],
"info" : {
"links" : {
"email" : "dries@data-intuitive.com",
"github" : "DriesSchaumont",
"orcid" : "0000-0002-4389-0440",
"linkedin" : "dries-schaumont"
},
"organizations" : [
{
"name" : "Data Intuitive",
"href" : "https://www.data-intuitive.com",
"role" : "Data Scientist"
}
]
}
},
{
"name" : "Marijke Van Moerbeke",
"roles" : [
"contributor"
],
"info" : {
"links" : {
"github" : "mvanmoerbeke",
"orcid" : "0000-0002-3097-5621",
"linkedin" : "marijke-van-moerbeke-84303a34"
},
"organizations" : [
{
"name" : "OpenAnalytics",
"href" : "https://www.openanalytics.eu",
"role" : "Statistical Consultant"
}
]
}
}
],
"argument_groups" : [
{
"name" : "Input arguments",
@@ -2885,6 +2932,14 @@ meta = [
"multiple" : false,
"multiple_sep" : ";"
},
{
"type" : "string",
"name" : "--well_id",
"required" : false,
"direction" : "output",
"multiple" : false,
"multiple_sep" : ";"
},
{
"type" : "string",
"name" : "--barcode",
@@ -2908,6 +2963,15 @@ meta = [
"direction" : "output",
"multiple" : false,
"multiple_sep" : ";"
},
{
"type" : "integer",
"name" : "--n_wells",
"description" : "The number of wells in the pool this well is a part of.",
"required" : false,
"direction" : "output",
"multiple" : false,
"multiple_sep" : ";"
}
]
}
@@ -2932,6 +2996,12 @@ meta = [
"path" : "test.nf",
"is_executable" : true,
"entrypoint" : "test_wf"
},
{
"type" : "nextflow_script",
"path" : "test.nf",
"is_executable" : true,
"entrypoint" : "test_wf2"
}
],
"status" : "enabled",
@@ -3064,8 +3134,8 @@ meta = [
"engine" : "native|native",
"output" : "target/nextflow/workflows/well_demultiplex",
"viash_version" : "0.9.0",
"git_commit" : "65dd41d8b1b4a307735c72320c96c0880c75f17f",
"git_remote" : "https://x-access-token:ghs_McZDF0yobnnHmOEb2Q4JaaB3pzr9mz1VbIOs@github.com/viash-hub/htrnaseq"
"git_commit" : "82647a421dae521a9563f7f02050f13a1319eb4a",
"git_remote" : "https://x-access-token:ghs_GvoC19gNBNw8DS3yDc8aa44laHZP4K2GBiY3@github.com/viash-hub/htrnaseq"
},
"package_config" : {
"name" : "htrnaseq",
@@ -3119,18 +3189,73 @@ workflow run_wf {
main:
output_ch = input_ch
/*
Parse the fasta file containing the barcodes and do the following:
- The sequence headers must not contain any whitespaces
- The headers (Well IDs) must be unique
- The barcodes must be unique
- Store the number of barcodes in the state
- Add a barcode to well ID (header) mapping to the state,
in order to be able to retrieve the well ID based on the FASTQ name after well demultiplexing
*/
| map {id, state ->
def n_wells = state.barcodesFasta.countFasta() as int
// The header is the full header, the id is the part of the header up to the first whitespace character
// We do not allow whitespace in the header of the fasta file, so assert this.
def fasta_entries = state.barcodesFasta.splitFasta(
record: ["id": true, "header": true, "seqString": true]
)
assert fasta_entries.every{it.id == it.header}, \
"The barcodes FASTA headers must not contain any whitespace!"
// Check if the fasta headers are unique
def fasta_ids = fasta_entries.collect{it.id}
assert fasta_ids.clone().unique() == fasta_ids, \
"The barcodes FASTA entries must have a unique name!"
// Check if the sequences are unique
def fasta_sequences = fasta_entries.collect{it.seqString}
assert fasta_sequences.clone().unique() == fasta_sequences, \
"The barcodes FASTA sequences must be unique!"
def well_id_matcher = /^([A-Za-z]+)0*([1-9]?[0-9]+)$/
def entries_corrected_id = fasta_entries.collectEntries { it ->
def unformatted_id = it.header
def id_matched_to_format = unformatted_id =~ well_id_matcher
assert (id_matched_to_format && id_matched_to_format.getCount() == 1), \
"The FASTA headers must match the coordinate system of a well plate (e.g. A01, B01, ... or AA1, AB1, ...). Found: ${unformatted_id}"
def id_letters = id_matched_to_format[0][1].toUpperCase()
def id_numbers = id_matched_to_format[0][2]
["${id_letters}${id_numbers}", it.seqString]
}
def newState = state + [
"n_wells": n_wells,
"well_id_barcode_mapping": entries_corrected_id,
]
[id, newState]
}
/*
For each pool (i.e. event) in the channel, a list of R1 and R2 input
reads is provided which correspond to the lanes. If there are multiple lanes,
we can demultiplex into the wells for each lane in parallel. Therefore, cutadapt
must be started multiple times and we need an event per lane. The events are
created by taking the R1 and R2 pairs from the input lists. The index of the elements
in these lists are added to the ID in order to make them unique.
*/
| flatMap {id, state ->
assert state.input_r1.size() == state.input_r2.size(), "Expected equal number of inputs for R1 and R2"
assert state.input_r1.size() == state.input_r2.size(), \
"Expected equal number of inputs for R1 and R2"
// Store the number of lanes that were encountered here in order to
// group them together in an asynchronous manner later by providing
// the expected number of events to be grouped to groupTuple.
// see https://www.nextflow.io/docs/latest/reference/operator.html#grouptuple
def n_lanes = state.input_r1.size()
[state.input_r1, state.input_r2].transpose().withIndex().collect{ input_pair, index ->
def single_input_r1 = input_pair[0]
def single_input_r2 = input_pair[1]
def newState = state + ["input_r1": single_input_r1,
"input_r2": single_input_r2,
"pool": id,
def newState = state + ["input_r1": single_input_r1,
"input_r2": single_input_r2,
"pool": id,
"lane_sorting": index,
"n_lanes": n_lanes]
def newId = id + "_" + index
def newId = id + "_" + index
[newId, newState]
}
}
@@ -3151,10 +3276,12 @@ workflow run_wf {
},
toState: { id, result, state ->
def newState = [
pool: state.pool,
n_lanes: state.n_lanes,
output: result.output,
lane_sorting: state.lane_sorting,
"pool": state.pool,
"n_lanes": state.n_lanes,
"output": result.output,
"lane_sorting": state.lane_sorting,
"n_wells": state.n_wells,
"well_id_barcode_mapping": state.well_id_barcode_mapping,
]
return newState
}
@@ -3163,58 +3290,129 @@ workflow run_wf {
| flatMap{ id, state ->
def pool = state.pool
state.output.collect{ p ->
def barcode = (p =~ /.*\\/([ACTG]*|unknown)_R?.*/)[0][1]
def pair_end = (p =~ /.*_(R[12])_.*/)[0][1]
def lane = (p =~ /.*_(L\d+).*/) ? (p =~ /.*_(L\d+).*/)[0][1] : "NA"
def new_id = pool + "__" + barcode
def group_key = groupKey(new_id, state.n_lanes * 2)
// Extract the Well ID from the name of the FASTQ file written by cutadapt:
// the alphanumeric token (or the literal 'unknown') that follows a '/'
// and precedes an underscore.
def well_id_matcher = p =~ /.*\\/([A-Za-z0-9]*|unknown)_R?.*/
assert well_id_matcher, \
  "Could not find Well ID in the name of FASTQ file ($p) output from cutadapt."
def well_id = well_id_matcher[0][1]
// Note: set the barcode to 'null' for reads that were put into 'unknown'.
// The safe-navigation operator ('?.') prevents a NullPointerException when
// the lookup of the Well ID in the mapping yields null, so that the assertion
// below is what reports the problem (presumably the mapping is a Map keyed
// by Well ID -- confirm against the component that populates the state).
// Any character that is not a nucleotide letter is stripped from the barcode.
def barcode = (well_id != "unknown") ? state.well_id_barcode_mapping[well_id]?.replaceAll("[^ACGTacgt]", "") : null
assert (well_id == "unknown") || (barcode != null), \
  "After demultiplexing, no barcode could be retrieved for Well ID ${well_id}."
// The read orientation ('R1' or 'R2') must be present in the file name.
def pair_end_matcher = p =~ /.*_(R[12])_.*/
assert pair_end_matcher, \
  "Could not find read orientation information in the name of the FASTQ file ($p) output from cutadapt."
def pair_end = pair_end_matcher[0][1]
// The lane ('L' followed by digits) is optional; "NA" is used when absent.
def lane_matcher = p =~ /.*_(L\d+).*/
def lane = lane_matcher ? lane_matcher[0][1] : "NA"
// Events are identified by the combination of pool and well.
def new_id = pool + "__" + well_id
[
group_key,
new_id,
[
pool: pool,
barcode: barcode,
output: p,
lane: lane,
pair_end: pair_end,
lane_sorting: state.lane_sorting,
_meta: [ join_id: pool ]
"pool": pool,
"barcode": barcode,
"well_id": well_id,
"output": p,
"lane": lane,
"n_wells": state.n_wells,
"pair_end": pair_end,
"n_lanes": state.n_lanes,
"lane_sorting": state.lane_sorting,
"_meta": [ "join_id": pool ]
]
]
}
}
// Group the outputs from across lanes
| groupTuple(sort: "hash")
/*
At this point, the events are provided on the smallest possible level,
as each event represents the reads for a certain orientation from a
particular lane and a single well. Here, we join these events back together
on well level, gathering FASTQS across the lanes and read orientations.
In order to make this joining as efficient as possible, the number of
lanes which are expected to be gathered were stored in the state earlier.
This way, the processing of a well can continue as soon as all of
the lanes have been gathered. The number of lanes times 2 (forward
and reverse orientation) represents the total number of FASTQS (events)
to be included for a certain well.
*/
// Replace the plain ID with a group key that carries the number of events
// (number of lanes times two read orientations) expected for this ID.
| map { id, state ->
    [groupKey(id, state.n_lanes * 2), state]
}
| groupTuple(sort: { left, right ->
    // Order the grouped states by lane position first and by read
    // orientation second, so that forward and reverse FASTQs of a lane sit
    // next to each other with the forward reads in front.
    // '<=>' evaluates to 0 for equal lane positions; the Elvis operator
    // treats 0 as false and falls through to the orientation comparison.
    (left.lane_sorting <=> right.lane_sorting) ?: (left.pair_end <=> right.pair_end)
})
| map {_, states ->
def r1_output = states.findAll{ it.pair_end == "R1" }.collect{it.output}
def r2_output = states.findAll{ it.pair_end == "R2" }.collect{it.output}
def lane_sorting_r1 = states.findAll{ it.pair_end == "R1" }.collect{it.lane_sorting}
def lane_sorting_r2 = states.findAll{ it.pair_end == "R2" }.collect{it.lane_sorting}
// The states are in one long flat list, group them into pairs
// This assumes that the FASTQ files are already in order!
// (See the 'sort' argument of groupTuple above)
def output_pairs = states.collate(2)
// At this point, lane_sorting holds the positions that the items in r1_output and r2_output
// should take in a new list.
def r1_output_sorted = new ArrayList(r1_output.size())
def r2_output_sorted = new ArrayList(r2_output.size())
lane_sorting_r1.eachWithIndex { pos, index ->
r1_output_sorted[pos] = r1_output[index]
}
lane_sorting_r2.eachWithIndex { pos, index ->
r2_output_sorted[pos] = r2_output[index]
// Sanity check the state: every entry must be a forward/reverse pair whose
// two members agree on all attributes that identify the well and the lane.
output_pairs.each{ pair ->
    assert pair.size() == 2, \
      "State error: expected FASTQ pairs as output from cutadapt, " +
      "found output state: $pair"
    def (first, second) = pair
    // Keys of the state that must hold the same value for both mates.
    def should_be_the_same = [
        "barcode",
        "well_id",
        "lane",
        "pool",
        "lane_sorting",
    ]
    should_be_the_same.each { attr_to_check ->
        // Declared with 'def' so that the values stay local to this closure
        // instead of becoming script-level variables shared between
        // concurrently processed events.
        def first_attr = first.get(attr_to_check)
        def second_attr = second.get(attr_to_check)
        assert first_attr == second_attr, \
          "State error: expected FASTQ pairs from cutadapt to have " +
          "the same detected ${attr_to_check}. Found: " +
          "$first_attr and $second_attr"
    }
    // Forward and reverse reads should be designated
    // by 'R1' and 'R2', and sorted lexicographically.
    assert first.pair_end == "R1", \
      "State error: expected first item from FASTQ pair to have " +
      "orientation 'R1', found $first.pair_end"
    assert second.pair_end == "R2", \
      "State error: expected second item from FASTQ pair to have " +
      "orientation 'R2', found $second.pair_end"
}
// Split the pairs back into one list per read orientation; the position in
// each list corresponds to the position of the pair.
def r1_output = output_pairs.collect{it[0].output}
def r2_output = output_pairs.collect{it[1].output}
assert r1_output.size() == r2_output.size()
/* The lane sorting represents the order of the FASTQ files
as provided by the input. The order of the FASTQ files should
remain the same in the well output. This is because the result of STAR
can differ based on the order of the reads in the FASTQ file.
Even when the same reads are provided, the order of them matters.
*/
// Collect one lane position per pair, taken from the first member of the
// pair. An explicit 'collect' is required here: 'output_pairs.it' would be a
// property lookup named 'it' on the list rather than a closure parameter, and
// would not produce the lane positions that the check below relies on.
def lane_sorting = output_pairs.collect{ it[0].lane_sorting }
// Each position must be greater than or equal to the one before it.
def sorting_is_monotonically_increasing = lane_sorting.withIndex().every { i, idx ->
    idx == 0 || lane_sorting[idx - 1] <= i
}
assert sorting_is_monotonically_increasing, \
  "State error: expected the order of the FASTQ files after grouping " +
  "the cutadapt output to be the same as the order in the input. " +
  "Found sorting $lane_sorting, R1 output: $r1_output, R2 output: $r2_output."
// Here we pick the state from the first item in the list of states
// and overwrite the keys which are different across states
// TODO: we can assert that these keys are the same
def first_state = states[0]
def new_id = first_state.pool + "__" + first_state.barcode
def new_state = first_state + ["output_r1": r1_output_sorted, "output_r2": r2_output_sorted]
def new_id = first_state.pool + "__" + first_state.well_id
def new_state = first_state + ["output_r1": r1_output, "output_r2": r2_output]
[new_id, new_state]
}
// TODO: Expand this into matching a whitelist/blacklist of barcodes
// ... and turn into separate component
| filter{ id, state -> state.barcode != "unknown" }
| filter{ id, state -> state.well_id != "unknown" }
| concat_text.run(
directives: [label: ["lowmem", "lowcpu"]],
key: "concat_txt_r1",
@@ -3247,7 +3445,7 @@ workflow run_wf {
return newState
}
)
| setState(["pool", "barcode", "lane", "_meta", "output_r1", "output_r2"])
| setState(["pool", "well_id", "n_wells", "barcode", "lane", "_meta", "output_r1", "output_r2"])
emit:
output_ch

View File

@@ -4,6 +4,7 @@ manifest {
nextflowVersion = '!>=20.12.1-edge'
version = 'main'
description = 'Demultiplexing on well level'
author = 'Dries Schaumont, Marijke Van Moerbeke'
}
process.container = 'nextflow/bash:latest'

View File

@@ -70,11 +70,14 @@ profiles {
local {
// This config is for local processing.
process {
withName: ".*parallel_map_process" {
maxForks = 1
}
maxMemory = 25.GB
withLabel: verylowcpu { cpus = 2 }
withLabel: lowcpu { cpus = 4 }
withLabel: midcpu { cpus = 6 }
withLabel: highcpu { cpus = 12 }
withLabel: highcpu { cpus = 8 }
withLabel: lowmem { memory = { get_memory( 8.GB * task.attempt ) } }
withLabel: midmem { memory = { get_memory( 12.GB * task.attempt ) } }

View File

@@ -60,7 +60,7 @@
"description": "Type: List of `file`, required, default: `$id.$key.output_r1_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files",
"help_text": "Type: List of `file`, required, default: `$id.$key.output_r1_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files"
,
"default": "$id.$key.output_r1_*.fastq"
"default":"$id.$key.output_r1_*.fastq"
}
@@ -71,7 +71,7 @@
"description": "Type: List of `file`, required, default: `$id.$key.output_r2_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files",
"help_text": "Type: List of `file`, required, default: `$id.$key.output_r2_*.fastq`, multiple_sep: `\";\"`. List of demultiplexed fastq files"
,
"default": "$id.$key.output_r2_*.fastq"
"default":"$id.$key.output_r2_*.fastq"
}
@@ -85,6 +85,16 @@
}
,
"well_id": {
"type":
"string",
"description": "Type: `string`. ",
"help_text": "Type: `string`. "
}
,
"barcode": {
"type":
@@ -115,6 +125,16 @@
}
,
"n_wells": {
"type":
"integer",
"description": "Type: `integer`. The number of wells in the pool this well is a part of",
"help_text": "Type: `integer`. The number of wells in the pool this well is a part of."
}
}
},