Build branch main with version main (900f5ed)

Build pipeline: viash-hub.htrnaseq.main-2ww7f

Source commit: 900f5ed356

Source message: Runner: allow multiple input directories. (#38)
This commit is contained in:
CI
2025-02-13 16:08:09 +00:00
parent 34f9a5c19d
commit 9eac6fff21
90 changed files with 1650 additions and 5016 deletions

View File

@@ -32,6 +32,13 @@ argument_groups:
arguments:
- type: "file"
name: "--input_r1"
description: "Input FASTQ files for the forward reads. All FASTQ file names must\
\ start with the prefix '{well_id}_R1', where\n'well_id' can be found as the\
\ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\
For each FASTQ file, a matching FASTQ file for the reverse reads must be provided\
\ to the 'input_r2' argument,\nmeaning that their 'well_id' prefix must match.\
\ The number of items provided for 'input_r1' must be equal\nto the number of\
\ items for 'input_r2'.\n"
info: null
must_exist: true
create_parent: true
@@ -41,6 +48,13 @@ argument_groups:
multiple_sep: ";"
- type: "file"
name: "--input_r2"
description: "Input FASTQ files for the reverse reads. All FASTQ file names must\
\ start with the prefix '{well_id}_R2', where\n'well_id' can be found as the\
\ sequence identifier in the barcodes FASTA file (see 'barcodesFasta' argument).\n\
For each FASTQ file, a matching FASTQ file for the forward reads must be provided\
\ to the 'input_r1' argument,\nmeaning that their 'well_id' prefix must match.\
\ The number of items provided for 'input_r1' must be equal\nto the number of\
\ items for 'input_r2'.\n"
info: null
must_exist: true
create_parent: true
@@ -50,7 +64,8 @@ argument_groups:
multiple_sep: ";"
- type: "file"
name: "--genomeDir"
description: "STAR reference directory"
description: "Reference genome to match to. Can be generated from genomic FASTA\
\ sequences and a genome annotation\nby using STAR with '--runMode genomeGenerate'.\n"
info: null
must_exist: true
create_parent: true
@@ -58,19 +73,25 @@ argument_groups:
direction: "input"
multiple: false
multiple_sep: ";"
- type: "string"
name: "--barcodes"
description: "The barcodes/wells to process"
- type: "file"
name: "--barcodesFasta"
description: "FASTA file where each entry specifies a unique barcode sequence\
\ present at the start of the forward input reads\n(input_r1). The IDs of each\
\ barcode (the start of the FASTA headers up until the first whitespace character)\
\ must\nmatch with the start of one input FASTQ pair.\n"
info: null
must_exist: true
create_parent: true
required: true
direction: "input"
multiple: true
multiple: false
multiple_sep: ";"
- name: "Barcode arguments"
arguments:
- type: "integer"
name: "--umiLength"
description: "The length of the UMIs"
description: "Length of the Unique Molecular Identifiers (UMI). The UMIs are expected\
\ to be located after the barcodes in the\nforward reads.\n"
info: null
required: true
direction: "input"
@@ -101,9 +122,10 @@ argument_groups:
arguments:
- type: "file"
name: "--output"
description: "Location of the output folders, 1 folder per barcode. The value\
\ used\nfor this argument must contain a '*', which will be replaced with the\n\
barcode to form the final output location for that barcode.\n"
description: "A list of output folders which are the result of using STAR to map\
\ each input FASTQ pair to the reference genome.\nThe order of the items\
\ does NOT match the order of the entries in the barcodes FASTA file or the\
\ input FASTQ pairs. \n"
info: null
default:
- "./*"
@@ -234,6 +256,7 @@ engines:
- "zlib1g-dev"
- "parallel"
- "file"
- "seqkit"
interactive: false
- type: "docker"
copy:
@@ -256,8 +279,8 @@ build_info:
output: "target/executable/parallel_map"
executable: "target/executable/parallel_map/parallel_map"
viash_version: "0.9.0"
git_commit: "8eb391e3ebd9ffe573b76b9be635a81ec495c3bb"
git_remote: "https://x-access-token:ghs_ybw8wXfG5bR3FsTslxlh1sXKguvM9R0Rd31p@github.com/viash-hub/htrnaseq"
git_commit: "900f5ed35659137aa5c62d183232e0f33de97873"
git_remote: "https://github.com/viash-hub/htrnaseq"
package_config:
name: "htrnaseq"
version: "main"

View File

@@ -184,22 +184,48 @@ function ViashHelp {
echo "Input arguments:"
echo " --input_r1"
echo " type: file, required parameter, multiple values allowed, file must exist"
echo " Input FASTQ files for the forward reads. All FASTQ file names must start"
echo " with the prefix '{well_id}_R1', where"
echo " 'well_id' can be found as the sequence identifier in the barcodes FASTA"
echo " file (see 'barcodesFasta' argument)."
echo " For each FASTQ file, a matching FASTQ file for the reverse reads must be"
echo " provided to the 'input_r2' argument,"
echo " meaning that their 'well_id' prefix must match. The number of items"
echo " provided for 'input_r1' must be equal"
echo " to the number of items for 'input_r2'."
echo ""
echo " --input_r2"
echo " type: file, required parameter, multiple values allowed, file must exist"
echo " Input FASTQ files for the reverse reads. All FASTQ file names must start"
echo " with the prefix '{well_id}_R2', where"
echo " 'well_id' can be found as the sequence identifier in the barcodes FASTA"
echo " file (see 'barcodesFasta' argument)."
echo " For each FASTQ file, a matching FASTQ file for the reverse reads must be"
echo " provided to the 'input_r1' argument,"
echo " meaning that their 'well_id' prefix must match. The number of items"
echo " provided for 'input_r1' must be equal"
echo " to the number of items for 'input_r2'."
echo ""
echo " --genomeDir"
echo " type: file, required parameter, file must exist"
echo " STAR reference directory"
echo " Reference genome to match to. Can be generated from genomic FASTA"
echo " sequences and a genome annotation"
echo " by using STAR with '--runMode genomeGenerate'."
echo ""
echo " --barcodes"
echo " type: string, required parameter, multiple values allowed"
echo " The barcodes/wells to process"
echo " --barcodesFasta"
echo " type: file, required parameter, file must exist"
echo " FASTA file where each entry specifies a unique barcode sequence present"
echo " at the start of the forward input reads"
echo " (input_r1). The IDs of each barcode (the start of the FASTA headers up"
echo " until the first whitespace character) must"
echo " match with the start of one input FASTQ pair."
echo ""
echo "Barcode arguments:"
echo " --umiLength"
echo " type: integer, required parameter"
echo " The length of the UMIs"
echo " Length of the Unique Molecular Identifiers (UMI). The UMI are expected"
echo " to be located after the barcodes in the"
echo " forwards reads."
echo ""
echo " --limitBAMsortRAM"
echo " type: string"
@@ -216,9 +242,10 @@ function ViashHelp {
echo " type: file, required parameter, multiple values allowed, output, file"
echo "must exist"
echo " default: ./*"
echo " Location of the output folders, 1 folder per barcode. The value used"
echo " for this argument must contain a '*', which will be replaced with the"
echo " barcode to form the final output location for that barcode."
echo " A list of output folders which are the result of using STAR to map each"
echo " input FASTQ pair STAR to the reference genome."
echo " The order of the items DO NOT match with the order of the entries in the"
echo " barcodes FASTA file or the input FASTQ pairs."
echo ""
echo " --joblog"
echo " type: file, output, file must exist"
@@ -502,7 +529,7 @@ function ViashDockerfile {
FROM debian:stable-slim
ENTRYPOINT []
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y procps wget automake make gcc g++ zlib1g-dev parallel file && \
DEBIAN_FRONTEND=noninteractive apt-get install -y procps wget automake make gcc g++ zlib1g-dev parallel file seqkit && \
rm -rf /var/lib/apt/lists/*
ARG STAR_V
@@ -513,9 +540,9 @@ ENV STAR_BINARY=STAR
COPY STAR /usr/local/bin/$STAR_BINARY
LABEL org.opencontainers.image.authors="Dries Schaumont, Toni Verbeiren"
LABEL org.opencontainers.image.description="Companion container for running component parallel_map"
LABEL org.opencontainers.image.created="2025-01-17T13:20:50Z"
LABEL org.opencontainers.image.created="2025-02-13T15:18:39Z"
LABEL org.opencontainers.image.source="https://github.com/viash-hub/htrnaseq"
LABEL org.opencontainers.image.revision="8eb391e3ebd9ffe573b76b9be635a81ec495c3bb"
LABEL org.opencontainers.image.revision="900f5ed35659137aa5c62d183232e0f33de97873"
LABEL org.opencontainers.image.version="main"
VIASHDOCKER
@@ -700,21 +727,15 @@ while [[ $# -gt 0 ]]; do
VIASH_PAR_GENOMEDIR=$(ViashRemoveFlags "$1")
shift 1
;;
--barcodes)
if [ -z "$VIASH_PAR_BARCODES" ]; then
VIASH_PAR_BARCODES="$2"
else
VIASH_PAR_BARCODES="$VIASH_PAR_BARCODES;""$2"
fi
[ $# -lt 2 ] && ViashError Not enough arguments passed to --barcodes. Use "--help" to get more information on the parameters. && exit 1
--barcodesFasta)
[ -n "$VIASH_PAR_BARCODESFASTA" ] && ViashError Bad arguments for option \'--barcodesFasta\': \'$VIASH_PAR_BARCODESFASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1
VIASH_PAR_BARCODESFASTA="$2"
[ $# -lt 2 ] && ViashError Not enough arguments passed to --barcodesFasta. Use "--help" to get more information on the parameters. && exit 1
shift 2
;;
--barcodes=*)
if [ -z "$VIASH_PAR_BARCODES" ]; then
VIASH_PAR_BARCODES=$(ViashRemoveFlags "$1")
else
VIASH_PAR_BARCODES="$VIASH_PAR_BARCODES;"$(ViashRemoveFlags "$1")
fi
--barcodesFasta=*)
[ -n "$VIASH_PAR_BARCODESFASTA" ] && ViashError Bad arguments for option \'--barcodesFasta=*\': \'$VIASH_PAR_BARCODESFASTA\' \& \'$2\' - you should provide exactly one argument for this option. && exit 1
VIASH_PAR_BARCODESFASTA=$(ViashRemoveFlags "$1")
shift 1
;;
--umiLength)
@@ -956,8 +977,8 @@ if [ -z ${VIASH_PAR_GENOMEDIR+x} ]; then
ViashError '--genomeDir' is a required argument. Use "--help" to get more information on the parameters.
exit 1
fi
if [ -z ${VIASH_PAR_BARCODES+x} ]; then
ViashError '--barcodes' is a required argument. Use "--help" to get more information on the parameters.
if [ -z ${VIASH_PAR_BARCODESFASTA+x} ]; then
ViashError '--barcodesFasta' is a required argument. Use "--help" to get more information on the parameters.
exit 1
fi
if [ -z ${VIASH_PAR_UMILENGTH+x} ]; then
@@ -1033,6 +1054,10 @@ if [ ! -z "$VIASH_PAR_GENOMEDIR" ] && [ ! -e "$VIASH_PAR_GENOMEDIR" ]; then
ViashError "Input file '$VIASH_PAR_GENOMEDIR' does not exist."
exit 1
fi
# Fail early when a barcodes FASTA path was supplied but nothing exists there.
if [ -n "$VIASH_PAR_BARCODESFASTA" ] && ! [ -e "$VIASH_PAR_BARCODESFASTA" ]; then
  ViashError "Input file '$VIASH_PAR_BARCODESFASTA' does not exist."
  exit 1
fi
# check whether parameters values are of the right type
if [[ -n "$VIASH_PAR_UMILENGTH" ]]; then
@@ -1172,6 +1197,10 @@ if [ ! -z "$VIASH_PAR_GENOMEDIR" ]; then
VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_GENOMEDIR")" )
VIASH_PAR_GENOMEDIR=$(ViashDockerAutodetectMount "$VIASH_PAR_GENOMEDIR")
fi
# Register a mount for the barcodes FASTA and replace the stored path with the
# value returned by ViashDockerAutodetectMount (presumably the path as seen
# inside the container; the helper is defined outside this view).
if [ -n "$VIASH_PAR_BARCODESFASTA" ]; then
  VIASH_DIRECTORY_MOUNTS+=( "$(ViashDockerAutodetectMountArg "$VIASH_PAR_BARCODESFASTA")" )
  # Reassign only after the mount argument above was derived from the original path.
  VIASH_PAR_BARCODESFASTA=$(ViashDockerAutodetectMount "$VIASH_PAR_BARCODESFASTA")
fi
if [ ! -z "$VIASH_PAR_OUTPUT" ]; then
VIASH_TEST_OUTPUT=()
IFS=';'
@@ -1263,7 +1292,7 @@ cat > "\$tempscript" << 'VIASHMAIN'
$( if [ ! -z ${VIASH_PAR_INPUT_R1+x} ]; then echo "${VIASH_PAR_INPUT_R1}" | sed "s#'#'\"'\"'#g;s#.*#par_input_r1='&'#" ; else echo "# par_input_r1="; fi )
$( if [ ! -z ${VIASH_PAR_INPUT_R2+x} ]; then echo "${VIASH_PAR_INPUT_R2}" | sed "s#'#'\"'\"'#g;s#.*#par_input_r2='&'#" ; else echo "# par_input_r2="; fi )
$( if [ ! -z ${VIASH_PAR_GENOMEDIR+x} ]; then echo "${VIASH_PAR_GENOMEDIR}" | sed "s#'#'\"'\"'#g;s#.*#par_genomeDir='&'#" ; else echo "# par_genomeDir="; fi )
$( if [ ! -z ${VIASH_PAR_BARCODES+x} ]; then echo "${VIASH_PAR_BARCODES}" | sed "s#'#'\"'\"'#g;s#.*#par_barcodes='&'#" ; else echo "# par_barcodes="; fi )
$( if [ ! -z ${VIASH_PAR_BARCODESFASTA+x} ]; then echo "${VIASH_PAR_BARCODESFASTA}" | sed "s#'#'\"'\"'#g;s#.*#par_barcodesFasta='&'#" ; else echo "# par_barcodesFasta="; fi )
$( if [ ! -z ${VIASH_PAR_UMILENGTH+x} ]; then echo "${VIASH_PAR_UMILENGTH}" | sed "s#'#'\"'\"'#g;s#.*#par_umiLength='&'#" ; else echo "# par_umiLength="; fi )
$( if [ ! -z ${VIASH_PAR_LIMITBAMSORTRAM+x} ]; then echo "${VIASH_PAR_LIMITBAMSORTRAM}" | sed "s#'#'\"'\"'#g;s#.*#par_limitBAMsortRAM='&'#" ; else echo "# par_limitBAMsortRAM="; fi )
$( if [ ! -z ${VIASH_PAR_RUNTHREADN+x} ]; then echo "${VIASH_PAR_RUNTHREADN}" | sed "s#'#'\"'\"'#g;s#.*#par_runThreadN='&'#" ; else echo "# par_runThreadN="; fi )
@@ -1303,25 +1332,14 @@ else
fi
# Split the delimited strings into arrays
IFS=';' read -r -a barcodes <<< "\$par_barcodes"
IFS=';' read -r -a input_r1 <<< "\$par_input_r1"
IFS=';' read -r -a input_r2 <<< "\$par_input_r2"
# Check that the number of values provided for the barcodes and the fastq files are the same.
num_barcodes="\${#barcodes[@]}"
num_r1_inputs="\${#input_r1[@]}"
num_r2_inputs="\${#input_r2[@]}"
if [ ! "\$num_barcodes" -eq "\$num_r1_inputs" ] || [ ! "\$num_r1_inputs" -eq "\$num_r2_inputs" ]; then
echo "The number of values for arguments 'barcodes' (\$num_barcodes), "\\
"'input_r1' (\$num_r1_inputs) and 'input_r2' (\$num_r2_inputs) "\\
"should be the same, and their order should match."
exit 1
else
echo "Checked if length of barcodes input (\$num_barcodes) is "\\
"the same as R1 reads (\$num_r1_inputs) and R2 reads "\\
"(\$num_r2_inputs). Seems OK!"
fi
# Read the barcodes FASTA file.
# 'seqkit seq --name' on its own prints the complete FASTA header (ID plus
# description). Adding '--only-id' restricts the output to the sequence ID,
# i.e. the header up to the first whitespace character, which is what the
# 'barcodesFasta' argument documents as the well ID.
# Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header.
readarray -t well_ids < <(seqkit seq --name --only-id "\$par_barcodesFasta" )
# Barcode sequences, in the same order as well_ids: upper-cased, validated and
# with every '^' removed ('^' is declared as the gap letter here).
readarray -t barcodes < <(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "\$par_barcodesFasta")
# Function to test for unique values in array
function arrayContainsUniqueValues {
@@ -1348,6 +1366,66 @@ if ! (exit \$is_array_unique_exit_code); then
exit 1
fi
# Check that the same number of forward (R1) and reverse (R2) FASTQ files were
# provided. Exits with status 1 when the two counts differ.
num_r1_inputs="\${#input_r1[@]}"
num_r2_inputs="\${#input_r2[@]}"
if [ ! "\$num_r1_inputs" -eq "\$num_r2_inputs" ]; then
  echo "The number of values for arguments "\\
    "'input_r1' (\$num_r1_inputs) and 'input_r2' (\$num_r2_inputs) "\\
    "should be the same."
  exit 1
else
  echo "Checked that the same number of R1 FASTQ (\$num_r1_inputs) and R2 FASTQ files "\\
    "(\$num_r2_inputs) were provided. Seems OK!"
fi
# Loop over the well IDs and match them to the input FASTQ files.
# The FASTQ file names should have the format {well_id}_R(1|2).fastq,
# which is the output format that the cutadapt component uses for demultiplexing.
# sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order
# of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file).
# The script exits with status 1 when a well has no matching FASTQ pair, or when
# the R1 file of a pair carries the expected prefix but its R2 partner does not.
declare -a sorted_input_r1=()
declare -a sorted_input_r2=()
for barcode_index in "\${!barcodes[@]}"; do
  barcode="\${barcodes[\$barcode_index]}"
  well_id="\${well_ids[\$barcode_index]}"
  echo "Finding FASTQ files for barcode \${barcode}, well ID '\${well_id}'."
  # The FASTQ files for a particular barcode must match the following regex.
  # This variable is only used for reporting. The actual tests below quote the
  # well ID so that characters such as '.' or '+' in an ID are matched literally
  # instead of being interpreted as regex operators.
  input_file_regex="^\${well_id}_R[1-2]"
  for r1_index in "\${!input_r1[@]}"; do
    # R1 and R2 files are paired by their position in the input lists.
    r1_file_path="\${input_r1[\$r1_index]}"
    r2_file_path="\${input_r2[\$r1_index]}"
    # Get the file names from the full path
    r1_file_name=\$(basename -- "\$r1_file_path")
    r2_file_name=\$(basename -- "\$r2_file_path")
    # Check if the R1 file name starts with the literal well ID followed by _R1 or _R2
    if [[ "\$r1_file_name" =~ ^"\${well_id}"_R[1-2] ]]; then
      echo "Matched with \$r1_file_name and \$r2_file_name."
      # If the R1 FASTQ file matched the regex,
      # the R2 file must have also been matched
      if ! [[ "\$r2_file_name" =~ ^"\${well_id}"_R[1-2] ]]; then
        echo "File \${r1_file_name} matched with regex \${input_file_regex} "\\
          "but \${r2_file_name} did not! Make sure that the order of "\\
          "the R1 and R2 input files match."
        exit 1
      fi
      # Add the matched pair to the sorted lists, keeping R1 and R2 aligned
      sorted_input_r1+=("\$r1_file_path")
      sorted_input_r2+=("\$r2_file_path")
      # Do not continue looking for more files for this barcode
      # '2' to affect the *outer* loop (which indeed loops barcodes)!
      continue 2
    fi
  done
  # Only reached when no input pair matched the current well ID.
  echo "Did not find FASTQ files for well \${well_id}! "\\
    "Make sure that the input files have the correct file name format."
  exit 1
done
# Define the function that will be used to run a single job
function _run() {
local par_UMIlength="\$1"
@@ -1517,7 +1595,7 @@ parallel_cmd+=(":::" "\$par_umiLength" ":::" "\$par_output" ":::" "\$par_genomeD
# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected
# Thus, these argument lists should have the same length.
parallel_cmd+=(":::" "\${barcodes[@]}" ":::+" "\${input_r1[@]}" ":::+" "\${input_r2[@]}")
parallel_cmd+=(":::" "\${barcodes[@]}" ":::+" "\${sorted_input_r1[@]}" ":::+" "\${sorted_input_r2[@]}")
set +eo pipefail
"\${parallel_cmd[@]}"
@@ -1601,6 +1679,9 @@ if [[ "$VIASH_ENGINE_TYPE" == "docker" ]]; then
if [ ! -z "$VIASH_PAR_GENOMEDIR" ]; then
VIASH_PAR_GENOMEDIR=$(ViashDockerStripAutomount "$VIASH_PAR_GENOMEDIR")
fi
# Pass the barcodes FASTA path through ViashDockerStripAutomount (presumably
# removing the automount prefix again; the helper is defined outside this view).
if [ -n "$VIASH_PAR_BARCODESFASTA" ]; then
  VIASH_PAR_BARCODESFASTA=$(ViashDockerStripAutomount "$VIASH_PAR_BARCODESFASTA")
fi
if [ ! -z "$VIASH_PAR_OUTPUT" ]; then
VIASH_PAR_OUTPUT=$(ViashDockerStripAutomount "$VIASH_PAR_OUTPUT")
fi