Files
htrnaseq/src/parallel_map/script.sh
CI 991c615670 Build branch htrnaseq/update_craftbox with version updatecraftbox to htrnaseq on branch updatecraftbox (e6da525)
Build pipeline: viash-hub.htrnaseq.updatecraftbox-zzrhd

Source commit: e6da525fc5

Source message: Merge branch 'main' into update_craftbox
2025-09-02 14:31:15 +00:00

343 lines
12 KiB
Bash
Executable File

#!/bin/bash
## VIASH START
par_input_r1="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r1_1/two__ACACCGAATT.concat_text_r1.output.txt"
par_input_r2="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r2_1/two__ACACCGAATT.concat_text_r2.output.txt"
par_barcodes="ACACCGAATT;GGCTATTGAT"
par_output="./*"
par_genomeDir="star"
par_umiLength=10
par_limitBAMsortRAM="10000000000"
meta_cpus=2
par_runThreadN=1
## VIASH END

# Abort on the first failing command and on failures inside pipelines.
set -eo pipefail

# Check if wildcard character is present in output folder template.
# The template is passed as a printf *argument* (not as part of the format
# string) so that a '%' or '\' in the path cannot garble the message.
printf "Checking if output folder template (%s) contains a single wildcard character '*'. " "$par_output"
# Remove every character that is not '*'; what remains is one '*' per wildcard.
output_glob_character="${par_output//[^\*]}"
if [[ "${#output_glob_character}" -ne 1 ]]; then
  echo "The value for --output must contain exactly one '*' character. Exiting..."
  exit 1
else
  echo "Done, wildcard character found!"
fi
# Split the ';'-delimited strings into arrays
IFS=';' read -r -a input_r1 <<< "$par_input_r1"
IFS=';' read -r -a input_r2 <<< "$par_input_r2"

# Read the barcodes FASTA file.
# '--only-id' makes seqkit print only the sequence identifier (the leading
# non-whitespace part of the header) instead of the full header.
# Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header.
#
# NOTE: seqkit is run in a command substitution and not directly in a process
# substitution ('< <(seqkit ...)'): the exit status of a process substitution
# is ignored by 'set -e', so an unreadable or invalid FASTA file would
# otherwise silently result in empty arrays.
if ! well_ids_raw="$(seqkit seq --name --only-id "$par_barcodesFasta")"; then
  echo "Could not read the well IDs from the barcodes FASTA file '$par_barcodesFasta'."
  exit 1
fi
if ! barcodes_raw="$(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "$par_barcodesFasta")"; then
  echo "Could not read the barcode sequences from the barcodes FASTA file '$par_barcodesFasta'."
  exit 1
fi
# printf '%s' (without a trailing newline) yields zero elements for empty output.
readarray -t well_ids < <(printf '%s' "$well_ids_raw")
readarray -t barcodes < <(printf '%s' "$barcodes_raw")

# Every barcode must have a well ID, and there must be at least one barcode.
if (( ${#barcodes[@]} == 0 )) || (( ${#barcodes[@]} != ${#well_ids[@]} )); then
  echo "Expected at least one barcode and exactly one well ID per barcode in '$par_barcodesFasta'," \
    "but found ${#barcodes[@]} barcode(s) and ${#well_ids[@]} well ID(s)."
  exit 1
fi
# Function to test for unique values in array
# Arguments:
#   $1 - NAME of the array variable to inspect (passed by reference)
# Returns:
#   0 when all values are distinct (an empty array counts as unique),
#   1 when at least one value occurs more than once.
function arrayContainsUniqueValues {
  # Pass the argument by reference. The local names carry a prefix to make
  # a clash with the name of the caller's array unlikely.
  local -n _acuv_values=$1
  # Temporary associative array: its keys are unique by construction.
  local -A _acuv_seen=()
  local _acuv_item
  for _acuv_item in "${_acuv_values[@]}"; do
    # The 'x' prefix keeps the subscript non-empty: bash rejects an empty
    # associative array key, which would break on empty values.
    _acuv_seen["x${_acuv_item}"]=1
  done
  # Compare the counts directly. Expanding the keys into a new array without
  # quotes would split keys on whitespace and expand glob characters.
  if (( ${#_acuv_seen[@]} == ${#_acuv_values[@]} )); then
    return 0
  fi
  return 1
}
# Capture the status with '||': with 'set -e' active, a bare call that returns
# non-zero would abort the script right away, before the explanation below
# could be printed.
is_array_unique_exit_code=0
arrayContainsUniqueValues barcodes || is_array_unique_exit_code=$?
if (( is_array_unique_exit_code != 0 )); then
  echo "The provided barcodes should be unique!"
  # Report the values that were actually checked.
  echo "Values: ${barcodes[*]}"
  exit 1
fi
# R1 and R2 files come in pairs, so both arguments must hold
# the same number of values.
num_r1_inputs=${#input_r1[@]}
num_r2_inputs=${#input_r2[@]}
if (( num_r1_inputs == num_r2_inputs )); then
  echo "Checked if the same as the number of R1 FASTQ ($num_r1_inputs) and R2 FASTQ files ($num_r2_inputs) were provided. Seems OK!"
else
  echo "The number of values for arguments 'input_r1' ($num_r1_inputs) and 'input_r2' ($num_r2_inputs) should be the same."
  exit 1
fi
# Loop over the well IDs and match them to the input FASTQ files
# The FASTQ file names should have the format {well_id}_R(1|2).fastq,
# which is the output format that the cutadapt component uses for demultiplexing.
# sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order
# of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file).
declare -a sorted_input_r1=()
declare -a sorted_input_r2=()
for barcode_index in "${!barcodes[@]}"; do
  barcode="${barcodes[$barcode_index]}"
  well_id="${well_ids[$barcode_index]}"
  echo "Finding FASTQ files for barcode ${barcode}, well ID '${well_id}'."
  # The file names for a particular barcode must start with '{well_id}_R1' or
  # '{well_id}_R2'. The well ID is matched literally (quoted part of the
  # pattern below), so characters such as '.' or '+' in a well ID are not
  # interpreted as pattern/regex operators.
  input_file_prefix="${well_id}_R"
  for r1_index in "${!input_r1[@]}"; do
    r1_file_path="${input_r1[$r1_index]}"
    r2_file_path="${input_r2[$r1_index]}"
    # Get the file names from the full path
    r1_file_name=$(basename -- "$r1_file_path")
    r2_file_name=$(basename -- "$r2_file_path")
    # Check if the R1 file name matches the expected pattern
    if [[ "$r1_file_name" == "$input_file_prefix"[12]* ]]; then
      echo "Matched with $r1_file_name and $r2_file_name."
      # If the R1 FASTQ file matched the pattern,
      # the R2 file at the same position must match as well
      if [[ "$r2_file_name" != "$input_file_prefix"[12]* ]]; then
        echo "File ${r1_file_name} matched with pattern '${input_file_prefix}[1-2]*'" \
          "but ${r2_file_name} did not! Make sure that the order of" \
          "the R1 and R2 input files match."
        exit 1
      fi
      # Add the pair of files at the position of this barcode
      sorted_input_r1+=("$r1_file_path")
      sorted_input_r2+=("$r2_file_path")
      # Do not continue looking for more files for this barcode
      # '2' to affect the *outer* loop (which indeed loops barcodes)!
      continue 2
    fi
  done
  # Only reached when none of the input files matched this well.
  echo "Did not find FASTQ files for well ${well_id}!" \
    "Make sure that the input files have the correct file name format." \
    "Input files: ${input_r1[*]}"
  exit 1
done
# Define the function that will be used to run a single job.
# It maps the reads of one barcode (well) with STAR and verifies that STAR
# processed every input read.
#
# NOTE: the helper functions are defined *inside* _run on purpose: only _run
# itself is exported, so everything it needs must be defined when it runs.
# For the same reason all settings are passed as positional arguments.
#
# Globals:
#   meta_temp_dir (read, optional) - parent directory for the temporary files
#     of the job. Falls back to $TMPDIR and then /tmp when it is unset or
#     empty (e.g. because it was not exported to the shell running the job).
# Arguments:
#   $1 - UMI length
#   $2 - output directory template; every '*' is replaced by the barcode
#   $3 - STAR genome directory
#   $4 - value for STAR's --limitBAMsortRAM
#   $5 - value for STAR's --runThreadN
#   $6 - barcode sequence
#   $7 - R1 FASTQ file (contains the barcode and UMI), may be gzipped
#   $8 - R2 FASTQ file, may be gzipped
# Returns:
#   0 on success; non-zero when preparing the inputs fails, when STAR fails
#   (STAR's exit code) or when the number of processed reads is unexpected.
function _run() {
  local par_UMIlength="$1"
  local par_output="$2"
  local par_genomeDir="$3"
  local par_limitBAMsortRAM="$4"
  local par_runThreadN="$5"
  local barcode="$6"
  local input_R1="$7"
  local input_R2="$8"
  local barcode_length="${#barcode}"
  # The UMI directly follows the barcode (1-based position).
  local umi_start="$((barcode_length + 1))"

  set -eo pipefail

  printf "Processing barcode '%s'\nFor the following inputs:\n  R1: %s\n  R2: %s\n" \
    "$barcode" "$input_R1" "$input_R2"

  echo "Writing barcode '$barcode' to $barcode.txt and using it as input".
  # Note that there is no possible conflict between jobs here
  # because the barcodes are unique (and the barcode is part of the name
  # of the file).
  printf '%s\n' "$barcode" > "$barcode.txt"

  local dir="${par_output//\*/$barcode}/"
  echo "Setting output for barcode '$barcode' to '$dir'."
  mkdir -p "$dir"

  # Temporary directory for this job; removed again when the function returns.
  local tmp_root="${meta_temp_dir:-${TMPDIR:-/tmp}}"
  local job_tmp_dir
  job_tmp_dir=$(mktemp -d "$tmp_root/parallel_map-$barcode-XXXXXX") || return 1
  function clean_up {
    # 'if' instead of '[[ ... ]] && rm': the latter makes the function
    # return non-zero when the directory is already gone.
    if [[ -n "$job_tmp_dir" && -d "$job_tmp_dir" ]]; then
      rm -r -- "$job_tmp_dir"
    fi
  }
  trap clean_up RETURN

  # Decompress the input files when needed
  # NOTE: for some reason, using STAR's --readFilesCommand does not always work
  # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html)
  # and this requires a filesystem that supports this. Another cause might be that the input files
  # are symlinks. When testing this, using '--readFilesCommand "zcat"'
  # always produced empty BAM files, but also a successful exit code (0) so the problem is not reported.
  # However, the logs showed the following error: "gzip -: unexpected end of file".
  function is_gzipped {
    printf "Checking if input '%s' (barcode '%s') is gzipped... " "$1" "$barcode"
    # --brief: do not print the file name, otherwise a path that contains
    # the text 'gzip' would be detected as a compressed file.
    if file --brief -- "$1" | grep -q 'gzip'; then
      echo "Done, detected compressed file."
      return 0
    fi
    echo "Done, file does not need decompression."
    return 1
  }

  # Resolve symbolic links to actual file paths
  input_R1=$(realpath -- "$input_R1") || return 1
  input_R2=$(realpath -- "$input_R2") || return 1

  local uncompressed_file_r1 uncompressed_file_r2
  local compressed_file_name_r1 compressed_file_name_r2
  if is_gzipped "$input_R1"; then
    compressed_file_name_r1="$(basename -- "$input_R1")"
    uncompressed_file_r1="$job_tmp_dir/${compressed_file_name_r1%.gz}"
    printf "Unpacking input to %s... " "$uncompressed_file_r1"
    zcat "$input_R1" > "$uncompressed_file_r1" || return 1
    echo "Decompression done."
  else
    uncompressed_file_r1="$input_R1"
  fi
  if is_gzipped "$input_R2"; then
    compressed_file_name_r2="$(basename -- "$input_R2")"
    uncompressed_file_r2="$job_tmp_dir/${compressed_file_name_r2%.gz}"
    printf "Unpacking input to %s... " "$uncompressed_file_r2"
    zcat "$input_R2" > "$uncompressed_file_r2" || return 1
    echo "Decompression done."
  else
    uncompressed_file_r2="$input_R2"
  fi

  # Declaration and assignment are split: 'local var=$(cmd)' hides a failure of cmd.
  local n_input_lines_r1 n_input_lines_r2
  n_input_lines_r1=$(wc -l < "$uncompressed_file_r1") || return 1
  n_input_lines_r2=$(wc -l < "$uncompressed_file_r2") || return 1
  printf "Checking if length of input file mates match. "
  if (( n_input_lines_r1 != n_input_lines_r2 )); then
    echo "The length of file $input_R1 ($n_input_lines_r1) does not match with $input_R2 ($n_input_lines_r2)"
    return 1
  else
    echo "Seems OK, $n_input_lines_r1 input lines."
  fi

  echo "Starting STAR for barcode '$barcode'"
  # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length.
  # By default in this mode, STAR will look for the cell barcode and the UMI in the last file specified with --readFilesIn
  # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI.
  # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires clipping
  # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do.
  #
  # The exit code is captured (instead of relying on 'set -e') so that the
  # function returns and the temporary directory is cleaned up on failure.
  local star_exit_code=0
  STAR \
    --readFilesIn "$uncompressed_file_r2" "$uncompressed_file_r1" \
    --soloType Droplet \
    --quantMode GeneCounts \
    --genomeLoad LoadAndKeep \
    --limitBAMsortRAM "$par_limitBAMsortRAM" \
    --runThreadN "$par_runThreadN" \
    --outFilterMultimapNmax 1 \
    --outSAMtype BAM SortedByCoordinate \
    --soloCBstart 1 \
    --readFilesType "Fastx" \
    --soloCBlen "$barcode_length" \
    --soloUMIstart "$umi_start" \
    --soloUMIlen "$par_UMIlength" \
    --soloBarcodeReadLength 0 \
    --soloStrand Unstranded \
    --soloFeatures Gene \
    --genomeDir "$par_genomeDir" \
    --outReadsUnmapped Fastx \
    --outSAMunmapped Within \
    --outSAMattributes NH HI nM AS CR UR CB UB GX GN \
    --soloCBwhitelist "$barcode.txt" \
    --outFileNamePrefix "$dir" \
    --outTmpDir "$job_tmp_dir/STARtemp/" \
    || star_exit_code=$?
  if (( star_exit_code != 0 )); then
    echo "STAR failed for barcode '$barcode' (exit code $star_exit_code)."
    return "$star_exit_code"
  fi
  printf "Done running STAR. "

  # Check if the number of processed reads is equal to the number of input reads
  # (a FASTQ record consists of 4 lines).
  local n_input_reads=$((n_input_lines_r1 / 4))
  local nr_output_reads
  nr_output_reads=$(
    awk -F'|' '/Number of input reads/ { gsub(/[^0-9]/, "", $2); print $2; exit }' \
      "${dir}Log.final.out"
  ) || nr_output_reads=""
  # Without this check, an empty value would make the arithmetic comparison
  # below a syntax error, which 'if' treats as "reads match".
  if [[ ! "$nr_output_reads" =~ ^[0-9]+$ ]]; then
    echo "Could not determine the number of reads processed by STAR from '${dir}Log.final.out'."
    return 1
  fi
  if (( 10#$nr_output_reads != n_input_reads )); then
    echo "Not all input reads were processed for barcode $barcode."
    return 1
  else
    echo "Processed $nr_output_reads reads for barcode $barcode".
  fi

  printf "Making sure that the output has the proper permissions."
  find "$dir" -type d -exec chmod o+x {} +
  chmod -R o+r "$dir"
  echo "Done"
}
# Export the function - requires bash
export -f _run
# _run reads 'meta_temp_dir' to create its temporary directory. The settings
# are passed to _run as arguments because the jobs do not see the variables of
# this script; 'meta_temp_dir' is not an argument, so it must be exported.
export meta_temp_dir

# Load reference genome
echo "Loading reference genome"
STAR --genomeLoad LoadAndExit --genomeDir "$par_genomeDir"

# Run the concurrent jobs using GNU parallel
# Make sure that parallel uses the correct shell
export PARALLEL_SHELL="/bin/bash"
# Some notes:
# --halt soon,fail=1: when a job fails, instruct parallel to not start new jobs;
# the jobs that are already running are allowed to finish.
#
# ::: is a special syntax for GNU parallel to delineate inputs
# If multiple ::: are given, each group will be treated as an input source, and all combinations of input
# sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c)
# The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input
# sources will be read.
parallel_cmd=(
  "parallel" "--jobs" "80%" "--verbose" "--memfree" "2G"
  "--tmpdir" "$meta_temp_dir"
  "--retry-failed" "--retries" "4" "--halt" "soon,fail=1"
  "--joblog" "$par_joblog" "_run" "{}"
)
# Arguments for which there is one value, so these will not create extra jobs
parallel_cmd+=(
  ":::" "$par_umiLength"
  ":::" "$par_output"
  ":::" "$par_genomeDir"
  ":::" "$par_limitBAMsortRAM"
  ":::" "$par_runThreadN"
)
# Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected
# Thus, these argument lists should have the same length.
parallel_cmd+=(":::" "${barcodes[@]}" ":::+" "${sorted_input_r1[@]}" ":::+" "${sorted_input_r2[@]}")

# Do not exit when a job fails: the reference genome must still be unloaded
# and the exit code is inspected afterwards.
set +eo pipefail
"${parallel_cmd[@]}"
exit_code=$?
set -eo pipefail
echo "GNU parallel finished!"

# Unload reference
printf "Unloading reference genome. "
STAR --genomeLoad Remove --genomeDir "$par_genomeDir"
echo "Done!"
# Exit code from GNU parallel:
# If fail=1 is used, the exit status will be the exit status of the failing job.
echo "Checking exit code"
# The report is built with printf: unlike '<<-' here-documents it does not
# depend on TAB indentation, and it always ends with a newline.
separator="=================================================================="
if (( exit_code > 0 )); then
  # Include the job log in the report when it is available.
  joblog_contents=""
  if [[ -n "$par_joblog" && -r "$par_joblog" ]]; then
    joblog_contents="$(<"$par_joblog")"
  fi
  printf '%s\n' \
    "$separator" \
    "!!! An error occurred for one of the jobs." \
    "Exit code of the failing job: $exit_code." \
    "$joblog_contents" \
    "$separator"
  exit 1
else
  printf '%s\n' \
    "$separator" \
    "Mapping went fine (exit code '$exit_code'), zero errors occurred" \
    "$separator"
fi