#!/bin/bash ## VIASH START par_input_r1="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r1_1/two__ACACCGAATT.concat_text_r1.output.txt" par_input_r2="work/2c/5b8b3a2dd4a988b8838e3f72d38a37/_viash_par/input_r2_1/two__ACACCGAATT.concat_text_r2.output.txt" par_barcodes="ACACCGAATT;GGCTATTGAT" par_output="./*" par_genomeDir="star" par_umiLength=10 par_limitBAMsortRAM="10000000000" meta_cpus=2 par_runThreadN=1 ## VIASH END set -eo pipefail # Check if wildcard character is present in output folder template printf "Checking if output folder template ($par_output) contains a single wildcard character '*'. " output_glob_character="${par_output//[^\*]}" if [[ "${#output_glob_character}" -ne "1" ]]; then echo "The value for --output must contain exactly one '*' character. Exiting..." exit 1 else echo "Done, wildcard character found!" fi # Split the delimited strings into arrays IFS=';' read -r -a input_r1 <<< "$par_input_r1" IFS=';' read -r -a input_r2 <<< "$par_input_r2" # Read barcodes FASTQ # seqkit will make sure to take the leading non-whitespace as sequence identifier (ID) # Luckily, this is the same as how cutadapt determines an adapter name from the FASTA header. readarray -t well_ids < <(seqkit seq --name "$par_barcodesFasta" ) readarray -t barcodes < <(seqkit seq --seq --upper-case --remove-gaps --gap-letters '^' --validate-seq "$par_barcodesFasta") # Function to test for unique values in array function arrayContainsUniqueValues { # Pass the argument by reference local -n arr=$1 # Create a temporary associative array # in order to use its uniqueness of keys # 'declare' in a function is automatically local declare -A uniq_tmp for item in "${arr[@]}"; do uniq_tmp[$item]=0 # assigning a placeholder done local unique_array_values=(${!uniq_tmp[@]}) if [ "${#unique_array_values[@]}" -eq "${#arr[@]}" ]; then return fi false } arrayContainsUniqueValues barcodes is_array_unique_exit_code=$? if ! (exit $is_array_unique_exit_code); then echo "The provided barcodes should be unique!" echo "Values: $par_barcodes" exit 1 fi # Check that the number of values provided for the fastq files are the same. num_r1_inputs="${#input_r1[@]}" num_r2_inputs="${#input_r2[@]}" if [ ! "$num_r1_inputs" -eq "$num_r2_inputs" ]; then echo "The number of values for arguments "\ "'input_r1' ($num_r1_inputs) and 'input_r2' ($num_r2_inputs) "\ "should be the same." exit 1 else echo "Checked if the same as the number of R1 FASTQ ($num_r1_inputs) and R2 FASTQ files "\ "($num_r2_inputs) were provided. Seems OK!" fi # Loop over the well IDs and match them to the input FASTQ files # The FASTQ file names should have the format {well_id}_R(1|2).fastq, # which is the output format that the cutadapt component uses for demultiplexing. # sorted_input_r1 and sorted_input_r2 are the input FASTQ files sorted by the order # of the barcodes in the barcodes array (i.e. the order in the barcodes FASTA file). declare -a sorted_input_r1=() declare -a sorted_input_r2=() for barcode_index in "${!barcodes[@]}"; do barcode="${barcodes[$barcode_index]}" well_id="${well_ids[$barcode_index]}" echo "Finding FASTQ files for barcode ${barcode}, well ID '${well_id}'." # The FASTQ files for a particular barcode must match the following regex: input_file_regex="^${well_id}_R[1-2]" for r1_index in "${!input_r1[@]}"; do r1_file_path=${input_r1[$r1_index]} r2_file_path=${input_r2[$r1_index]} # Get the file names from the full path r1_file_name=$(basename -- "$r1_file_path") r2_file_name=$(basename -- "$r2_file_path") # Check if the file names match the regex if [[ $r1_file_name =~ $input_file_regex ]]; then echo "Matched with $r1_file_name and $r2_file_name." # If the R1 FASTQ file matched the regex, # the R2 file must have also been matched if ! [[ $r2_file_name =~ $input_file_regex ]]; then echo "File ${r1_file_name} matched with regex ${input_file_regex} "\ "but ${r2_file_name} did not! Make sure that the order of "\ "the R1 and R2 input files match." exit 1 fi # Add the sorted_input_r1+=("$r1_file_path") sorted_input_r2+=("$r2_file_path") # Do not continue looking for more files for this barcode # '2' to affect the *outer* loop (which indeed loops barcodes)! continue 2 fi done echo "Did not find FASTQ files files for well ${well_id}! "\ "Make sure that the input files have the correct file name format."\ "Input files: ${input_r1[@]}" exit 1 done # Define the function that will be used to run a single job function _run() { local par_UMIlength="$1" local par_output="$2" local par_genomeDir="$3" local par_limitBAMsortRAM="$4" local par_runThreadN="$5" local barcode="$6" local input_R1="$7" local input_R2="$8" local barcode_length="${#barcode}" local umi_start="$(($barcode_length + 1))" set -eo pipefail echo <<-EOF Processing $barcode For the following inputs (lanes): "$star_readFilesIn EOF echo "Writing barcode '$barcode' to $barcode.txt and using it as input". # Note that there is no possible conflict between jobs here # because the barcodes are unique (and the barcode is part of the name # of the file). echo "$barcode" > "$barcode.txt" local dir="${par_output//\*/$barcode}/" echo "Setting output for barcode '$barcode' to '$dir'." mkdir -p "$dir" # check if files are compressed local TMPDIR=$(mktemp -d "$meta_temp_dir/parallel_map-$barcode-XXXXXX") function clean_up { [[ -d "$TMPDIR" ]] && rm -r "$TMPDIR" } trap clean_up RETURN # Decompress the input files when needed # NOTE: for some reason, using STAR's --readFilesCommand does not always work # This might be because STAR creates fifo files (see https://man7.org/linux/man-pages/man7/fifo.7.html) # and this requires a filesystem that supports this. Another cause might be that the input files # are symlinks. When testing this, using '--readFilesCommand "zcat"' # always produced empty BAM files, but also a succesfull exit code (0) so the problem is not reported. # However, the logs showed the following error: "gzip -: unexpected end of file". function is_gzipped { printf "Checking if input '$1' (barcode '$barcode') is gzipped... " if file "$1" | grep -q 'gzip'; then echo "Done, detected compressed file." return fi echo "Done, file does not need decompression." false } # Resolve symbolic links to actual file paths input_R1=$(realpath $input_R1) input_R2=$(realpath $input_R2) if is_gzipped $input_R1; then local compressed_file_name_r1="$(basename -- $input_R1)" local uncompressed_file_r1="$TMPDIR/${compressed_file_name_r1%.gz}" printf "Unpacking input to $uncompressed_file_r1... " zcat "$input_R1" > "$uncompressed_file_r1" echo "Decompression done." else local uncompressed_file_r1="$input_R1" fi if is_gzipped $input_R2; then local compressed_file_name_r2="$(basename -- $input_R2)" local uncompressed_file_r2="$TMPDIR/${compressed_file_name_r2%.gz}" printf "Unpacking input to $uncompressed_file_r2... " zcat "$input_R2" > "$uncompressed_file_r2" echo "Decompression done." else local uncompressed_file_r2="$input_R2" fi local n_input_lines_r1=$(wc -l < "$uncompressed_file_r1") local n_input_lines_r2=$(wc -l < "$uncompressed_file_r2") printf "Checking if length of input file mates match. " if (( $n_input_lines_r1 != n_input_lines_r2 )); then echo "The length of file $input_R1 ($n_input_lines_r1) does not match with $input_R2 ($n_input_lines_r2)" return 1 else echo "Seems OK, $n_input_lines_r1 input lines." fi echo "Starting STAR for barcode '$barcode'" # soloType 'Droplet' is the same as 'CB_UMI_Simple': one UMI and one cell barcode of fixed length. # By default in this mode, STAR will look for the cell barcode and the UMI int the last files specified with --readFilesIn # So we need to specify R2 first and R1 second, because R1 contains the barcode and UMI. # Also, you might be tempted to use '--soloBarcodeMate 1' to alter this behavior, but this requires the clipping # the barcode from this mate by specifying --clip5pNbases and/or --clip3pNbases, which we do not want to do. STAR \ --readFilesIn "$uncompressed_file_r2" "$uncompressed_file_r1" \ --soloType Droplet \ --quantMode GeneCounts \ --genomeLoad LoadAndKeep \ --limitBAMsortRAM "$par_limitBAMsortRAM" \ --runThreadN "$par_runThreadN" \ --outFilterMultimapNmax 1 \ --outSAMtype BAM SortedByCoordinate \ --soloCBstart 1 \ --readFilesType "Fastx" \ --soloCBlen "$barcode_length" \ --soloUMIstart "$umi_start" \ --soloUMIlen "$par_UMIlength" \ --soloBarcodeReadLength 0 \ --soloStrand Unstranded \ --soloFeatures Gene \ --genomeDir "$par_genomeDir" \ --outReadsUnmapped Fastx \ --outSAMunmapped Within \ --outSAMattributes NH HI nM AS CR UR CB UB GX GN \ --soloCBwhitelist "$barcode.txt" \ --outFileNamePrefix "$dir" \ --outTmpDir "$TMPDIR/STARtemp/" printf "Done running STAR. " # Check if the number of processed reads is equal to the number of input reads local n_input_reads=$(($n_input_lines_r1 / 4)) local nr_output_reads=$(grep -Po "Number\ of\ input\ reads \\|\W*\K\d+" "$dir/Log.final.out") if (( $nr_output_reads != $n_input_reads )); then echo "Not all input reads were processed for barcode $barcode." return 1 else echo "Processed $nr_output_reads reads for barcode $barcode". fi printf "Making sure that the output has the proper permissions." find "$dir" -type d -exec chmod o+x {} \; chmod -R o+r "$dir" echo "Done" } # Export the function - requires bash export -f _run # Load reference genome echo "Loading reference genome" STAR --genomeLoad LoadAndExit --genomeDir "$par_genomeDir" # Run the concurrent jobs using GNU parallel # Make sure that parallel uses the correct shell export PARALLEL_SHELL="/bin/bash" # Some notes: # --halt now,fail=1: instruct parallel to exit when a job has failed and kill remaining running jobs. # # ::: is a special syntax for GNU parallel to delineate inputs # If multiple ::: are given, each group will be treated as an input source, and all combinations of input # sources will be generated. E.g. ::: 1 2 ::: a b c will result in the combinations (1,a) (1,b) (1,c) (2,a) (2,b) (2,c) # The delimiter :::+ (note the extra '+') links the argument to the previous argument, and one argument from each of the input # sources will be read. parallel_cmd=("parallel" "--jobs" "80%" "--verbose" "--memfree" "2G" "--tmpdir" "$meta_temp_dir" "--retry-failed" "--retries" "4" "--halt" "soon,fail=1" "--joblog" "$par_joblog" "_run" "{}") # Arguments for which there is one value, so these will not create extra jobs parallel_cmd+=(":::" "$par_umiLength" ":::" "$par_output" ":::" "$par_genomeDir" ":::" "$par_limitBAMsortRAM" ":::" "$par_runThreadN") # Argument which in fact will cause extra jobs to be spawned, per job one item from each argument will be selected # Thus, these argument lists should have the same length. parallel_cmd+=(":::" "${barcodes[@]}" ":::+" "${sorted_input_r1[@]}" ":::+" "${sorted_input_r2[@]}") set +eo pipefail "${parallel_cmd[@]}" exit_code=$? set -eo pipefail echo "GNU parallel finished!" # Unload reference printf "Unloading reference genome. " STAR --genomeLoad Remove --genomeDir "$par_genomeDir" echo "Done!" # Exit code from GNU parallel: # If fail=1 is used, the exit status will be the exit status of the failing job. echo "Checking exit code" if ((exit_code>0)); then # Note that the ending HERE must be indented with TAB characters (not spaces) # in order to remove leading indentation MESSAGE=$( cat <<-HERE ================================================================== !!! An error occurred for one of the jobs. Exit code of the failing job: $exit_code. %s ================================================================== HERE ) printf "$MESSAGE" "$(<$par_joblog)" exit 1 else cat <<-HERE ================================================================== Mapping went fine (exit code '$exit_code'), zero errors occurred ================================================================== HERE fi