Files
openpipeline/resources_test_scripts/10x_20k_fixed.sh
CI cd0af18851 Build branch fix-integration-tests with version dev (2dbe3b72)
Build pipeline: vsh-ci-dev-k8tz4

Source commit: 2dbe3b7231

Source message: Fix pointers to test resources
2024-10-17 17:56:12 +00:00

202 lines
6.7 KiB
Bash
Executable File

#!/bin/bash
# Prepares the "$ID" test resources: subsamples the source fastq files,
# fetches the feature reference and probe set, and runs the cellranger_multi
# mapping workflow on the result.
set -eo pipefail

# Every relative path below is resolved against the repository root.
REPO_ROOT=$(git rev-parse --show-toplevel)
cd "$REPO_ROOT"

# Settings.
# NOTE(review): the page header above this script names the file
# 10x_20k_fixed.sh while ID is 10x_5k_fixed -- confirm this is intended.
ID=10x_5k_fixed
OUT="resources_test/$ID"

# Directory that receives the raw (subsampled) inputs.
raw_dir="$OUT/raw"
mkdir -p "$raw_dir"

# seqkit has to be on the PATH.
command -v seqkit &> /dev/null || {
  echo "This script requires seqkit. Please make sure the binary is added to your PATH."
  exit 1
}

# The Cell Ranger reference must have been created beforehand.
reference_dir="resources_test/reference_gencodev41_chr1/"
genome_tar="$reference_dir/reference_cellranger.tar.gz"
[[ -f "$genome_tar" ]] || {
  echo "$genome_tar does not exist. Please create the reference genome first"
  exit 1
}

# Scratch directory for this run, created under $VIASH_TEMP when that is set
# and under /tmp otherwise.
MY_TEMP="${VIASH_TEMP:-/tmp}"
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
#######################################
# Remove the temporary working directory of this run.
# Globals:   TMPDIR (read)
# Arguments: none
# Returns:   0 when the directory is absent or was removed; otherwise the
#            status of rm.
#######################################
clean_up() {
  if [[ -d "$TMPDIR" ]]; then
    rm -r -- "$TMPDIR"
  fi
}
# Run the cleanup on every exit path, including aborts triggered by `set -e`;
# without this registration the temporary directory is never removed.
trap clean_up EXIT
# dataset page:
# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
# download and untar source fastq files into a per-user cache directory
tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
if [[ ! -d "$tar_dir" ]]; then
  # Extract into a staging directory and rename it only after everything
  # succeeded. The existence of "$tar_dir" is what marks the cache as
  # complete, so it must never be created by a failed download or extraction.
  tar_staging="$tar_dir.partial"
  rm -rf -- "$tar_staging"
  mkdir -p "$tar_staging"
  # download fastqs and untar
  wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
  tar -xvf "$tar_dir.tar" -C "$tar_staging" --strip-components=1
  rm "$tar_dir.tar"
  mv "$tar_staging" "$tar_dir"
fi
#######################################
# Write the output of `seqkit head -n 200000` for a fastq file to a
# gzip-compressed file, unless that file already exists.
# Arguments: $1 - path of the input fastq file
#            $2 - path of the gzip-compressed output file
# Outputs:   a "> Processing <basename>" line on stdout when work is done
# Returns:   0 on success or when the output already exists, 1 when the
#            seqkit/gzip pipeline fails (seqkit failures are only detected
#            when `pipefail` is enabled, as it is in this script).
#######################################
seqkit_head() {
  local input="$1"
  local output="$2"
  # Written first and renamed on success, so a failed run cannot leave a
  # truncated "$output" that the existence check would accept next time.
  local partial="$output.partial"

  if [[ -f "$output" ]]; then
    return 0
  fi

  echo "> Processing $(basename -- "$input")"
  if ! seqkit head -n 200000 "$input" | gzip > "$partial"; then
    rm -f -- "$partial"
    return 1
  fi
  mv -- "$partial" "$output"
}
# Subsample the R1 and R2 fastq files of lane L001 from the download cache
# into the raw directory, inserting "_subset" after the sample id.
orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
for read_tag in R1 R2; do
  seqkit_head \
    "$tar_dir/${orig_sample_id}_S1_L001_${read_tag}_001.fastq.gz" \
    "$raw_dir/${orig_sample_id}_subset_S1_L001_${read_tag}_001.fastq.gz"
done
#######################################
# Download a URL to a file unless that file already exists.
# The transfer is written to "<dest>.partial" and renamed only on success:
# `wget -O` creates its output file even when the transfer fails, which would
# otherwise satisfy the existence check on the next run.
# Arguments: $1 - URL to fetch
#            $2 - destination path
# Returns:   0 when the file exists or was downloaded, 1 when wget fails.
#######################################
fetch_if_missing() {
  local url="$1"
  local dest="$2"

  if [[ -f "$dest" ]]; then
    return 0
  fi
  if ! wget "$url" -O "$dest.partial"; then
    rm -f -- "$dest.partial"
    return 1
  fi
  mv -- "$dest.partial" "$dest"
}

# download feature reference
feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
fetch_if_missing "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" "$feature_ref"

# download probe set
probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
fetch_if_missing "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" "$probe_set"

# Rewrite the reference genome name in the probe set header from "GRCh38" to
# "output". This runs on every invocation, not only right after a download.
# NOTE(review): "output" presumably matches the genome name of the test
# reference -- confirm.
sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
# Create a copy of the probe set whose gene IDs (first CSV column) are replaced
# by the IDs found in the test reference GTF. Probes whose ID does not occur in
# the GTF are dropped; the "gene_id" header row and "#" lines are copied as-is.
probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
if [[ ! -f "$probe_set_corrected" ]]; then
  reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
  gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf"

  # Rows are written through file descriptor 3 into a ".partial" file that is
  # renamed once the loop has finished. An interrupted run therefore cannot
  # leave a truncated file that the existence check above would accept.
  probe_set_partial="$probe_set_corrected.partial"

  # IFS= and -r keep each line verbatim; the `|| [[ -n ... ]]` part also
  # processes a last line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    echo "Line: $line"
    old_id=$(printf "%s\n" "$line" | awk -F',' '{print $1}')
    echo "Old ID: $old_id"
    if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]]; then
      echo "Just writing line"
      printf "%s\n" "$line" >&3
    else
      # grep exits with 1 when nothing matches; that is not an error here,
      # while any other non-zero status still aborts under `set -e`.
      gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1)
      if [[ -n "$gtf_lookup" ]]; then
        echo "Found hit"
        # Take the 10th whitespace-separated field of the matching "gene"
        # records and strip the surrounding quotes and trailing semicolon.
        new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
        echo "New ID: $new_id"
        new_line=${line/"$old_id"/"$new_id"}
        printf "%s\n" "$new_line" >&3
      else
        echo "Did not find hit"
      fi
    fi
  done < "$probe_set" 3> "$probe_set_partial"

  mv -- "$probe_set_partial" "$probe_set_corrected"
fi
# # Input FASTA:
# # >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
# # Output FASTA:
# # >chr1 1
# input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
# fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
# if [[ ! -f "$input_fastq" ]]; then
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
# fi
# zcat "$input_fastq" \
# | sed -E 's/^>(\S+).*/>\1 \1/' \
# | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
# | sed -E 's/^>MT />chrM /' \
# > "$fasta_modified"
# pigz --fast "$fasta_modified"
# fasta_modified="$fasta_modified.gz"
# # Input GTF:
# # ... gene_id "ENSG00000223972.5"; ...
# # Output GTF:
# # ... gene_id "ENSG00000223972"; gene_version "5"; ...
# input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
# gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
# if [[ ! -f "$input_gtf" ]]; then
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
# fi
# REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
# zcat "$input_gtf" \
# | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
# | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
# | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
# > "$gtf_modified"
# pigz --fast "$gtf_modified"
# gtf_modified="$gtf_modified.gz"
# Build the GRCh38 Cell Ranger reference in the per-user cache when it is not
# there yet.
final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
if [[ ! -f "$final_genome" ]]; then
  # The only assignments to fasta_modified and gtf_modified in this script are
  # inside the commented-out section above. Unless they are supplied through
  # the environment, the workflow would be launched with empty inputs, so stop
  # here with an explicit message instead.
  if [[ -z "${fasta_modified:-}" || -z "${gtf_modified:-}" ]]; then
    echo "Cannot build $final_genome: fasta_modified and gtf_modified must point to the prepared genome FASTA and GTF." >&2
    exit 1
  fi
  NXF_VER=21.10.6 nextflow \
    run . \
    -main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
    -profile docker \
    -resume \
    --id "GRCh38" \
    --genome_fasta "$fasta_modified" \
    --transcriptome_gtf "$gtf_modified" \
    --target "cellranger" \
    --output_fasta "reference.fa.gz" \
    --output_gtf "reference.gtf.gz" \
    --output_cellranger "GRCh38.cellranger.genome.fa.gz" \
    --publish_dir "$HOME/.cache/openpipeline/"
fi
# Run mapping pipeline
# The parameter file is written into the private temporary directory of this
# run rather than to a fixed path in /tmp, so that concurrent runs or other
# scripts cannot overwrite it. The .yaml extension is kept for nextflow.
params_file="$TMPDIR/params.yaml"
cat > "$params_file" << HERE
param_list:
- id: "$ID"
input: "$raw_dir"
library_id:
- ${orig_sample_id}_subset
library_type:
- "Gene Expression"
library_lanes:
- "any"
probe_set: "$probe_set_corrected"
gex_reference: "$genome_tar"
feature_reference: "$feature_ref"
publish_dir: "$OUT/processed"
probe_barcode_ids:
- BC001
- BC002
- BC003
- BC004
sample_ids:
- Liver_BC1
- Ovarian_BC2
- Colorectal_BC3
- Pancreas_BC4
gex_generate_bam: false
sample_force_cells:
- 5000
- -1
- -1
- -1
HERE
nextflow \
  run . \
  -main-script target/nextflow/mapping/cellranger_multi/main.nf \
  -resume \
  -profile docker,mount_temp \
  -params-file "$params_file" \
  -c src/workflows/utils/labels_ci.config