Build pipeline: vsh-ci-dev-k8tz4
Source commit: 2dbe3b7231
Source message: Fix pointers to test resources
202 lines
6.7 KiB
Bash
Executable File
202 lines
6.7 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
set -eo pipefail
|
|
|
|
# ensure that the command below is run from the root of the repository
|
|
REPO_ROOT=$(git rev-parse --show-toplevel)
|
|
cd "$REPO_ROOT"
|
|
|
|
# settings
|
|
ID=10x_5k_fixed
|
|
OUT="resources_test/$ID"
|
|
|
|
# create raw directory
|
|
raw_dir="$OUT/raw"
|
|
mkdir -p "$raw_dir"
|
|
|
|
# Check whether seqkit is available
|
|
if ! command -v seqkit &> /dev/null; then
|
|
echo "This script requires seqkit. Please make sure the binary is added to your PATH."
|
|
exit 1
|
|
fi
|
|
|
|
# check whether reference is available
|
|
reference_dir="resources_test/reference_gencodev41_chr1/"
|
|
genome_tar="$reference_dir/reference_cellranger.tar.gz"
|
|
if [[ ! -f "$genome_tar" ]]; then
|
|
echo "$genome_tar does not exist. Please create the reference genome first"
|
|
exit 1
|
|
fi
|
|
|
|
# create tempdir
|
|
MY_TEMP="${VIASH_TEMP:-/tmp}"
|
|
TMPDIR=$(mktemp -d "$MY_TEMP/$ID-XXXXXX")
|
|
function clean_up {
|
|
[[ -d "$TMPDIR" ]] && rm -r "$TMPDIR"
|
|
}
|
|
|
|
# dataset page:
|
|
# https://www.10xgenomics.com/datasets/mixture-of-healthy-and-cancer-ffpe-tissues-dissociated-using-miltenyi-ffpe-tissue-dissociation-kit-multiplexed-samples-4-probe-barcodes-1-standard
|
|
|
|
# download and untar source fastq files
|
|
tar_dir="$HOME/.cache/openpipeline/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
|
|
if [[ ! -d "$tar_dir" ]]; then
|
|
mkdir -p "$tar_dir"
|
|
|
|
# download fastqs and untar
|
|
wget "https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/7.1.0/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex/4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex_Multiplex_fastqs.tar" -O "$tar_dir.tar"
|
|
tar -xvf "$tar_dir.tar" -C "$tar_dir" --strip-components=1
|
|
rm "$tar_dir.tar"
|
|
fi
|
|
|
|
function seqkit_head {
|
|
input="$1"
|
|
output="$2"
|
|
if [[ ! -f "$output" ]]; then
|
|
echo "> Processing `basename $input`"
|
|
seqkit head -n 200000 "$input" | gzip > "$output"
|
|
fi
|
|
}
|
|
|
|
orig_sample_id="4plex_human_liver_colorectal_ovarian_panc_scFFPE_multiplex"
|
|
|
|
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R1_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R1_001.fastq.gz"
|
|
seqkit_head "$tar_dir/${orig_sample_id}_S1_L001_R2_001.fastq.gz" "$raw_dir/${orig_sample_id}_subset_S1_L001_R2_001.fastq.gz"
|
|
|
|
# download feature reference
|
|
feature_ref="$raw_dir/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_feature_reference.csv"
|
|
if [[ ! -f "$feature_ref" ]]; then
|
|
wget "https://cf.10xgenomics.com/samples/cell-exp/7.2.0/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex/4plex_mouse_LymphNode_Spleen_TotalSeqC_multiplex_Multiplex_count_feature_reference.csv" -O "$feature_ref"
|
|
fi
|
|
|
|
# download probe set
|
|
probe_set="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv"
|
|
if [[ ! -f "$probe_set" ]]; then
|
|
wget "https://cf.10xgenomics.com/supp/cell-exp/probeset/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A.csv" -O "$probe_set"
|
|
fi
|
|
|
|
sed -i 's/#reference_genome=GRCh38/#reference_genome=output/g' "$probe_set"
|
|
|
|
probe_set_corrected="$raw_dir/Chromium_Human_Transcriptome_Probe_Set_v1.0_GRCh38-2020-A_corrected.csv"
|
|
if [[ ! -f "$probe_set_corrected" ]]; then
|
|
reference_gtf="resources_test/reference_gencodev41_chr1/reference.gtf.gz"
|
|
gunzip -c "$reference_gtf" > "$TMPDIR/uncompressed_ref.gtf"
|
|
cat "$probe_set" | while read line || [[ -n $line ]];
|
|
do
|
|
echo "Line: $line"
|
|
old_id=$( printf "%s\n" "$line" | awk -F',' '{print $1}' )
|
|
echo "Old ID: $old_id"
|
|
if [[ "$old_id" == "gene_id" ]] || [[ "$old_id" == \#* ]] ; then
|
|
echo "Just writing line"
|
|
printf "%s\n" "$line" >> "$probe_set_corrected"
|
|
else
|
|
gtf_lookup=$(grep "$old_id" "$TMPDIR/uncompressed_ref.gtf" || test $? = 1;)
|
|
if [ ! -z "$gtf_lookup" ]; then
|
|
echo "Found hit"
|
|
new_id=$(echo "$gtf_lookup" | awk '{if ($3 == "gene") print $10;}' | sed -e "s/^\"//" -e "s/\";$//")
|
|
echo "New ID: $new_id"
|
|
new_line=${line/"$old_id"/"$new_id"}
|
|
printf "%s\n" "$new_line" >> "$probe_set_corrected"
|
|
else
|
|
echo "Did not find hit"
|
|
fi
|
|
fi
|
|
done
|
|
fi
|
|
|
|
# # Input FASTA:
|
|
# # >1 dna:chromosome chromosome:GRCh38:1:1:248956422:1 REF
|
|
# # Output FASTA:
|
|
# # >chr1 1
|
|
# input_fastq="$HOME/.cache/openpipeline/GRCh38.primary_assembly.genome.fa.gz"
|
|
# fasta_modified="$TMPDIR/GRCh38.primary_assembly.genome.modified.fa"
|
|
# if [[ ! -f "$input_fastq" ]]; then
|
|
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/GRCh38.primary_assembly.genome.fa.gz" -O "$input_fastq"
|
|
# fi
|
|
# zcat "$input_fastq" \
|
|
# | sed -E 's/^>(\S+).*/>\1 \1/' \
|
|
# | sed -E 's/^>([0-9]+|[XY]) />chr\1 /' \
|
|
# | sed -E 's/^>MT />chrM /' \
|
|
# > "$fasta_modified"
|
|
|
|
# pigz --fast "$fasta_modified"
|
|
# fasta_modified="$fasta_modified.gz"
|
|
# # Input GTF:
|
|
# # ... gene_id "ENSG00000223972.5"; ...
|
|
# # Output GTF:
|
|
# # ... gene_id "ENSG00000223972"; gene_version "5"; ...
|
|
# input_gtf="$HOME/.cache/openpipeline/gencode.v41.annotation.gtf.gz"
|
|
# gtf_modified="$TMPDIR/gencode.v41.annotation.gtf.modified.gtf"
|
|
# if [[ ! -f "$input_gtf" ]]; then
|
|
# wget "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_41/gencode.v41.annotation.gtf.gz" -O "$input_gtf"
|
|
# fi
|
|
|
|
# REGEX="(ENS(MUS)?[GTE][0-9]+)\.([0-9]+)"
|
|
# zcat "$input_gtf" \
|
|
# | sed -E 's/gene_id "'"$REGEX"'";/gene_id "\1"; gene_version "\3";/' \
|
|
# | sed -E 's/transcript_id "'"$REGEX"'";/transcript_id "\1"; transcript_version "\3";/' \
|
|
# | sed -E 's/exon_id "'"$REGEX"'";/exon_id "\1"; exon_version "\3";/' \
|
|
# > "$gtf_modified"
|
|
# pigz --fast "$gtf_modified"
|
|
# gtf_modified="$gtf_modified.gz"
|
|
|
|
final_genome="$HOME/.cache/openpipeline/GRCh38.cellranger.genome.fa.gz"
|
|
if [ ! -f "$final_genome" ]; then
|
|
NXF_VER=21.10.6 nextflow \
|
|
run . \
|
|
-main-script target/nextflow/workflows/ingestion/make_reference/main.nf \
|
|
-profile docker \
|
|
-resume \
|
|
--id "GRCh38" \
|
|
--genome_fasta "$fasta_modified" \
|
|
--transcriptome_gtf "$gtf_modified" \
|
|
--target "cellranger" \
|
|
--output_fasta "reference.fa.gz" \
|
|
--output_gtf "reference.gtf.gz" \
|
|
--output_cellranger "GRCh38.cellranger.genome.fa.gz" \
|
|
--publish_dir "$HOME/.cache/openpipeline/"
|
|
fi
|
|
|
|
|
|
# Run mapping pipeline
|
|
cat > /tmp/params.yaml << HERE
|
|
param_list:
|
|
- id: "$ID"
|
|
input: "$raw_dir"
|
|
library_id:
|
|
- ${orig_sample_id}_subset
|
|
library_type:
|
|
- "Gene Expression"
|
|
library_lanes:
|
|
- "any"
|
|
|
|
probe_set: "$probe_set_corrected"
|
|
gex_reference: "$genome_tar"
|
|
feature_reference: "$feature_ref"
|
|
publish_dir: "$OUT/processed"
|
|
probe_barcode_ids:
|
|
- BC001
|
|
- BC002
|
|
- BC003
|
|
- BC004
|
|
sample_ids:
|
|
- Liver_BC1
|
|
- Ovarian_BC2
|
|
- Colorectal_BC3
|
|
- Pancreas_BC4
|
|
gex_generate_bam: false
|
|
sample_force_cells:
|
|
- 5000
|
|
- -1
|
|
- -1
|
|
- -1
|
|
HERE
|
|
|
|
nextflow \
|
|
run . \
|
|
-main-script target/nextflow/mapping/cellranger_multi/main.nf \
|
|
-resume \
|
|
-profile docker,mount_temp \
|
|
-params-file /tmp/params.yaml \
|
|
-c src/workflows/utils/labels_ci.config
|