Build pipeline: vsh-ci-dev-jsbwk
Source commit: 1e1ffb315f
Source message: Merge pull request #17 from viash-hub/add_biobox_modules
- Migrate a number of components to biobox
- Fix tests
- Reduce size of test resources
- Prepare for Viash Hub
247 lines
11 KiB
R
Executable File
247 lines
11 KiB
R
Executable File
#!/usr/bin/env Rscript
|
|
|
|
################################################
|
|
################################################
|
|
## REQUIREMENTS ##
|
|
################################################
|
|
################################################
|
|
|
|
## PCA, HEATMAP AND SCATTERPLOTS FOR SAMPLES IN COUNTS FILE
|
|
## - SAMPLE NAMES HAVE TO END IN e.g. "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS.
|
|
## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R
|
|
|
|
################################################
|
|
################################################
|
|
## LOAD LIBRARIES ##
|
|
################################################
|
|
################################################
|
|
|
|
library(optparse)
|
|
library(DESeq2)
|
|
library(ggplot2)
|
|
library(RColorBrewer)
|
|
library(pheatmap)
|
|
|
|
################################################
|
|
################################################
|
|
## PARSE COMMAND-LINE PARAMETERS ##
|
|
################################################
|
|
################################################
|
|
|
|
option_list <- list(
|
|
make_option(c("-i", "--count_file"), type="character", default=NULL, metavar="path", help="Count file matrix where rows are genes and columns are samples."),
|
|
make_option(c("-f", "--count_col"), type="integer", default=3, metavar="integer", help="First column containing sample count data."),
|
|
make_option(c("-d", "--id_col"), type="integer", default=1, metavar="integer", help="Column containing identifiers to be used."),
|
|
make_option(c("-r", "--sample_suffix"), type="character", default='', metavar="string", help="Suffix to remove after sample name in columns e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'."),
|
|
make_option(c("-p", "--outprefix"), type="character", default='deseq2', metavar="string" , help="Output prefix."),
|
|
make_option(c("-v", "--vst"), type="logical", default=FALSE, metavar="boolean", help="Run vst transform instead of rlog."),
|
|
make_option(c("-c", "--cores"), type="integer", default=1, metavar="integer", help="Number of cores."),
|
|
make_option(c("-o", "--outdir"), type="character", default="./", metavar="path", help="Output directory.")
|
|
)
|
|
|
|
opt_parser <- OptionParser(option_list=option_list)
|
|
opt <- parse_args(opt_parser)
|
|
if (is.null(opt$count_file)){
|
|
print_help(opt_parser)
|
|
stop("Please provide a counts file.", call.=FALSE)
|
|
}
|
|
|
|
################################################
|
|
################################################
|
|
## READ IN COUNTS FILE ##
|
|
################################################
|
|
################################################
|
|
|
|
count.table <- read.delim(file=opt$count_file,header=TRUE, row.names=NULL)
|
|
rownames(count.table) <- count.table[,opt$id_col]
|
|
count.table <- count.table[,opt$count_col:ncol(count.table),drop=FALSE]
|
|
colnames(count.table) <- gsub(opt$sample_suffix,"",colnames(count.table))
|
|
colnames(count.table) <- gsub(pattern='\\.$', replacement='', colnames(count.table))
|
|
|
|
################################################
|
|
################################################
|
|
## RUN DESEQ2 ##
|
|
################################################
|
|
################################################
|
|
|
|
if (file.exists(opt$outdir) == FALSE) {
|
|
dir.create(opt$outdir, recursive=TRUE)
|
|
}
|
|
setwd(opt$outdir)
|
|
|
|
samples.vec <- colnames(count.table)
|
|
name_components <- strsplit(samples.vec, "_")
|
|
n_components <- length(name_components[[1]])
|
|
decompose <- n_components!=1 && all(sapply(name_components, length)==n_components)
|
|
coldata <- data.frame(samples.vec, sample=samples.vec, row.names=1)
|
|
if (decompose) {
|
|
groupings <- as.data.frame(lapply(1:n_components, function(i) sapply(name_components, "[[", i)))
|
|
n_distinct <- sapply(groupings, function(grp) length(unique(grp)))
|
|
groupings <- groupings[n_distinct!=1 & n_distinct!=length(samples.vec)]
|
|
if (ncol(groupings)!=0) {
|
|
names(groupings) <- paste0("Group", 1:ncol(groupings))
|
|
coldata <- cbind(coldata, groupings)
|
|
} else {
|
|
decompose <- FALSE
|
|
}
|
|
}
|
|
|
|
DDSFile <- paste(opt$outprefix,".dds.RData",sep="")
|
|
|
|
counts <- count.table[,samples.vec,drop=FALSE]
|
|
dds <- DESeqDataSetFromMatrix(countData=round(counts), colData=coldata, design=~ 1)
|
|
dds <- estimateSizeFactors(dds)
|
|
if (min(dim(count.table))<=1) { # No point if only one sample, or one gene
|
|
save(dds,file=DDSFile)
|
|
saveRDS(dds, file=sub("\\.dds\\.RData$", ".rds", DDSFile))
|
|
warning("Not enough samples or genes in counts file for PCA.", call.=FALSE)
|
|
quit(save = "no", status = 0, runLast = FALSE)
|
|
}
|
|
if (!opt$vst) {
|
|
vst_name <- "rlog"
|
|
rld <- rlog(dds)
|
|
} else {
|
|
vst_name <- "vst"
|
|
rld <- varianceStabilizingTransformation(dds)
|
|
}
|
|
|
|
assay(dds, vst_name) <- assay(rld)
|
|
save(dds,file=DDSFile)
|
|
saveRDS(dds, file=sub("\\.dds\\.RData$", ".rds", DDSFile))
|
|
|
|
################################################
|
|
################################################
|
|
## PLOT QC ##
|
|
################################################
|
|
################################################
|
|
|
|
##' PCA pre-processeor
|
|
##'
|
|
##' Generate all the necessary information to plot PCA from a DESeq2 object
|
|
##' in which an assay containing a variance-stabilised matrix of counts is
|
|
##' stored. Copied from DESeq2::plotPCA, but with additional ability to
|
|
##' say which assay to run the PCA on.
|
|
##'
|
|
##' @param object The DESeq2DataSet object.
|
|
##' @param ntop number of top genes to use for principla components, selected by highest row variance.
|
|
##' @param assay the name or index of the assay that stores the variance-stabilised data.
|
|
##' @return A data.frame containing the projected data alongside the grouping columns.
|
|
##' A 'percentVar' attribute is set which includes the percentage of variation each PC explains,
|
|
##' and additionally how much the variation within that PC is explained by the grouping variable.
|
|
##' @author Gavin Kelly
|
|
plotPCA_vst <- function (object, ntop = 500, assay=length(assays(object))) {
|
|
rv <- rowVars(assay(object, assay))
|
|
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
|
|
pca <- prcomp(t(assay(object, assay)[select, ]), center=TRUE, scale=FALSE)
|
|
percentVar <- pca$sdev^2/sum(pca$sdev^2)
|
|
df <- cbind( as.data.frame(colData(object)), pca$x)
|
|
#Order points so extreme samples are more likely to get label
|
|
ord <- order(abs(rank(df$PC1)-median(df$PC1)), abs(rank(df$PC2)-median(df$PC2)))
|
|
df <- df[ord,]
|
|
attr(df, "percentVar") <- data.frame(PC=seq(along=percentVar), percentVar=100*percentVar)
|
|
return(df)
|
|
}
|
|
|
|
PlotFile <- paste(opt$outprefix,".plots.pdf",sep="")
|
|
|
|
pdf(file=PlotFile, onefile=TRUE, width=7, height=7)
|
|
## PCA
|
|
ntop <- c(500, Inf)
|
|
for (n_top_var in ntop) {
|
|
pca.data <- plotPCA_vst(dds, assay=vst_name, ntop=n_top_var)
|
|
percentVar <- round(attr(pca.data, "percentVar")$percentVar)
|
|
plot_subtitle <- ifelse(n_top_var==Inf, "All genes", paste("Top", n_top_var, "genes"))
|
|
pl <- ggplot(pca.data, aes(PC1, PC2, label=paste0(" ", sample, " "))) +
|
|
geom_point() +
|
|
geom_text(check_overlap=TRUE, vjust=0.5, hjust="inward") +
|
|
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
|
|
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
|
|
labs(title = paste0("First PCs on ", vst_name, "-transformed data"), subtitle = plot_subtitle) +
|
|
theme(legend.position="top",
|
|
panel.grid.major = element_blank(),
|
|
panel.grid.minor = element_blank(),
|
|
panel.background = element_blank(),
|
|
panel.border = element_rect(colour = "black", fill=NA, size=1))
|
|
print(pl)
|
|
|
|
if (decompose) {
|
|
pc_names <- paste0("PC", attr(pca.data, "percentVar")$PC)
|
|
long_pc <- reshape(pca.data, varying=pc_names, direction="long", sep="", timevar="component", idvar="pcrow")
|
|
long_pc <- subset(long_pc, component<=5)
|
|
long_pc_grp <- reshape(long_pc, varying=names(groupings), direction="long", sep="", timevar="grouper")
|
|
long_pc_grp <- subset(long_pc_grp, grouper<=5)
|
|
long_pc_grp$component <- paste("PC", long_pc_grp$component)
|
|
long_pc_grp$grouper <- paste0(long_pc_grp$grouper, c("st","nd","rd","th","th")[long_pc_grp$grouper], " prefix")
|
|
pl <- ggplot(long_pc_grp, aes(x=Group, y=PC)) +
|
|
geom_point() +
|
|
stat_summary(fun=mean, geom="line", aes(group = 1)) +
|
|
labs(x=NULL, y=NULL, subtitle = plot_subtitle, title="PCs split by sample-name prefixes") +
|
|
facet_grid(component~grouper, scales="free_x") +
|
|
scale_x_discrete(guide = guide_axis(n.dodge = 3))
|
|
print(pl)
|
|
}
|
|
} # at end of loop, we'll be using the user-defined ntop if any, else all genes
|
|
|
|
## WRITE PC1 vs PC2 VALUES TO FILE
|
|
pca.vals <- pca.data[,c("PC1","PC2")]
|
|
colnames(pca.vals) <- paste0(colnames(pca.vals), ": ", percentVar[1:2], '% variance')
|
|
pca.vals <- cbind(sample = rownames(pca.vals), pca.vals)
|
|
write.table(pca.vals, file = paste(opt$outprefix, ".pca.vals.txt", sep=""),
|
|
row.names = FALSE, col.names = TRUE, sep = "\t", quote = TRUE)
|
|
|
|
## SAMPLE CORRELATION HEATMAP
|
|
sampleDists <- dist(t(assay(dds, vst_name)))
|
|
sampleDistMatrix <- as.matrix(sampleDists)
|
|
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
|
|
pheatmap(
|
|
sampleDistMatrix,
|
|
clustering_distance_rows=sampleDists,
|
|
clustering_distance_cols=sampleDists,
|
|
col=colors,
|
|
main=paste("Euclidean distance between", vst_name, "of samples")
|
|
)
|
|
|
|
## WRITE SAMPLE DISTANCES TO FILE
|
|
write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste(opt$outprefix, ".sample.dists.txt", sep=""),
|
|
row.names=FALSE, col.names=TRUE, sep="\t", quote=FALSE)
|
|
dev.off()
|
|
|
|
################################################
|
|
################################################
|
|
## SAVE SIZE FACTORS ##
|
|
################################################
|
|
################################################
|
|
|
|
SizeFactorsDir <- "size_factors/"
|
|
if (file.exists(SizeFactorsDir) == FALSE) {
|
|
dir.create(SizeFactorsDir, recursive=TRUE)
|
|
}
|
|
|
|
NormFactorsFile <- paste(SizeFactorsDir,opt$outprefix, ".size_factors.RData", sep="")
|
|
|
|
normFactors <- sizeFactors(dds)
|
|
save(normFactors, file=NormFactorsFile)
|
|
|
|
for (name in names(sizeFactors(dds))) {
|
|
sizeFactorFile <- paste(SizeFactorsDir,name, ".txt", sep="")
|
|
write(as.numeric(sizeFactors(dds)[name]), file=sizeFactorFile)
|
|
}
|
|
|
|
################################################
|
|
################################################
|
|
## R SESSION INFO ##
|
|
################################################
|
|
################################################
|
|
|
|
RLogFile <- "R_sessionInfo.log"
|
|
|
|
sink(RLogFile)
|
|
a <- sessionInfo()
|
|
print(a)
|
|
sink()
|
|
|
|
################################################
|
|
################################################
|
|
################################################
|
|
################################################
|