Files
htrnaseq/src/eset/create_eset/script.R
CI 228364274a Build branch fix/empty with version fix-empty (3b873b6)
Build pipeline: viash-hub.htrnaseq.fix-empty-p2njn

Source commit: 3b873b622f

Source message: Fix missing test file from config
2025-02-17 08:25:38 +00:00

432 lines
15 KiB
R

library(Biobase)
library(data.table)
library(nlcv)
library(Matrix)
library(Seurat)
### VIASH START
par <- list(
pDataFile = "src/eset/create_eset/test_data/pData.tsv",
fDataFile = "src/eset/create_eset/test_data/fData.tsv",
studyType = "Standard",
mappingDir = c("src/eset/create_eset/test_data/mapping_dir/AACAAGGTAC",
"src/eset/create_eset/test_data/mapping_dir/ACGCCTTCGT"),
output = "eset.rds",
poolName = "Foo"
)
### VIASH END
Read10X <- function(data_dir = NULL, gene_column = 2, unique_features = TRUE) {
full.data <- list()
for (i in seq_along(along.with = data_dir)) {
run <- data_dir[i]
if (!dir.exists(paths = run)) {
stop("Directory provided does not exist")
}
barcode.loc <- file.path(run, "barcodes.tsv")
gene.loc <- file.path(run, "features.tsv")
features.loc <- file.path(run, "features.tsv.gz")
matrix.loc <- file.path(run, "matrix.mtx")
pre_ver_3 <- file.exists(gene.loc)
if (!pre_ver_3) {
addgz <- function(s) {
return(paste0(s, ".gz"))
}
barcode.loc <- addgz(s = barcode.loc)
matrix.loc <- addgz(s = matrix.loc)
}
if (!file.exists(barcode.loc)) {
stop("Barcode file missing")
}
if (!pre_ver_3 && !file.exists(features.loc)) {
stop("Gene name or features file missing")
}
if (!file.exists(matrix.loc)) {
stop("Expression matrix file missing")
}
data <- readMM(file = matrix.loc)
cell.names <- readLines(barcode.loc)
if (all(grepl(pattern = "\\-1$", x = cell.names))) {
cell.names <- as.vector(x = as.character(x = sapply(X = cell.names,
FUN = ExtractField, field = 1, delim = "-")))
}
if (is.null(x = names(x = data_dir))) {
if (i < 2) {
colnames(x = data) <- cell.names
}
else {
colnames(x = data) <- paste0(i, "_", cell.names)
}
}
else {
colnames(x = data) <- paste0(names(x = data_dir)[i],
"_", cell.names)
}
feature.names <- read.delim(file = ifelse(test = pre_ver_3,
yes = gene.loc, no = features.loc), header = FALSE,
stringsAsFactors = FALSE)
if (any(is.na(x = feature.names[, gene_column]))) {
warning("Some features names are NA. Replacing NA names with ID from the opposite column requested",
call. = FALSE, immediate. = TRUE)
na.features <- which(x = is.na(x = feature.names[,
gene_column]))
replacement.column <- ifelse(test = gene_column ==
2, yes = 1, no = 2)
feature.names[na.features, gene_column] <- feature.names[na.features,
replacement.column]
}
if (unique_features) {
fcols = ncol(x = feature.names)
if (fcols < gene_column) {
stop(paste0("gene_column was set to ", gene_column,
" but feature.tsv.gz (or genes.tsv) only has ",
fcols, " columns.", " Try setting the gene_column ",
"argument to a value <= to ",
fcols, "."))
}
rownames(x = data) <- make.unique(names = feature.names[,
gene_column])
}
if (ncol(x = feature.names) > 2) {
data_types <- factor(x = feature.names$V3)
lvls <- levels(x = data_types)
if (length(x = lvls) > 1 && length(x = full.data) == 0) {
message(paste0("10X data contains more than one type and is ",
"being returned as a list containing matrices ",
"of each type."))
}
expr_name <- "Gene Expression"
if (expr_name %in% lvls) {
lvls <- c(expr_name, lvls[-which(x = lvls ==
expr_name)])
}
data <- lapply(X = lvls, FUN = function(l) {
return(data[data_types == l, , drop = FALSE])
})
names(x = data) <- lvls
} else {
data <- list(data)
}
full.data[[length(x = full.data) + 1]] <- data
}
list_of_data <- list()
for (j in 1:length(x = full.data[[1]])) {
list_of_data[[j]] <- do.call(cbind, lapply(X = full.data,
FUN = `[[`, j))
list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "CsparseMatrix")
}
names(x = list_of_data) <- names(x = full.data[[1]])
if (length(x = list_of_data) == 1) {
return(list_of_data[[1]])
} else {
return(list_of_data)
}
}
match_features <- function(exprs_matrix, fdata) {
identical_features <- all(rownames(exprs_matrix) == rownames(fdata))
if (nrow(exprs_matrix) != nrow(fdata) || !identical_features) {
message(paste0("Features in 'fData' and expression matrix differ. ",
"Only matching features are returned."))
}
features <- intersect(rownames(exprs_matrix), rownames(fdata))
exprs_matrix <- exprs_matrix[which(rownames(exprs_matrix) %in% features), ]
fdata <- fdata[which(rownames(fdata) %in% features), ]
fdata[, seq_len(ncol(fdata))] <- lapply(fdata[, seq_len(ncol(fdata)), drop = FALSE], as.character)
# order features in exprs mat according to fdata
exprs_matrix <- exprs_matrix[match(rownames(fdata), rownames(exprs_matrix)), ]
list(exprs_matrix = exprs_matrix, fdata = fdata)
}
create_pdata <- function(sample_file, pool_name, barcodes) {
cols_to_remove <- c("SampleFileName", "Output", "Measure", "Strandedness")
pData <- sample_file[, !colnames(sample_file) %in% cols_to_remove,
drop = FALSE]
rownames(pData) <- lapply(sample_file$WellBC,
\(x) paste(pool_name, x, sep = "_"))
# pData[, ] <- lapply(pData, as.factor)
pData$PoolName <- pool_name
pData <- pData[match(barcodes, pData$WellBC), ]
return(pData)
}
check_sample_file <- function(mapping_dir, sample_file){
message("Checking sample annotation:")
requireNamespace("tools")
mapping_dir <- unlist(lapply(mapping_dir, function(x) {
if (!dir.exists(x)) {
stop(sprintf(paste0("Could not find directory ",
"provided in 'mappingDir' argument (%s)."), x))
}
tools::file_path_as_absolute(x)
}))
# additional check for STARsolo
check_STARsolo_output <- function(x) {
files <- c("barcodes.tsv", "features.tsv", "matrix.mtx")
test <- list.files(x) %in% c(files, paste0(files, ".gz"))
length(test) != 0 && all(test)
}
if (!"WellBC" %in% colnames(sample_file)) {
stop(paste0("STARsolo output is used. The sample annotation must ",
"contain 'WellBC' column providing cell barcodes."))
}
mapping_dir <- unique(mapping_dir)
all_STARsolo_files_present <- all(
unlist(
lapply(mapping_dir, function(x) {
check_STARsolo_output(x)
})
)
)
if (!all_STARsolo_files_present) {
stop(paste0("Could not find files: 'barcodes', 'features' and 'matrix'",
" for STARsolo output. Please check 'mappingDir' argument."))
}
message("- 'SampleFileName' column - OK")
list(sample_expression_files = mapping_dir)
}
create_exprs_matrix <- function(exprs_matrix_path, exprs_file_paths,
output, measure, col_names, cell_barcodes) {
read_matrix <- Read10X(data_dir = exprs_file_paths, gene_column = 1)
read_matrix <- read_matrix[, which(colSums(read_matrix) != 0)]
# keep index of feature names containing "_" because Seurat
#changes them to "-" and they no longer match with fdata[, "gene_id"]
idx <- grep("_", rownames(read_matrix))
requireNamespace("Seurat")
seurat_object <- Seurat::CreateSeuratObject(counts = read_matrix)
exprs_matrix <- as.matrix(seurat_object[['RNA']]$counts)
# replace "-" with "_" for features with "_"
# before converting to Seurat object
rownames(exprs_matrix)[idx] <- gsub("-", "_", rownames(exprs_matrix)[idx])
requireNamespace("stringr")
exprs_matrix <- exprs_matrix[, stringr::str_detect(colnames(exprs_matrix),
paste(cell_barcodes, collapse = "|"))]
# check if rownames are ENSEMBL and remove version suffix
isENSEMBL <- all(grepl("ENS", rownames(exprs_matrix)))
if (isENSEMBL) {
# do not use gsub("(.+)[.]\\d+", "\\1", rownames(exprs_matrix)),
# so that ENS000000.1_PAR_Y can be kept
rownames(exprs_matrix) <- gsub("\\.\\d+$", "", rownames(exprs_matrix))
}
colnames(exprs_matrix) <- col_names
exprs_matrix
}
create_eset <- function(feature_annotation_path,
sample_annotation_path,
mapping_dir,
barcodes,
output_path,
pool_name,
exprs_matrix_path = NULL,
path = NULL,
add_eset_annotation = NULL) {
if (!file.exists(feature_annotation_path)) {
stop("Could not find feature annotation at '", feature_annotation_path, "'")
}
if (!file.exists(sample_annotation_path)) {
stop("Could not find sample annotation at '", sample_annotation_path, "'")
}
if(!is.null(exprs_matrix_path)) {
if(!file.exists(exprs_matrix_path)) {
stop("Could not find expression matrix at '", exprs_matrix_path, "'")
}
}
if(!is.null(path)) {
if(!dir.exists(path)) {
stop("Provided 'path': '", path, "' does not exist.")
}
}
##### Import annotation files #####
message("Importing feature annotation")
fdata_file <- read.table(feature_annotation_path, header = TRUE,
sep = "\t", quote = "\"",
comment.char = "", stringsAsFactors = FALSE)
# for backwards compatibility
if("ENSEMBL" %in% colnames(fdata_file) && !all(grepl("ENS", fdata_file[, "ENSEMBL"])) & !"gene_id" %in% colnames(fdata_file)) {
colnames(fdata_file)[which(colnames(fdata_file) == "ENSEMBL")] <- "gene_id"
}
# Check gene annotation
if(!"gene_id" %in% colnames(fdata_file))
stop("'gene_id' column with unique feature identifiers must be present in 'feature_annotation_path'.")
# check if duplicated ids are present
if(any(duplicated(fdata_file$gene_id)))
stop("Duplicated features ids are not allowed. Please check the 'gene_id' column in 'feature_annotation_path'.")
message("Importing sample annotation")
sample_file <- read.table(sample_annotation_path, header = TRUE,
sep = "\t", quote = "\"",
comment.char = "", stringsAsFactors = FALSE)
# Check sample annotation
check_sample_file_list <- check_sample_file(mapping_dir = mapping_dir,
sample_file = sample_file)
output <- "STARsolo"
measure <- "counts"
sample_expression_files <- check_sample_file_list$sample_expression_files
##### Create phenodata #####
pdata_eset <- create_pdata(sample_file = sample_file, pool_name = pool_name,
barcodes = barcodes)
##### Create expression matrix #####
message("Creating expression matrix")
exprs_matrix_eset <- create_exprs_matrix(
exprs_matrix_path = exprs_matrix_path,
exprs_file_paths = sample_expression_files,
output = output,
measure = measure,
col_names = rownames(pdata_eset),
cell_barcodes = barcodes
)
##### Create featuredata #####
message("Creating feature data")
fdata_eset <- fdata_file
rownames(fdata_eset) <- fdata_eset[, "gene_id"]
# intersect features between exprs matrix and fdata
feature_files <- match_features(exprs_matrix = exprs_matrix_eset,
fdata = fdata_eset)
fdata_eset <- feature_files$fdata
exprs_matrix_eset <- feature_files$exprs_matrix
##### Create eSet #####
message("Creating eset")
if (nrow(pdata_eset) != ncol(exprs_matrix_eset)) {
stop("nrow(pData) and ncol(exprsMatrix) differ")
}
if (nrow(fdata_eset) != nrow(exprs_matrix_eset)) {
stop("nrow(fData) and nrow(exprsMatrix) differ")
}
if (!all(rownames(pdata_eset) == colnames(exprs_matrix_eset))) {
stop("rownames(pData) and colnames(exprsMatrix) differ")
}
if (!all(rownames(fdata_eset) == rownames(exprs_matrix_eset))) {
stop("rownames(fData) and rownames(exprsMatrix) differ")
}
if (!inherits(exprs_matrix_eset, "matrix")) {
stop("exprsMatrix must be of class 'matrix'")
}
additional_info <- paste0("Additional information about eSet \n",
" Expression matrix created from ",
output, " output. \n",
" Expression matrix contains non-transformed ",
ifelse(output %in% c("STAR", "STARsolo"),
"counts",
ifelse(measure == "expected_count",
"counts", measure)), ".")
if (isTRUE(!is.null(add_eset_annotation) &
is.character(add_eset_annotation))) {
additional_info <- paste0(additional_info, "\n", " ", add_eset_annotation)
}
fdata_eset <- new("AnnotatedDataFrame", data = fdata_eset)
pdata_eset <- new("AnnotatedDataFrame", data = pdata_eset)
requireNamespace("Biobase")
eset <- Biobase::ExpressionSet(assayData = exprs_matrix_eset,
phenoData = pdata_eset,
featureData = fdata_eset,
annotation = additional_info)
saveRDS(eset, file = output_path)
message(paste0("eset created succesfully for ", ncol(eset),
" samples and ", nrow(eset),
" genes and saved at ", output_path, "."))
eset
}
p_data_file <- par$pDataFile
f_data_file <- par$fDataFile
pool_name <- par$poolName
mapping_dir <- lapply(par$mappingDir,
\(x) file.path(x, "Solo.out", "Gene", "raw"))
get_barcode_from_mapping_dir <- function(raw_dir) {
barcodes_file <- file.path(raw_dir, "barcodes.tsv")
if (!file.exists(barcodes_file)) {
stop(paste0("Expected the 'Solo.out/Gene/raw' directory at ",
raw_dir, " to contain a 'barcodes.tsv' file."))
}
barcodes <- readLines(barcodes_file)
if (length(barcodes) != 1) {
stop(paste0("A single STAR Solo folder should only have ",
"mapped one (1) barcode, but found '",
length(barcodes), "'for mapping directory ", raw_dir))
}
return(barcodes)
}
barcodes <- lapply(mapping_dir, get_barcode_from_mapping_dir)
print(paste0("mappingDir: ", mapping_dir))
print(paste0("pDataFile: ", p_data_file))
print(paste0("fDataFile: ", f_data_file))
print(paste0("poolName: ", pool_name))
print(paste0("barcodes: ", barcodes))
# CREATE ESET WITH RAW UMI COUNTS
eset <- create_eset(feature_annotation_path = f_data_file,
sample_annotation_path = p_data_file,
mapping_dir = mapping_dir,
barcodes = barcodes,
output_path = par$output,
pool_name = pool_name,
path = NULL,
exprs_matrix_path = NULL)