#' Load count matrix from a txt (or txt.gz) file
#'
#' @param txt.file Filename of the counts txt (or txt.gz) file
#' @return Count matrix with genes and barcodes
#' @export
LoadTxt <- function (txt.file)
{
E = utils::read.delim(txt.file, stringsAsFactors=FALSE, row.names = 1)
E = as.matrix(E)
E = Matrix::Matrix(E, sparse = TRUE)
return(E)
}
#' Load count matrix from a txt (or txt.gz) file
#'
#' @param txt.folder folder where each sub-directory contains the counts txt (or txt.gz) file
#' @return Count matrix with genes and barcodes
#' @export
LoadTxtFolder <- function (txt.folder)
{
txt.folder = gsub("\\/$", "", txt.folder, perl = TRUE);
fls = list.files(txt.folder, full.names = T)
E = lapply(fls, function(x) utils::read.delim(x, stringsAsFactors=FALSE, row.names = 1))
E = lapply(E, as.matrix)
E = lapply(E, function(x) Matrix::Matrix(x, sparse = TRUE))
return(E)
}
#' Load 10X count matrix from an h5 file
#'
#' @param filename Directory and filename of the h5 file
#' @param use.names Boolean TRUE, see ?Seurat::Read10X_h5
#' @param unique Sets the rownames (genes) of the matrix to be unique, see ?make.names
#' @return Count matrix with genes and barcodes
#' @export
Read10Xh5 <- function (filename, use.names = TRUE, unique = TRUE)
{
if (!requireNamespace("hdf5r", quietly = TRUE)) {
stop("Please install hdf5r to read HDF5 files")
}
if (!file.exists(filename)) {
stop("File not found")
}
infile <- hdf5r::H5File$new(filename)
genomes <- names(infile)
output <- list()
flag = !infile$attr_exists("PYTABLES_FORMAT_VERSION")
if (flag) {
if (use.names) {
feature_slot <- "features/name"
}
else {
feature_slot <- "features/id"
}
} else {
if (use.names) {
feature_slot <- "gene_names"
feature_slot2 = "genes"
}
else {
feature_slot = "genes"
}
}
for (genome in genomes) {
counts <- infile[[paste0(genome, "/data")]]
indices <- infile[[paste0(genome, "/indices")]]
indptr <- infile[[paste0(genome, "/indptr")]]
shp <- infile[[paste0(genome, "/shape")]]
features <- infile[[paste0(genome, "/", feature_slot)]][]
if (!flag)
features2 <- infile[[paste0(genome, "/", feature_slot2)]][]
barcodes <- infile[[paste0(genome, "/barcodes")]]
sparse.mat <- Matrix::sparseMatrix(i = indices[] + 1, p = indptr[],
x = as.numeric(counts[]), dims = shp[], giveCsparse = FALSE)
if (!flag) {
rownames(sparse.mat) <- make.names(paste(features, features2, sep = "_._"), unique = unique)
} else {
rownames(sparse.mat) <- make.names(features, unique = unique)
}
colnames(sparse.mat) <- barcodes[]
sparse.mat <- as(object = sparse.mat, Class = "dgCMatrix")
output[[genome]] <- sparse.mat
}
infile$close_all()
if (length(output) == 1) {
return(output[[genome]])
}
else {
return(output)
}
}
#' Load decont count matrix from an h5 file
#'
#' @param filename Directory and filename of the h5 file
#' @param use.names Boolean TRUE, see ?Seurat::Read10X_h5
#' @return Count matrix with genes and barcodes
#' @export
ReadDeconth5 <- function (filename, use.names = TRUE)
{
if (!requireNamespace("hdf5r", quietly = TRUE)) {
stop("Please install hdf5r to read HDF5 files")
}
if (!file.exists(filename)) {
stop("File not found")
}
infile <- hdf5r::H5File$new(filename)
genomes <- names(infile)
output <- list()
if (!infile$attr_exists("PYTABLES_FORMAT_VERSION")) {
if (use.names) {
feature_slot <- "features/name"
}
else {
feature_slot <- "features/id"
}
} else {
if (use.names) {
feature_slot <- "gene_names"
feature_slot2 = "genes"
}
else {
feature_slot = "genes"
}
}
for (genome in genomes) {
counts <- infile[[paste0(genome, "/data")]]
indices <- infile[[paste0(genome, "/indices")]]
indptr <- infile[[paste0(genome, "/indptr")]]
shp <- infile[[paste0(genome, "/shape")]]
features <- infile[[paste0(genome, "/", "gene_names")]][]
features2 <- infile[[paste0(genome, "/", "genes")]][]
barcodes <- infile[[paste0(genome, "/barcodes")]]
sparse.mat <- Matrix::sparseMatrix(i = indices[] + 1, p = indptr[],
x = as.numeric(counts[]), dims = shp[], giveCsparse = FALSE)
rownames(sparse.mat) <- paste(features, features2, sep = "_._")
colnames(sparse.mat) <- barcodes[]
sparse.mat <- as(object = sparse.mat, Class = "dgCMatrix")
output[[genome]] <- sparse.mat
}
infile$close_all()
if (length(output) == 1) {
return(output[[genome]])
}
else {
return(output)
}
}
#' Load count matrix from a tsv (or tsv.gz) file
#'
#' @param counts.file Filename of the counts tsv (or tsv.gz) file
#' @param meta.file Filename of the metadata tsv (or tsv.gz) file
#' @return Count matrix with genes and barcodes
#' @export
ReadCelseq <- function (counts.file, meta.file)
{
E = suppressWarnings(readr::read_tsv(counts.file));
gns <- E$gene;
E = E[,-1]
M = suppressWarnings(readr::read_tsv(meta.file))
S = lapply(unique(M$sample), function(x) Matrix::Matrix(as.matrix(E[,M$sample == x]), sparse = TRUE))
S = lapply(S, function(x) as(x, Class = "dgCMatrix"))
names(S) <- unique(M$sample)
#rownames(S[[1]]) <- gns
S = lapply(S, function(x){
rownames(x) <- gns
x
})
S
}
#' Load Indrop count matrix from "matrix.txt" file
#'
#' @param filename Directory and filename of the "matrix.txt" file
#' @return Count matrix with genes and barcodes
#' @export
ReadIndrop <- function (filename)
{
q = utils::read.delim(filename, stringsAsFactors = FALSE)
barcodes = q$Name
genes = colnames(q)[-c(1,2,3,4,5)]
matrix = Matrix::Matrix(t(as.matrix(q[,-c(1,2,3,4,5)])), sparse = TRUE)
matrix = as(matrix, "dgTMatrix")
rownames(matrix) <- genes
colnames(matrix) <- barcodes
return(matrix)
}
#' Load folder of Indrop count matrices from "matrix.txt" file
#'
#' @param filedir Directory where each folder contains a "matrix.txt" file
#' @return Count matrix with genes and barcodes
#' @export
ReadIndropFolder <- function (filedir)
{
filedir = gsub("\\/$", "", filedir, perl = TRUE);
dirs = list.files(filedir, full.names = TRUE)
fns = list.files(dirs, full.names = TRUE)
q = lapply(fns, function(x) utils::read.delim(x, stringsAsFactors = FALSE))
barcodes = lapply(q, function(x) x$Name)
genes = lapply(q, function(x) colnames(x)[-c(1,2,3,4,5)])
matrix = lapply(q, function(x) Matrix::Matrix(t(as.matrix(x[,-c(1,2,3,4,5)])), sparse = TRUE))
matrix = lapply(matrix, function(x) as(x, "dgTMatrix"))
matrix = mapply(function(x,y) {
colnames(x) <- y
x
}, x = matrix, y = barcodes)
names(matrix) <- list.files(filedir)
return(matrix)
}
#' Load decont count matrix from an h5 file in each sub-folder
#'
#' @param filedir Directory where each folder contains filename fn
#' @param fn File name of each .h5 file. By default, fn = "raw_gene_bc_matrices_h5.h5"
#' @return Count matrix with genes and barcodes
#' @seealso Read10Xh5_v2, ReadCelseq
#' @export
ReadDeconth5Folder <- function (filedir, fn = "count_matrix.h5")
{
filedir = gsub("\\/$", "", filedir, perl = TRUE);
fls = list.files(filedir, full.names = TRUE)
fls = list.files(fls, full.names = TRUE)
fls = fls[grepl(fn, fls)]
if (length(fls) == 0)
{
warning(fn, "not detected in any folder. Perhaps you can check that the file exists? \n")
stop()
}
cat(" .......... Entry in ReadDeconth5_folder: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(fls), "\n");
E = lapply(fls, function(x) ReadDeconth5(x))
names(E) <- list.files(filedir)[grepl(fn, fls)]
cat(" Number of genes:", sapply(E, nrow)[1], "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
tb = proc.time()[3] - ta;
cat(" .......... Exit Read10X_h5_folder \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(E)
}
#' Load 10X count matrix from an h5 file in each sub-folder
#'
#' @param filedir Directory where each folder contains filename fn
#' @param fn File name of each .h5 file. By default, fn = "raw_gene_bc_matrices_h5.h5"
#' @return Count matrix with genes and barcodes
#' @seealso Read10Xh5_v2, ReadCelseq
#' @export
Read10Xh5Folder <- function (filedir, fn = "raw_gene_bc_matrices_h5.h5")
{
filedir = gsub("\\/$", "", filedir, perl = TRUE);
fls = list.files(filedir, full.names = TRUE)
fls = list.files(fls, full.names = TRUE)
fls = fls[grepl(fn, fls)]
if (length(fls) == 0)
{
warning(fn, "not detected in any folder. Perhaps you can check that the file exists? \n")
stop()
}
cat(" .......... Entry in Read10X_h5_folder: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(fls), "\n");
E = lapply(fls, function(x) Read10Xh5(x))
names(E) <- list.files(filedir)[grepl(fn, fls)]
cat(" Number of genes:", sapply(E, nrow)[1], "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
tb = proc.time()[3] - ta;
cat(" .......... Exit Read10X_h5_folder \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(E)
}
#' Filter barcodes
#'
#' @param E Expression matrix or a list of expression matrices
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param keep If TRUE, all genes are kept for total counts and genes detected calculation. If FALSE, total counts and genes detected are caculated after discarding mitochondrial and ribosomal genes.
#' @return Filtered expression matrices and barcodes
#' @export
FilterBarcodes <- function (E, min_genes_before = 0, min_umi_before = 0, keep = T)
{
if (! class(E) %in% "list")
E = list(E)
cat(" .......... Entry in FilterBarcodes: \n");
ta = proc.time()[3];
cat(" Total number of barcodes before filtering:", sum(sapply(E, ncol)), "\n");
# keep only cells with > min_genes genes detected
if (keep) {
E = lapply(E, function(x) x[,(Matrix::colSums(x != 0) > min_genes_before) & (Matrix::colSums(x) > min_umi_before)])
} else {
crap = grepl("^RPS", rownames(E[[1]])) | grepl("^RPL", rownames(E[[1]])) | grepl("^RP[0-9]", rownames(E[[1]])) | grepl("^MT-", rownames(E[[1]]))
E = lapply(E, function(x) x[,(Matrix::colSums(x[!crap,] != 0) > min_genes_before) & (Matrix::colSums(x[!crap,]) > min_umi_before)])
}
cat(" Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
tb = proc.time()[3] - ta;
cat(" .......... Exit FilterBarcodes \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(E)
}
#' Main function
#'
#' @param E Expression matrix or a list of expression matrices
#' @param B Background matrix or a list of background matrices (see ?GetBackground)
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_genes_after All barcodes with less than min_genes_after detected will be removed after background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param min_umi_after All barcodes with less than min_umi_after will be removed after background correction.
#' @param do.par If TRUE, regressions will be done in parallel
#' @param num.cores Set number of cores for parallel computation. By default, all available cores - 1 will be used.
#' @param large if TRUE, dataset is chunked prior to regression to prevent any memory errors.
#' @param num.chunks if large, the dataset is chunked into num.chunks, and then regressed in parallel.
#' @return Background corrected expression matrices
#' @export
Decontaminate <- function(E, B, min_genes_before = 0, min_genes_after = 0, min_umi_before = 0, min_umi_after = 0, do.par = FALSE, num.cores = NULL, large = FALSE, num.chunks = 20)
{
if (! class(E) %in% "list")
E = list(E)
if (! class(B) %in% "list")
B = list(B)
cat(" .......... Entry in Decontaminate: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(E), "\n");
cat(" Number of genes:", sapply(E, nrow)[1], "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
# keep only cells with > min_genes genes detected
crap = grepl("RPS[0-9]", rownames(E[[1]]), ignore.case = T) | grepl("RPL[0-9]", rownames(E[[1]]), ignore.case = T) | grepl("^MT-", rownames(E[[1]]), ignore.case = T) | grepl("^RP[0-9]", rownames(E[[1]]), ignore.case = T)
# keep only cells with > min_umi_before detected
E = lapply(E, function(x) x[,(Matrix::colSums(x[!crap,]) > min_umi_before) & (Matrix::colSums(x[!crap,] != 0) > min_genes_before)])
if (min_genes_before != 0 | min_umi_before != 0)
cat(" Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
if (large)
E = lapply(E, function(x) array_split(x, num.chunks))
cat(" .......... Running regressions with RcppArmadillo (this may take a minute)... \n");
if (!do.par)
num.cores = 1
if (do.par & is.null(num.cores))
num.cores = parallel::detectCores() - 1
if (!large)
{
Q = parallel::mcmapply(function(x,y) {
if(sum(y) == 0)
return(x)
q = Matrix::Matrix(apply(x,2, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
}), sparse = TRUE)
q[q < 1] <- 0
q}
,x = E, y = B, SIMPLIFY = FALSE, mc.cores = num.cores)
} else {
Q = parallel::mclapply(E, function(X) {mapply(function(x,y) {
if(sum(y) == 0)
return(x)
q = Matrix::Matrix(apply(x,2, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
}), sparse = TRUE)
q[q < 1] <- 0
q
}
,x = X, y = B, SIMPLIFY = FALSE)}, mc.cores = num.cores)
}
if (large)
Q = list(do.call(cbind, lapply(Q, function(x) do.call(cbind, x))))
# round Q to be integers
Q = lapply(Q, round)
cat(" .......... Regressions completed successfully! \n");
Q[which(sapply(B,sum) != 0)] = lapply(Q[which(sapply(B,sum) != 0)], function(x) {
x[,(Matrix::colSums(x[!crap,]) > min_umi_after) & (Matrix::colSums(x[!crap,] != 0) > min_genes_after)]
})
cat(" Total number of decontaminated barcodes with genes detected > ", min_genes_after, "and total counts >", min_umi_after, ":", sum(sapply(Q, ncol)), "\n");
Q = lapply(Q, function(x){rownames(x) <- rownames(E[[1]])
x})
tb = proc.time()[3] - ta;
cat(" .......... Exit Decontaminate \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(Q)
}
#' Regress MT percentage
#'
#' @param E Expression matrix or a list of expression matrices
#' @param M Mitochondrial gene percentage vector or a list of these vector (see ?GetMito)
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_genes_after All barcodes with less than min_genes_after detected will be removed after background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param do.par If TRUE, regressions will be done in parallel
#' @param num.cores Set number of cores for parallel computation. By default, all available cores - 1 will be used.
#' @param large if TRUE, dataset is chunked prior to regression to prevent memory errors.
#' @param num.chunks if large, the dataset is chunked into num.chunks, and then regressed in parallel.
#' @return Background corrected expression matrices
#' @export
RegressMito <- function(E, M, min_genes_before = 0, min_genes_after = 0, min_umi_before = 0, do.par = FALSE, num.cores = NULL, large = FALSE, num.chunks = 20)
{
if (! class(E) %in% "list")
E = list(E)
if (! class(M) %in% "list")
M = list(M)
cat(" .......... Entry in RegressMito: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(E), "\n");
cat(" Number of genes:", sapply(E, nrow)[1], "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
# keep only cells with > min_genes genes detected
crap = grepl("^RPS", rownames(E[[1]])) | grepl("^RPL", rownames(E[[1]])) | grepl("^RP1", rownames(E[[1]])) | grepl("^RP2", rownames(E[[1]])) | grepl("^RP3", rownames(E[[1]])) | grepl("^RP4", rownames(E[[1]])) | grepl("^RP5", rownames(E[[1]])) | grepl("^RP6", rownames(E[[1]])) | grepl("^RP7",rownames(E[[1]])) | grepl("^RP8", rownames(E[[1]])) | grepl("^RP9", rownames(E[[1]])) | grepl("^MT-", rownames(E[[1]]))
E = lapply(E, function(x) x[,Matrix::colSums(x[!crap,] != 0) > min_genes_before])
# keep only cells with > min_umi_before detected
E = lapply(E, function(x) x[,Matrix::colSums(x[!crap,]) > min_umi_before])
if (min_genes_before != 0 | min_umi_before != 0)
cat(" Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
if (large)
E = lapply(E, function(x) array_split(x, num.chunks))
cat(" .......... Running regressions with RcppArmadillo (this may take a minute)... \n");
if (!do.par)
num.cores = 1
if (do.par & is.null(num.cores))
num.cores = parallel::detectCores() - 1
if (!large)
{
Q = parallel::mcmapply(function(x,y) {
if(sum(y) == 0)
return(x)
q = Matrix::Matrix(apply(x,1, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
}), sparse = TRUE)
q[q < 1] <- 0
q}
,x = E, y = M, SIMPLIFY = FALSE, mc.cores = num.cores)
} else {
Q = parallel::mclapply(E, function(X) {mapply(function(x,y) {
if(sum(y) == 0)
return(x)
q = Matrix::Matrix(apply(x,1, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
}), sparse = TRUE)
q[q < 1] <- 0
q
}
,x = X, y = M, SIMPLIFY = FALSE)}, mc.cores = num.cores)
}
if (large)
Q = list(do.call(cbind, lapply(Q, function(x) do.call(cbind, x))))
# transpose
Q = lapply(Q, Matrix::t)
cat(" .......... Regressions completed successfully! \n");
Q[which(sapply(M,sum) != 0)] = lapply(Q[which(sapply(M,sum) != 0)], function(x) {
x[,Matrix::colSums(x != 0) >= min_genes_after]
})
cat(" Total number of regressed barcodes with genes detected >= ", min_genes_after, ":", sum(sapply(Q, ncol)), "\n");
rownames(Q[[1]]) <- rownames(E[[1]])
Q = mapply(function(x,y){
colnames(x) <- colnames(y)
x
}, x = Q, y = E)
tb = proc.time()[3] - ta;
cat(" .......... Exit RegressMito \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(Q)
}
#' Get background matrix
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @return Background expression matrices
#' @export
GetBackgroundMatrix <- function(E, Cutoffs)
{
if (! class(E) %in% "list")
E = list(E)
if (NROW(Cutoffs) != 1)
{
B = mapply(function(x,y) {
if (!is.na(y$min_umi)) {
idx = Matrix::colSums(x) >= y$min_umi & Matrix::colSums(x != 0) <= y$min_genes;
x[,idx]
} else {
return(Matrix::Matrix(0, nrow(E[[1]]), 1, sparse = TRUE))
}
}, x = E, y = Cutoffs, SIMPLIFY = F)
} else {
B = lapply(E, function(x) {idx = Matrix::colSums(x) >= Cutoffs[[1]]$min_umi & Matrix::colSums(x != 0) <= Cutoffs[[1]]$min_genes;
x[,idx]})
}
B[sapply(B, class) == "numeric"] = lapply(B[sapply(B, class) == "numeric"], function(x) {
Matrix::Matrix(cbind(x,x))
})
return(B)
}
#' Get background distribution
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @return Background distributions
#' @export
GetBackground <- function(E, Cutoffs = NA)
{
if (! class(E) %in% "list")
E = list(E)
cat(" .......... Entry in GetBackground: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(E), "\n");
cat(" Number of genes:", nrow(E[[1]]), "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
if (is.na(Cutoffs[1]))
Cutoffs = GetCutoffs(E, verbose = F)
if (is.data.frame(Cutoffs))
Cutoffs = list(Cutoffs)
# Get background matrix
B = GetBackgroundMatrix(E = E, Cutoffs = Cutoffs)
# Define contaminated droplets
B = lapply(B, function(x){
if (!sum(x)) {
return(Matrix::Matrix(0, nrow(E[[1]]), 1, sparse = TRUE))
} else if (is.null(ncol(x)))
{
return(x)
} else {
apply(x, 1, max)
}})
if (0 %in% sapply(B, sum))
{
cat(" .......... Friendly message from GetBackground! No background detected for samples: \n");
cat(" ", paste(names(E)[which(sapply(B,sum) == 0)],collapse = ", "))
cat("\n")
}
tb = proc.time()[3] - ta;
cat(" .......... Exit GetBackground \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(B)
}
#' Get background distribution
#'
#' @param B Expression matrix or a list of expression matrices as returned by ?GetBackgroundMatrix
#' @return Background distributions
#' @export
GetBackgroundFromMatrix <- function(B)
{
if (! class(B) %in% "list")
B = list(B)
# Define contaminated droplets
B = lapply(B, function(x){
if (!sum(x)) {
return(Matrix::Matrix(0, nrow(B[[1]]), 1, sparse = TRUE))
} else if (is.null(ncol(x)))
{
return(x)
} else {
apply(x, 1, max)
}})
return(B)
}
#' Inspect background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @return a correlogram of the Pearson correlation coefficients
#' @export
InspectBackgroundMatrix <- function(B)
{
if (class(B) != "list")
B = list(B)
if (is.null(names(B)))
names(B) <- paste("Sample", seq_along(1:length(B)))
B = B[! 0 %in% sapply(B, sum)]
B = B[! "numeric" %in% sapply(B, class)]
# remove all zero expressed genes
B = lapply(B, function(x){
if (class(x) == "numeric")
x = Matrix::Matrix(cbind(x,x))
x[Matrix::rowSums(x) != 0,]
})
# generate correlation matrix
q = lapply(B, function(x) {
cor_mat = stats::cor(as.matrix(x))
colnames(cor_mat) <- paste("CB", 1:ncol(cor_mat))
rownames(cor_mat) <-paste("CB", 1:ncol(cor_mat))
corrplot::corrplot(cor_mat,
addgrid.col = NA,
tl.col='black',
cl.cex=1,
tl.srt = 60,
cl.align = "l",
tl.cex = 1.5,
cl.ratio = 0.2,
cl.length = 5)
})
return(q)
}
#' Filter background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param cutoff a value between 0 and 1 that filters based on the average Pearson correlation coefficient. Default is 0.75.
#' @return a filtered background matrix
#' @export
FilterBackgroundMatrix <- function(B, cutoff = 0.75)
{
if (class(B) != "list")
B = list(B)
if (is.null(names(B)))
names(B) <- paste("Sample", seq_along(1:length(B)))
# remove all zero expressed genes
D = lapply(B, function(x){
x[Matrix::rowSums(x) != 0,]
})
# generate correlation matrix
q = lapply(D, function(x) {
cor_mat = stats::cor(as.matrix(x))
})
# get average correlation coefficient
q = mapply(function(x,y) {
kmu = colMeans(x)
y[,kmu > cutoff]
}, x = q, y= B, SIMPLIFY = F)
return(q)
}
#' Inspect background genes in background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param num.genes number of genes to display. Default is 10.
#' @return a barplot of the percentage of cells expressing genes
#' @export
InspectBackgroundGenes <- function(B, num.genes = 10)
{
if (class(B) != "list")
B = list(B)
if (is.null(names(B)))
names(B) <- paste("Sample", seq_along(1:length(B)))
B = B[! 0 %in% sapply(B, sum)]
# generate plots
q = lapply(B, function(x) {
kmu = Matrix::rowSums(x != 0)
xx = factor(names(kmu)[order(kmu, decreasing = T)], levels = names(kmu)[order(kmu, decreasing = T)])
df = data.frame(Genes = xx,
Percent = kmu[order(kmu, decreasing = T)] / ncol(x))
df = df[1:num.genes,]
ggplot2::ggplot(data=df, ggplot2::aes(x=Genes, y=round(Percent*100))) +
ggplot2::geom_bar(stat="identity", fill="steelblue")+
ggplot2::theme_minimal() + ggplot2::xlab("Genes") + ggplot2::ylab("Percent expressed") + ggplot2::coord_flip() + ggplot2::ggtitle("Background expression profile")
})
return(q)
}
#' Inspect background distribution
#'
#' @param B Background expression, as returned by GetBackground
#' @return list of the gene names in the empty droplets for each sample with background.
#' @export
InspectBackground <- function(B)
{
if (class(B) != "list")
B = list(B)
if (is.null(names(B)))
names(B) <- paste("Sample", seq_along(1:length(B)))
B = B[! 0 %in% sapply(B, sum)]
q = lapply(B, function(x) {
names(x)[x != 0]
})
return(q)
}
#' Hierarchical cluster the background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param k Parameter for tree cutting, see ?cutree
#' @return group membership for each background cell
#' @export
HClustBackgroundMatrix <- function(B, k = 2)
{
if (class(B) != "list")
B = list(B)
if (is.null(names(B)))
names(B) <- paste("Sample", seq_along(1:length(B)))
B = B[! 0 %in% sapply(B, sum)]
# remove all zero expressed genes
B = lapply(B, function(x){
x[Matrix::rowSums(x) != 0,]
})
# generate correlation matrix, perform hierarchical clustering
q = lapply(B, function(x) {
cor_mat = stats::cor(as.matrix(x))
stats::hclust(stats::as.dist(1 - cor_mat))
})
# return cluster membership
hc = lapply(q, function(x){
stats::cutree(x, k = k)
})
return(hc)
}
#' Inspect the shannon entropy of edges
#'
#' @param G An adjacency matrix (cell by cell) for the nearest neighbors in the dataset
#' @param S A vector of labels (cells) for the samples of the dataset
#' @return list of the gene names in the empty droplets for each sample with background.
#' @export
BatchProblemPlot <- function(G, S)
{
# set diagonal to zero
diag(G) <- 0
m = Matrix::Matrix(0, nrow = length(S), ncol = length(unique(S)), sparse = T)
m[cbind(1:nrow(m), as.numeric(factor(S)))] <- 1
# compute neighbor graph
res = G %*% m
res = res / Matrix::rowSums(res)
N_unique = length(unique(S))
# compute normalized Shannon entropy
entropy = apply(res, 1, function(freqs) {-sum(freqs[freqs != 0] * log(freqs[freqs != 0])) / log(2) / log2(N_unique)})
# t-test
df = data.frame(Sample = S, entropy = entropy)
logik = sapply(unique(S), function(x) { stats::t.test(df$entropy[df$Sample == x], df$entropy[df$Sample != x], alternative = 'less')$p.value}) < 0.01
if (any(logik)) {
batch_problems = paste("Batch effects:", paste(as.character(unique(df$Sample)[logik]), collapse = ", "))
} else {
batch_problems = 'No batch effects present in these data'
}
# plot results
q = ggplot2::ggplot(df, ggplot2::aes(x=Sample, y=entropy, fill = Sample)) + ggplot2::geom_boxplot() + ggplot2::xlab('Samples') + ggplot2::ylab('Normalized Shannon entropy') + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=45, hjust=1)) + ggplot2::ggtitle(batch_problems)
return(q)
}
#' Cells counts plot
#'
#' @param E A list of expression matrices
#' @return labels ordered by number of cells
#' @export
GenerateLbls <- function(E)
{
# generate sample vector
X = unlist(mapply(function(x, y) {
rep(x, y)
}, x = names(E), y = sapply(E, ncol)))
NumberOfCells = unlist(sapply(E, ncol))
names(E)[order(NumberOfCells, decreasing = F)]
}
#' Cells counts plot
#'
#' @param E A list of expression matrices
#' @param type R
#' @param lbls factor with levels of the desired order, see? GenerateLbls
#' @return barplot of the number of cells or percentage of cells
#' @export
CellBarPlot <- function(E, type = c("Count", "Percentage")[1], lbls)
{
# generate sample vector
X = unlist(mapply(function(x, y) {
rep(x, y)
}, x = names(E), y = sapply(E, ncol)))
AvgDepth = unlist(sapply(E, function(x) mean(Matrix::colSums(x))))
# first get data table
d = data.frame(table(X))
# compute fraction for each disease state
d$Frac= d$Freq/sum(d$Freq)
colnames(d) <- c("Sample", "Count", "Frac")
d$Sample = factor(d$Sample, levels = lbls)
if (type == "Percentage"){
p = ggplot2::ggplot(d, ggplot2::aes(x=Sample, y=Frac*100, fill=Sample)) +
ggplot2::geom_bar(stat = "identity", color = "black",
position = ggplot2::position_dodge()) + ggplot2::coord_flip() +
ggplot2::xlab("Samples") + ggplot2::ylab("Percentage (%)") + ggplot2::ggtitle("Abundance") + ggplot2::theme(legend.position = "none")
}
if (type == "Count"){
p = ggplot2::ggplot(d, ggplot2::aes(x=Sample, y=Count, fill=Sample)) +
ggplot2::geom_bar(stat = "identity", color = "black",
position = ggplot2::position_dodge()) + ggplot2::coord_flip() +
ggplot2::ylab("Number of cells") + ggplot2::xlab("Samples") + ggplot2::ggtitle("Abundance plot") + ggplot2::theme(legend.position = "none")
}
return(p)
}
#' Get Mito Fraction
#'
#' @param E Expression matrix or a list of expression matrices
#' @param choose can be "all" or "Mito," where all includes ribosomal "RP" genes. Default is all.
#' @return Mitochondrial gene percentage per cell
#' @export
GetMito <- function(E, choose = "all")
{
if (! class(E) %in% "list")
E = list(E)
n = rownames(E[[1]])
if (choose == "all")
logik = grepl("^MT-", n) | grepl("^MT.", n) | grepl("^RPL", n) | grepl("^RPS", n) | grepl("^RP[0-9]", n)
if (choose == "Mito")
logik = grepl("^MT-", n) | grepl("^MT.", n)
MitoFrac = lapply(E, function(x){
(Matrix::colSums(x[logik,]) / Matrix::colSums(x)) * 100
})
return(MitoFrac)
}
#' Filter by Mito Fraction
#'
#' @param E Expression matrix or a list of expression matrices.
#' @param cutoff Threshold, only cells with less than cutoff will be kept. Default is 25.
#' @param choose Can be "all" or "Mito". "all" filters based on RP and MT genes, mito just MT genes.
#' @return Filtered expression matrix
#' @export
FilterMito <- function(E, cutoff = 25, choose)
{
cat(" .......... Entry in FilterMito: \n");
ta = proc.time()[3];
cat(" Total number of barcodes before filtering:", sum(sapply(E, ncol)), "\n");
fracs = GetMito(E, choose = choose)
E = mapply(function(x , y){
x[, y < cutoff]
}, x = E, y = fracs, SIMPLIFY = F)
cat(" Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
tb = proc.time()[3] - ta;
cat(" .......... Exit FilterMito \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
return(E)
}
#' Get suggested filter values
#'
#' @param E Expression matrix or a list of expression matrices, see ?Read_h5
#' @param verbose If false, hides output dialog
#' @param min_counts Minimum counts for outlier detection, default is 100.
#' @param max_counts Maximum counts for outlier detection, default is 1000.
#' @importFrom aplpack compute.bagplot
#' @return List where each element is a data frame with min_umi and min_genes_detected filters
#' @export
GetCutoffs <- function(E, verbose = T, min_counts = 100, max_counts = 1000)
{
if (! class(E) %in% "list")
E = list(E)
if (verbose)
{
cat(" .......... Entry in GetCutoffs: \n");
ta = proc.time()[3];
cat(" Number of count matrices:", length(E), "\n");
cat(" Number of genes:", nrow(E[[1]]), "\n");
cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
}
E = lapply(E, function(x) {x[,Matrix::colSums(x) > min_counts & Matrix::colSums(x) < max_counts]})
if (verbose)
cat(" Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
L = lapply(E, function(x){
if (is.null(colnames(x)))
colnames(x) <- 1:ncol(x)
logik = Matrix::colSums(x) > 0;
x = x[,logik]
total_counts = Matrix::colSums(x)
genes_detected = Matrix::colSums(x > 0)
p = suppressWarnings(suppressMessages(aplpack::compute.bagplot(cbind(log10(total_counts), genes_detected))))
idx = genes_detected < p$center[2] & log10(total_counts) > p$center[1] & colnames(x) %in% rownames(p$pxy.outlier)
if (sum(idx) > 1)
return(data.frame(min_umi = min(total_counts[idx]), min_genes = max(genes_detected[idx])))
if (sum(idx) == 1)
return(data.frame(min_umi = total_counts[idx], min_genes = genes_detected[idx]))
if (sum(idx) == 0)
return(data.frame(min_umi = NA, min_genes = NA))
})
if (verbose)
{
cat(" Total number of samples with background detected with bivariate statistics:", sum(sapply(L, function(x) !is.na(x$min_umi))), "\n");
tb = proc.time()[3] - ta;
cat(" .......... Exit GetCutoffs \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
}
return(L)
}
#' Split large data into smaller chunks
#'
#' @param E Count matrix (see ?Read_h5)
#' @param number_of_chunks Number of chunks to divide the data
#' @return List where each element is a chunk of the original matrix
#' @export
array_split <- function(E, number_of_chunks) {
if (number_of_chunks > (ncol(E))/2)
number_of_chunks = (ncol(E))/2
colIdx <- seq_len(ncol(E))
lapply(split(colIdx, cut(colIdx, pretty(colIdx, number_of_chunks))), function(x) E[, x ])
}
#' Plot genes detected vs. nUMI
#'
#' @param E Count matrix (see ?Read_h5)
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @param Barcodes A list as returned by ?FilterBackgroundMatrix or ?GetBackgroundMatrix
#' @param show.min_genes If TRUE, plot a line with the minimum gene cutoff
#' @param show.min_umi If TRUE, plot a line with the minimum nUMI cutoff
#' @param print.pdf If TRUE, returns "DepthPlots.pdf" in the working directory. Default is TRUE.
#' @param fn Filename of the .pdf file. Default is "DepthPlots.pdf".
#' @return PDF where each page is a plot of the genes detected vs. nUMI, or a list of ggplot objects.
#' @seealso DepthPlotComparison, GetCutoffs
#' @export
DepthPlot <- function(E, Cutoffs = NULL, Barcodes = NULL, show.min_genes = T, show.min_umi = T, print.pdf = T, fn = "DepthPlots.pdf")
{
pointfill <- "#4271AE"
if (class(E) != "list")
E = list(E)
if (is.null(names(E)))
names(E) <- paste("Sample", seq_along(1:length(E)))
q = mapply(function(z, nms){
df = data.frame(total_counts = Matrix::colSums(E[[z]]), genes_detected = Matrix::colSums(E[[z]] != 0));
if (!is.null(Cutoffs)){
min_genes = Cutoffs[[z]]$min_genes
min_umi = Cutoffs[[z]]$min_umi
} else {
min_genes = NA
min_umi = NA
}
if (!is.null(Barcodes))
{
df$cells = "Cells"
df$cells[rownames(df) %in% colnames(Barcodes[[which(names(Barcodes) == nms)]])] = "Soup"
}
if (is.na(min_genes) | is.na(min_umi) & is.null(Barcodes)){
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
} else if (is.na(min_genes) | is.na(min_umi)) {
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3, color = pointfill) + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::annotate("text", x = stats::median(df$total_counts), y = max(df$genes_detected), label = "No background detected") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
} else if (show.min_genes & show.min_umi ) {
df$cells = "Cells"
df$cells[(df$total_counts > min_umi) & (df$genes_detected < min_genes)] = "Soup"
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "cells")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
} else if (show.min_genes & show.min_umi & !is.null(Barcodes)) {
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "cells")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
}
else if (show.min_genes) {
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3, color = pointfill) + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
} else if (show.min_umi) {
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3) + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
}
}, z = 1:length(E), nms = names(E), SIMPLIFY = F)
names(q) <- names(E)
if (print.pdf)
{
grDevices::pdf(fn)
invisible(suppressWarnings(lapply(q, print)))
grDevices::dev.off()
return("Done!")
} else {
return(q)
}
}
#' Compare results before and after
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Z Background corrected expression matrices
#' @param print.pdf If TRUE, returns "DepthPlots.pdf" in the working directory. Default is TRUE.
#' @return Plot of genes detected vs. nUMI
#' @export
DepthPlotComparison <- function(E, Z, print.pdf = T)
{
if (class(E) != "list")
E = list(E)
if (class(Z) != "list")
Z = list(Z)
q = mapply(function(X, Y){
df = data.frame(state = c(rep("before",ncol(X)), rep("after", ncol(Y))),
total_counts = c(Matrix::colSums(X), Matrix::colSums(Y)),
genes_detected = c(Matrix::colSums(X != 0), Matrix::colSums(Y != 0)))
ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "state")) + ggplot2::geom_point(alpha = 0.2) + ggplot2::xlab("Transcripts per barcode") + ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x)))
}, X = E, Y = Z, SIMPLIFY = F)
if (print.pdf)
{
grDevices::pdf("DepthPlotComparison.pdf")
invisible(suppressWarnings(lapply(q, print)))
grDevices::dev.off()
return("Done!")
} else {
return(q)
}
}
#' Plot a boxplot for the fraction of mitochondrial rna
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "AvgDepth" or "Mito", default is AvgDepth.
#' @param choose See ?GetMito or ?FilterMito
#' @return ggplot objects of genes detected ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
MitoBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1], choose = "all")
{
Samples = unlist(mapply(function(x, y) {
rep(x, y)}, x = names(E), y = sapply(E, ncol)))
M = GetMito(E = E, choose = choose)
if (order.by == "average depth")
sortVar = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))
if (order.by == "genes detected")
sortVar = unlist(sapply(E, function(x) Matrix::colSums(x != 0)))
if (order.by == "mito percentage")
sortVar = unlist(sapply(M, function(x) mean(x)))
Samples = factor(Samples, levels = names(sortVar)[order(sortVar, decreasing = T)])
# wrangle data frame
df = data.frame(Samples = Samples, Mito = unlist(M))
# make ggplot
ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Mito, fill = Samples)) + ggplot2::geom_boxplot() + ggplot2::coord_flip() + ggplot2::ylab("Percentage") + ggplot2::ggtitle(paste("Mitochondrial percentage ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}
#' Plot a boxplot for genes detected
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "AvgDepth" or "Mito", default is AvgDepth.
#' @return ggplot objects of genes detected ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
CoverageBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1])
{
Samples = unlist(mapply(function(x, y) {
rep(x, y)}, x = names(E), y = sapply(E, ncol)))
Coverage = unlist(sapply(E, function(x) Matrix::colSums(x != 0)))
if (order.by == "average depth")
AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))
if (order.by == "mito percentage")
{
M = GetMito(E = E, choose = "all")
AvgDepth = unlist(sapply(M, function(x) mean(x)))
}
Samples = factor(Samples, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])
# wrangle data frame
df = data.frame(Samples = Samples, Coverage = Coverage)
# make ggplot
ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Coverage, fill = Samples)) + ggplot2::geom_boxplot() + ggplot2::coord_flip() + ggplot2::ylab("Genes detected") + ggplot2::ggtitle(paste("Genes detected ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}
#' Plot a boxplot for sequencing depth
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @return ggplot objects ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
DepthBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1])
{
Samples = unlist(mapply(function(x, y) {
rep(x, y)}, x = names(E), y = sapply(E, ncol)))
Depth = unlist(sapply(E, function(x) Matrix::colSums(x)))
if (order.by == "average depth")
AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))
if (order.by == "mito percentage")
{
M = GetMito(E = E, choose = "all")
AvgDepth = unlist(sapply(M, function(x) mean(x)))
}
Samples = factor(Samples, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])
# wrangle data frame
df = data.frame(Samples = Samples, Depth = log2(Depth))
# make ggplot
ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Depth, fill = Samples)) + ggplot2::geom_boxplot() + ggplot2::coord_flip() + ggplot2::ylab("Library size (log2)") + ggplot2::ggtitle(paste("Sequencing depth ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}
#' Plot a boxplot for a single gene
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @param gene Gene to boxplot
#' @return ggplot objects
#' @seealso DepthPlot; DepthPlotComparison; QCPlots
#' @export
GeneBoxPlot <- function(E, gene, order.by = c("average depth", "mito percentage")[1])
{
logik = grepl(gene, rownames(E[[1]]))
expr = log2(1 + unlist(sapply(E, function(x) x[logik,])))
# generate sample vector
X = unlist(mapply(function(x, y) {
rep(x, y)
}, x = names(E), y = sapply(E, ncol)))
if (order.by == "average depth")
AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))
if (order.by == "mito percentage")
{
M = GetMito(E = E, choose = "all")
AvgDepth = unlist(sapply(M, function(x) mean(x)))
}
X = factor(X, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])
df = data.frame(Sample = X, Expression = expr)
ggplot2::ggplot(df, ggplot2::aes(x = Sample, y=Expression, fill = Sample)) +
ggplot2::geom_boxplot() +
ggplot2::coord_flip() +
ggplot2::theme(legend.position = "none") +
ggplot2::ylab("Expression UMI (log2)") + ggplot2::ggtitle(paste(sub(pattern = "_._", replacement = "", x = gene), "order by", order.by))
}
#' Plot total genes and total counts histograms
#'
#' @param E Count matrix (see ?Read_h5)
#' @param print.pdf If TRUE, returns pdf files in the working directory. Default is TRUE; if FALSE, returns list of ggplot2 objects.
#' @param QC Type of QC plot, "Genes Detected", "Total Counts" or "Mito". Default is Genes Detected.
#' @param fn Filename of the .pdf file. Default is the input parameter QC saved as a .pdf file.
#' @param choose See ?FilterMito or ?GetMito
#' @param min_counts Keep cells with at least min_counts, default is 100.
#' @param min_genes Keep cells with at least min_genes, default is 100.
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @return PDF, histogram of total counts, or a list of ggplot objects.
#' @seealso DepthPlot; DepthPlotComparison
#' @export
QCPlots <- function(E, print.pdf = T, QC = c("Genes detected", "Total counts", "Mito")[1], order.by = c("average depth", "mito percentage")[1], fn = "default", choose = c("Mito", "all")[1], min_counts = 100, min_genes = 100)
{
E = lapply(E, function(x) { x[,(Matrix::colSums(x) > min_counts) & (Matrix::colSums(x != 0) > min_genes)] })
if (fn == "default")
fn = paste0(QC, ".pdf")
if (class(E) != "list")
E = list(E)
if (is.null(names(E)))
names(E) <- paste("Sample", seq_along(1:length(E)))
if (QC == "Mito")
E = GetMito(E, choose = choose)
barfill <- "#4271AE"
barlines <- "#1F3552"
q = mapply(function(z, nms){
if (QC != "Mito")
{
df = data.frame(total_counts = Matrix::colSums(z), genes_detected = Matrix::colSums(z != 0));
if (QC == "Total counts")
{
ggplot2::ggplot(df, ggplot2::aes(x=total_counts)) + ggplot2::geom_histogram(binwidth = 50,colour = barlines, fill = barfill) + ggplot2::xlab('Total counts (nUMI)') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
} else {
ggplot2::ggplot(df, ggplot2::aes(x=genes_detected)) + ggplot2::geom_histogram(binwidth = 50,colour = barlines, fill = barfill) + ggplot2::xlab('Genes detected') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
}
} else {
df = data.frame(mito = z)
if (choose == "Mito"){
ggplot2::ggplot(df, ggplot2::aes(x=mito)) + ggplot2::geom_histogram(binwidth = 5, colour = barlines, fill = barfill) + ggplot2::xlab('Mitochondrial percentage') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
} else {
ggplot2::ggplot(df, ggplot2::aes(x=mito)) + ggplot2::geom_histogram(binwidth = 5, colour = barlines, fill = barfill) + ggplot2::xlab('Mitochondrial and ribosomal percentage') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
}
}
}, z = E, nms = names(E), SIMPLIFY = F)
names(q) <- names(E)
if (print.pdf)
{
grDevices::pdf(fn)
invisible(suppressWarnings(lapply(q, print)))
grDevices::dev.off()
return("Done!")
} else {
return(q)
}
}
#' Write results to a folder
#'
#' @param D A list of count matrices
#' @param data.dir directory (will be created if it does not exist) where results are saved
#' @param genome default is GRCh38.
#' @importFrom methods as
#' @importClassesFrom Matrix dgCMatrix
#' @return matrix.h5 file, where each is background corrected
#' @export
#' @importFrom rhdf5 h5createFile h5createGroup h5write h5write.default
SaveCountsToH5 <- function(D, data.dir, genome = "GRCh38")
{
data.dir = gsub("\\/$", "", data.dir, perl = TRUE);
if (!dir.exists(data.dir))
dir.create(data.dir)
if (!"list" %in% class(D))
D = list(D)
cat(" .......... Entry in SaveCountsToH5: \n");
ta = proc.time()[3];
cat(" Number of cells:", sum(sapply(D, ncol)), "\n");
cat(" Number of genes:", nrow(D[[1]]), "\n");
cat(" Number of samples:", length(D), "\n");
if (is.null(names(D)))
names(D) <- paste0("Sample", seq_along(1:length(D)))
flag = rownames(D[[1]])[1] != gsub( "_.*$", "", rownames(D[[1]])[1])
if (flag)
{
genes = strsplit(x = rownames(D[[1]]), split = "_._")
gene_names = sapply(genes, `[[`, 1)
genes = sapply(genes, `[[`, 2)
}
D = lapply(D, function(x)
{
x@Dimnames[[1]] = rownames(D[[1]])
x
})
data.dirs = paste(data.dir, names(D), sep = "/")
q = lapply(data.dirs, function(x){
if (!dir.exists(x))
dir.create(x)})
d = suppressWarnings(
mapply(function(x,y){
fn = "count_matrix.h5"
fn = paste(y, fn, sep = "/")
rhdf5::h5createFile(fn)
rhdf5::h5createGroup(fn, genome)
rhdf5::h5write.default(x@Dimnames[[2]], file = fn, name = paste0(genome, "/barcodes"))
if (flag)
{
rhdf5::h5write.default(genes, file = fn, name=paste0(genome, "/genes"))
rhdf5::h5write.default(gene_names, file = fn, name=paste0(genome, "/gene_names"))
} else {
rhdf5::h5write.default(x@Dimnames[[1]], file = fn, name=paste0(genome, "/genes"))
rhdf5::h5write.default(x@Dimnames[[1]], file = fn, name=paste0(genome, "/gene_names"))
}
rhdf5::h5write.default(x@x, file = fn, name=paste0(genome, "/data"))
rhdf5::h5write.default(dim(x), file = fn, name=paste0(genome, "/shape"))
rhdf5::h5write.default(x@i, file = fn, name=paste0(genome, "/indices")) # already zero-indexed.
rhdf5::h5write.default(x@p, file = fn, name=paste0(genome, "/indptr"))
}, x = D, y = data.dirs)
)
rhdf5::h5closeAll()
tb = proc.time()[3] - ta;
cat(" .......... Exit SaveCountsToH5 \n");
cat(" Execution time = ", tb, " s.\n", sep = "");
}
#' Convert a list of count matrices to a Seurat object
#'
#' @param E a list of count matrices, named by sample names
#' @param filter.mito if true, populates the percent of mitochondrial RNA and removes those genes from the expression matrix. Default is TRUE.
#' @return Seurat object
#' @export
GenerateSeuratObject <- function(E, filter.mito = T)
{
# merge data into single expression matrix
if (class(E) != "list"){
E = list(E)
names(E) <- paste("Sample", 1:length(E))
}
# create a vector of sample identities
nms = unlist(mapply(function(x, y) {
rep(x, y)
}, x = names(E), y = sapply(E, ncol)))
E = do.call(cbind, E)
# create seurat object
pbmc <- Seurat::CreateSeuratObject(counts = E)
pbmc@meta.data$Samples = nms
# get mito percentage
if (filter.mito) {
mito.pct = GetMito(E, choose = "Mito")[[1]]
logik = grepl("^MT-", rownames(E)) | grepl("^MT.", rownames(E)) | grepl("^RPL", rownames(E)) | grepl("^RPS", rownames(E)) | grepl("^RP[0-9]", rownames(E))
E = E[!logik,]
E = E[Matrix::rowSums(E) != 0, ]
rownames(E) <- gsub(pattern = "_._ENSG", replacement = "$ENSG", x = rownames(E))
pbmc@meta.data$percent.mt = mito.pct
}
return(pbmc)
}
#' Save network h5 files
#'
#' @param A an adjacency matrix with row names and columns names genes (gene by gene)
#' @return a saved network file
#' @export
SaveNetworkH5 <- function(A)
{
# convert to sparse matrix
if (!"dgCMatrix" %in% class(A))
A = Matrix::Matrix(A, sparse = T)
# save HDF5 file
cat('Saving hdf5 file for fast network loading...')
file.h5 <- H5File$new("network.hdf5", mode="w")
file.h5$create_group("edges")
file.h5$create_group("nodes")
lapply(rownames(A), function(x){
edges_list = A[rownames(A) == x,]
nodes_list = which(edges_list != 0)
edges_list = edges_list[nodes_list]
file.h5[[paste0("edges/", x)]] <- edges_list
file.h5[[paste0("nodes/", x)]] <- nodes_list
cars_ds <- file.h5[[paste0("edges/", x)]]
cars_ds <- file.h5[[paste0("nodes/", x)]]
})
file.h5$create_group("ngene")
file.h5[["ngene/ngenes"]] <- nrow(A)
cars_ds <- file.h5[["ngene/ngenes"]]
file.h5$close_all()
}
#' Load gene from network h5 file
#'
#' @param gene A gene to query from the network
#' @param filename file name of network. Default is "network.hdf5"
#' @return a saved network file
#' @export
GetGeneFromNetwork <- function(gene, filename = "network.hdf5")
{
if (!requireNamespace("hdf5r", quietly = TRUE))
stop("Please install hdf5r to read HDF5 files")
if (!file.exists(filename))
stop("File not found")
infile = hdf5r::H5File$new(filename)
E = Matrix::Matrix(0, ncol = 1, nrow = infile[["ngene/ngenes"]][], sparse = T)
nodes <- infile[[paste("/nodes", gene, sep = "/")]][]
E[nodes] = infile[[paste("/edges", gene, sep = "/")]][]
return(E)
}
#' Jackknife Differential Expression Analysis
#'
#' @param E A gene by cell expression matrix
#' @param Samples A character vector of samples
#' @param Identities A character vector of cell type or cluster labels
#' @param Disease A character vector of disease labels
#' @param do.par if TRUE, code is run in parallel on all available cores minus one
#' @param num.cores optionally, user can hard set the number of cores to use
#' @return a DEG table Jackknifed
#' @export
JackD <- function(E, Samples, Disease, Identities, do.par = F, num.cores = 1)
{
cat(" .......... Entry in JackD: \n");
ta = proc.time()[3];
cat(" Number of cells:", ncol(E), "\n");
cat(" Number of genes:", nrow(E), "\n");
u = as.character(unique(Samples))
cat(" Number of samples:", length(u), "\n");
if (do.par)
num.cores = parallel::detectCores() - 1
colnames(E) <- 1:ncol(E)
outs = parallel::mclapply(u, function(x){
# dropout one sample
logik = Samples != x;
E_dummy = E[,logik]
Disease_dummy = Disease[logik]
Identities_dummy = Identities[logik]
# create Seurat object for each cell type; run differential expression
y = as.character(unique(Identities_dummy))
qq = lapply(y, function(z){
logik = Identities_dummy == z;
E. = E_dummy[,logik]
Disease. = Disease_dummy[logik]
Identities. = Identities_dummy[logik]
ctrl <- Seurat::CreateSeuratObject(counts = E.)
ctrl <- Seurat::NormalizeData(object = ctrl)
ctrl <- Seurat::AddMetaData(ctrl, metadata=Disease., col.name = "Disease")
ctrl <- Seurat::SetIdent(ctrl, value='Disease')
Seurat::FindAllMarkers(ctrl, only.pos = T)
})
names(qq) <- y
qq
}, mc.cores = num.cores)
return(outs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.