####################################
########## basic functions #########
####################################
#######################################################
#' custom_apply Function
#' @author Crhisto
#' @description Apply function for sparse matrix with the same parameter form that the native apply.
#' @name custom_apply
#' @param X matrix to apply the function (sparse matrix)
#' @param MARGIN 1 for rows, 2 for columns
#' @param FUN function for apply to the X matrix
#' @return Matrix resulted with the function applied to original X matrix
#' @import slam
#' @import Matrix
#' @export
custom_apply <- function (X, MARGIN, FUN)
{
x_value <- as.simple_triplet_matrix(X)
if(MARGIN==1){
return <- rowapply_simple_triplet_matrix(x_value, FUN=FUN)
}else if(MARGIN==2){
return <- colapply_simple_triplet_matrix(x_value, FUN=FUN)
}else{
stop('The custom_apply function needs a MARGIN value equal to 1=row, 2=column.')
}
return
}
#######################################################
#' Normalize matrix / vector
#' @description Normalize matrix by column or normalize a vector
#' @name getCPM0
#' @param x a matrix or a vector
#' @import Matrix
#' @export
getCPM0 <- function(x, verbose = F){
if (is.null(dim(x))){
if (verbose){
message("Normalizing a vector instead of a matrix")
}
vec = as.matrix(x/sum(x))
vec
} else {
#Using the custom_apply for sparse matrix
cpm <- Matrix::t(Matrix::t(x)/custom_apply(x,2,sum))
cpm
}
}
#######################################################
#' Get ExpressionSet
#' @description Use Pdata, Fdata, and count matrix to derive ExpressionSet Object
#' @name getESET
#' @import Biobase
#' @param exprs raw count matrix
#' @param fdata feature data, for genes, usually it's gene name
#' @param pdata pheno data, for samples, usually it's the characteristics for each single cell/bulk sample, including name, gender, age, cluster, disease,...
#' @import Biobase
#' @export
getESET <- function(exprs, fdata, pdata){
pdata <- as.data.frame(pdata)
fdata <- as.data.frame(fdata)
exprs <- Matrix(exprs, sparse = TRUE)
rownames(pdata) <- colnames(exprs)
rownames(fdata) <- rownames(exprs)
eset <- ExpressionSet(exprs,
AnnotatedDataFrame(pdata),
AnnotatedDataFrame(fdata))
}
######################################################
#' Construct Pseudo bulk samples
#' @description Construct Pseudo bulk samples by actual number of cells per subject
#' @name generateBulk_allcells
#' @param eset ExpressionSet object for single cells
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param disease indicate the health condition of subjects
#' @param ct.sub a subset of cell types that are selected to construct pseudo bulk samples. If NULL, then all cell types are used.
#' @return pseudo bulk samples ExpressionSet, and actual cell-type proportions
#' @import Biobase
#' @import xbioc
#' @import Matrix
#' @export
generateBulk_allcells <- function(eset, ct.varname, sample, disease = NULL, ct.sub = NULL){
if (is.null(ct.sub)){
ct.sub <- unique(eset@phenoData@data[,ct.varname])
}
eset <- eset[, eset@phenoData@data[,ct.varname] %in% ct.sub]
cluster.id <- eset@phenoData@data[,ct.varname]
sample.id <- eset@phenoData@data[,sample]
condition.id <- eset@phenoData@data[,disease]
## expression
pseudo.exprs <- sapply(unique(sample.id), function(sid){
y <- exprs(eset)[, sample.id %in% sid]
Matrix::rowSums(y, na.rm = T)
})
colnames(pseudo.exprs) <- unique(sample.id)
## true proportion: sample by cell types
ncount <- table(sample.id, cluster.id)
true.prop <- ncount / Matrix::rowSums(ncount, na.rm = T)
true.prop <- true.prop[complete.cases(true.prop),]
## eset for pseudo bulk sample
if (is.null(disease)){
pseudo.disease <- NA
} else {
pseudo.disease <- sapply(unique(sample.id), function(sid){
condition.id[sample.id == sid][1]
})
}
pseudo.pdata <- data.frame(sample = colnames(pseudo.exprs),
disease = pseudo.disease)
pseudo.fdata <- data.frame(genes = rownames(pseudo.exprs))
rownames(pseudo.fdata) <- rownames(pseudo.exprs)
pseudo_eset <- getESET(exprs = pseudo.exprs,
fdata = pseudo.fdata,
pdata = pseudo.pdata)
return(list(truep = true.prop, pseudo_eset = pseudo_eset))
}
######################################################
#' Construct Pseudo bulk samples -- random
#' @description Construct Pseudo bulk samples by random sampled number of cells per subject, but based on the actual numbers.
#' @name generateBulk_norep
#' @param eset ExpressionSet object for single cells
#' @param ct.varname variable name for 'cell types'
#' @param sample variable name for subject/samples
#' @param disease indicate the health condition of subjects
#' @param ct.sub a subset of cell types that are selected to construct pseudo bulk samples. If NULL, then all cell types are used.
#' @param prop_mat manually input the cell-type proportion for pseudo bulk samples
#' @param nbulk number of pseudo bulk samples to be constructed
#' @param samplewithRep logical, randomly sample single cells with replacement. Default is F.
#' @return pseudo bulk samples ExpressionSet, and actual cell-type proportions
#' @import Matrix
#' @export
generateBulk_norep <- function(eset, ct.varname, sample, disease = NULL, ct.sub,
prop_mat = NULL, nbulk=10, samplewithRep = F){
x.sub <- eset[,eset@phenoData@data[,ct.varname] %in% ct.sub]
# qc: remove non-zero genes
x.sub <- x.sub[Matrix::rowSums(exprs(x.sub)) > 0,]
# calculate sample mean & sample variance matrix: genes by cell types
ct.id <- droplevels(as.factor(x.sub@phenoData@data[,ct.varname]))
sample.id <- x.sub@phenoData@data[,sample]
pdatab <- x.sub@phenoData@data
if (! is.null(disease)){
disease <- x.sub@phenoData@data[,disease]
names(disease) <- sample.id
}
# number of cell type of interest
k <- length(unique(ct.id))
message(paste('Using',k,'cell types to generate pseudo bulk samples...'))
# select donors for each pseudo bulk sample
pseudo_donors <- sample(sample.id, nbulk, replace = T)
names(pseudo_donors) <- paste("bulk",1:nbulk, sep = "_")
# generate random matrix for true proportions
if (!is.null(prop_mat)){
true.p1 <- prop_mat # manually input proportion matrix...
colnames(true.p1) <- unique(ct.id)[order(unique(ct.id))]
rownames(true.p1) <- names(pseudo_donors)
message("Using input proportion matrix to create pseudo bulk samples...")
true.ct <- matrix(data = 0,ncol = k, nrow = nbulk) # true number of cells per cell type for each sample
colnames(true.ct) <- unique(ct.id)[order(unique(ct.id))]
rownames(true.ct) <- paste(pseudo_donors,1:nbulk,sep = "_")
# make sure if without replacement, number of cells matches the input prop mat...
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
} else {
true.p1 <- matrix(data = 0,ncol = k, nrow = nbulk)
colnames(true.p1) <- unique(ct.id)[order(unique(ct.id))]
rownames(true.p1) <- paste(pseudo_donors,1:nbulk,sep = "_")
true.ct <- matrix(data = 0,ncol = k, nrow = nbulk) # true number of cells per cell type for each sample
colnames(true.ct) <- unique(ct.id)[order(unique(ct.id))]
rownames(true.ct) <- paste(pseudo_donors,1:nbulk,sep = "_")
message("Generating random cell type proportions...")
}
# create pseudo bulk sample.id according to total available number of cells:
pseudo_bulk <- NULL
for(xx in 1:length(pseudo_donors)){ #length(pseudo_donors)
# xx = 1
message('generating bulk ', xx, ' from donor ', pseudo_donors[xx], '...')
idxd <- sample.id == pseudo_donors[xx] # for a selected donor
temp <- exprs(x.sub)[,idxd] # his expression matrix
temp.cluster <- ct.id[idxd] # cluster info for cells
# match names!!!!!!!!!!!!!!!!!
temp.ncellk <- table(factor(temp.cluster))
if (is.null(prop_mat)){ #if using random proportions
temp.nct <- ceiling(runif(length(temp.ncellk), min = 0.6, max = 1)*temp.ncellk) # take random number of available single cells
true.p1[xx,names(temp.nct)] <- temp.nct/sum(temp.nct) # true proportions
true.ct[xx,] <- temp.nct[colnames(true.ct)] # true number of cells in the pseudo bulk.
true.p1[is.na(true.p1)] <- 0
true.ct[is.na(true.ct)] <- 0
} else { #if using the user-defined proportions
temp.ntotal <- min(temp.ncellk / true.p1[xx,], na.rm = T)
if (temp.ntotal <= k){
message("Please check if your input prop_mat is reasonable. The number of cells of certain selected cell type might be too small.")
}
true.ct[xx,] <- round(temp.ntotal*true.p1[xx,names(temp.ncellk)] ) # true number of cells in the pseudo bulk.
true.p1[is.na(true.p1)] <- 0
true.ct[is.na(true.ct)] <- 0
}
temp.b1 <- sapply(ct.sub, function(ucluster){
temp.vec <- temp[,temp.cluster %in% ucluster] # for a specific cell type
if (is.null(dim(temp.vec))){
temp.sum <- rep(0, length(temp.vec))
} else if (! is.null(dim(temp.vec))) {
if (dim(temp.vec)[2] == 0){
temp.sum <- rep(0, dim(temp.vec)[1])
} else {
temp.sample <- sample(1:ncol(temp.vec), true.ct[xx, ucluster], replace = samplewithRep) # which cells in this cell type will be selected
temp.mat <- temp.vec[,temp.sample] # select all those cells
if (is.null(dim(temp.mat))){
temp.sum <- temp.mat
} else {
temp.sum <- Matrix::rowSums(temp.mat, na.rm = T) # expression sum for one cell type in this bulk, need to sum up all types.
}
}
}
})
out = Matrix::rowSums(temp.b1)
pseudo_bulk <- cbind(pseudo_bulk, out)
#convert to sparse matrix
pseudo_bulk <- Matrix(pseudo_bulk, sparse = TRUE)
colnames(pseudo_bulk)[xx] <- paste(pseudo_donors[xx],xx,sep = "_")
}
# create pseudo eset for bulk sample.id
if (! is.null(disease)){
pseudo_pdata <- data.frame(sample.id = colnames(pseudo_bulk), donors = pseudo_donors, disease = disease[pseudo_donors])
} else {
pseudo_pdata <- data.frame(sample.id = colnames(pseudo_bulk), donors = pseudo_donors)
}
rownames(pseudo_pdata) <- colnames(pseudo_bulk)
pseudo_fdata <- data.frame(labelDescription = rownames(pseudo_bulk),
row.names = rownames(pseudo_bulk))
message("generating expression set object for pseudo bulk sample.id...")
pseudo_eset <- ExpressionSet(pseudo_bulk,
AnnotatedDataFrame(pseudo_pdata),
AnnotatedDataFrame(pseudo_fdata))
pseudo_eset0 <- pseudo_eset[,Matrix::rowSums(!is.na(true.p1))>0] #non-zero eset
true.p0 <- true.p1[Matrix::rowSums(!is.na(true.p1))>0,]
true.ct0 <- true.ct[Matrix::rowSums(!is.na(true.p1))>0,]
return(list(true_p = true.p1, pseudo_bulk = pseudo_bulk, pseudo_eset = pseudo_eset,
num.real = true.ct, true_p0 = true.p0, true.ct0 = true.ct0, pseudo_eset0 = pseudo_eset0)) # , entropy = entropy
}
#' Grid search matrix
#' @description Generate Grid-search matrix
#' @name getSearchGrid
#' @param lengthby search length
#' @param nparam number of paramters/reference datasets
#' @export
getSearchGrid <- function(lengthby, nparam){
wlist <- list()
for(i in 1:nparam){
wlist[[i]] <- seq(0,1+lengthby,by = lengthby)
}
w1grid <- round(expand.grid(wlist),digits = 2) # without the round function, there's missing combinations
w1grid <- w1grid[round(Matrix::rowSums(w1grid), digits = 2)==1.00,]
colnames(w1grid) <- paste("w",1:nparam, sep = "")
return(w1grid)
}
###############################################
#' Evaluate deconvolved proportions
#' @description Evaluation function, for deconvolved proportions and the actual/true proportions
#' @name SCDC_peval
#' @param ptrue a matrix of true/actual cell-type proportions for bulk samples
#' @param pest a list of estimated cell-type proportion matrix
#' @param pest.names method name for the estimated proportions in the pest list
#' @param select.ct selected cell types for deconvolution
#' @export
SCDC_peval <- function(ptrue, pest, pest.names, select.ct = NULL){
if (!is.list(pest)){
pest <- list(pest)
}
if (!is.data.frame(ptrue)){
ptrue <- as.data.frame.matrix(ptrue)
}
n_est <- length(pest)
sample_names <- lapply(pest, rownames)
ctype_names <- lapply(pest, colnames)
sample_common <- Reduce(intersect, sample_names)
ctype_common <- Reduce(intersect, ctype_names)
celltype <- intersect(colnames(ptrue), ctype_common)
if (!is.null(select.ct)) {
celltype <- intersect(celltype, select.ct)
}
sample <- intersect(rownames(ptrue), sample_common)
N <- length(sample)
K <- length(celltype)
if (N < 1) {
stop("No common Subjects! Check rowname!")
}
if (K <= 1) {
stop("Not enough cell types!")
}
ptrue.use <- ptrue[intersect(rownames(ptrue), sample), intersect(colnames(ptrue), celltype)]
ptrue.use <- as.data.frame.matrix(ptrue.use / apply(ptrue.use,1,sum))
ptrue.use[is.na(ptrue.use)] <- 0
# for each estimation method in the list
evals <- lapply(pest, function(xx){
pest.use <- xx[intersect(rownames(xx), sample), intersect(colnames(xx), celltype)]
pest.use <- as.data.frame.matrix(pest.use / apply(pest.use,1,sum))
pest.use <- pest.use[rownames(ptrue.use),colnames(ptrue.use)]
RMSD_bysample <- round(sqrt(rowMeans((ptrue.use - pest.use)^2)), digits = 5)
mAD_bysample <- round(rowMeans(abs(ptrue.use - pest.use)), digits = 5)
Pearson_bysample <- sapply(1:nrow(ptrue.use), function(ss) {
round(cor(c(as.matrix(ptrue.use[ss, ])), c(as.matrix(pest.use[ss, ]))), digits = 5)
})
RMSD <- round(sqrt(mean(as.matrix((ptrue.use - pest.use)^2), na.rm = T)), digits = 5)
mAD <- round(mean(as.matrix(abs(ptrue.use - pest.use)), na.rm = T), digits = 5)
Pearson <- round(cor(c(as.matrix(ptrue.use)), c(as.matrix(pest.use))), digits = 4)
return(list(pest.use = pest.use, RMSD_bysample = RMSD_bysample, mAD_bysample = mAD_bysample, Pearson_bysample = Pearson_bysample,
RMSD = RMSD, mAD = mAD, Pearson = Pearson))
})
evals.table <- NULL
for (l in 1:n_est){
evals.table <- rbind(evals.table, c(evals[[l]]$RMSD, evals[[l]]$mAD, evals[[l]]$Pearson))
}
colnames(evals.table) <- c("RMSD","mAD","R")
rownames(evals.table) <- pest.names
# evals per sample
pearson.sample.table <- NULL
for (l in 1:n_est){
pearson.sample.table <- rbind(pearson.sample.table,evals[[l]]$Pearson_bysample)
}
rownames(pearson.sample.table) <- pest.names
colnames(pearson.sample.table) <- rownames(ptrue.use)
RMSD.sample.table <- NULL
for (l in 1:n_est){
RMSD.sample.table <- rbind(RMSD.sample.table,evals[[l]]$RMSD_bysample)
}
rownames(RMSD.sample.table) <- pest.names
colnames(RMSD.sample.table) <- rownames(ptrue.use)
mAD.sample.table <- NULL
for (l in 1:n_est){
mAD.sample.table <- rbind(mAD.sample.table,evals[[l]]$mAD_bysample)
}
rownames(mAD.sample.table) <- pest.names
colnames(mAD.sample.table) <- rownames(ptrue.use)
return(list(evals = evals, evals.table = evals.table, pearson.sample.table = pearson.sample.table,
RMSD.sample.table = RMSD.sample.table, mAD.sample.table = mAD.sample.table))
}
###############################################
#' Evaluate predicted gene expression levels
#' @description Evaluation function, for observed and predicted gene expression levels
#' @name SCDC_yeval
#' @param y observed raw counts of a bulk sample
#' @param yest a list of predicted gene expression levels
#' @param yest.names method name for the predicted gene expression levels in yest list
#' @export
SCDC_yeval <- function(y, yest, yest.names=NULL){
if (!is.list(yest)){
yest <- list(yest)
}
n_est <- length(yest)
# for each estimation method in the list
evals <- lapply(yest, function(xx){
if (!is.null(xx)){
if (dim(xx)[2] >1){
g.use = intersect(rownames(y), rownames(xx))
y.norm <- getCPM0(y[g.use,])
x = xx[g.use,colnames(y)]
yest.norm = getCPM0(x)
#workaround: convert the sparse matrix to matrix due to the size of the actual matrix.
yest.norm <- as.matrix(yest.norm)
y.norm <- as.matrix(y.norm)
spearmany <-round(cor(c(yest.norm), c(y.norm), method = "spearman"), digits = 5)
RMSDy = round(sqrt(mean((yest.norm - y.norm)^2)), digits = 7)
mADy = round(mean(abs(yest.norm - y.norm)), digits = 8)
spearmany_bysample <- sapply(1:ncol(x), function(ss) {round(cor(yest.norm[,ss],y.norm[,ss], method = "spearman"), digits = 4)})
RMSDy_bysample = round(sqrt(colMeans((yest.norm[,] - y.norm[,])^2)), digits = 6)
mADy_bysample = round(colMeans(abs(yest.norm - y.norm)), digits = 6)
} else { # one sample case
g.use = intersect(rownames(y), rownames(xx))
y.norm <- getCPM0(y[g.use,])
x = xx[g.use,colnames(y)]
yest.norm = getCPM0(x)
spearmany <-round(cor(c(yest.norm), c(y.norm), method = "spearman"), digits = 5)
RMSDy = round(sqrt(mean((yest.norm - y.norm)^2)), digits = 7)
mADy = round(mean(abs(yest.norm - y.norm)), digits = 8)
spearmany_bysample <- spearmany
RMSDy_bysample = RMSDy
mADy_bysample = mADy
}
} else if (is.null(xx)){
spearmany = NA
RMSDy = NA
mADy = NA
spearmany_bysample = NA
RMSDy_bysample = NA
mADy_bysample =NA
}
return(list(spearmany = spearmany, RMSDy = RMSDy, mADy = mADy, spearmany_bysample = spearmany_bysample,
RMSDy_bysample = RMSDy_bysample, mADy_bysample = mADy_bysample))
})
yevals.table <- NULL
for (l in 1:n_est){
yevals.table <- rbind(yevals.table, c(evals[[l]]$spearmany, evals[[l]]$RMSDy, evals[[l]]$mADy))
}
colnames(yevals.table) <- c("spearman_Y","RMSD_Y","mAD_Y")
rownames(yevals.table) <- yest.names
# evals per sample
spearmany.sample.table <- NULL
for (l in 1:n_est){
spearmany.sample.table <- rbind(spearmany.sample.table,evals[[l]]$spearmany_bysample)
}
rownames(spearmany.sample.table) <- yest.names
colnames(spearmany.sample.table) <- colnames(y)
RMSDy.sample.table <- NULL
for (l in 1:n_est){
RMSDy.sample.table <- rbind(RMSDy.sample.table,evals[[l]]$RMSDy_bysample)
}
rownames(RMSDy.sample.table) <- yest.names
colnames(RMSDy.sample.table) <- colnames(y)
mADy.sample.table <- NULL
for (l in 1:n_est){
mADy.sample.table <- rbind(mADy.sample.table,evals[[l]]$mADy_bysample)
}
rownames(mADy.sample.table) <- yest.names
colnames(mADy.sample.table) <- colnames(y)
return(list(evals = evals, yevals.table = yevals.table, spearmany.sample.table =spearmany.sample.table,
RMSDy.sample.table = RMSDy.sample.table, mADy.sample.table = mADy.sample.table))
}
###############################################
#' Calculate cell type proportions by suggested weights
#' @description Calculate proportions by linear combination of a list of proportions
#' @name wt_prop
#' @param wt a vector of weights for each reference dataset. should be of the same length as the proplist.
#' @param proplist a list of estimated proportions from each reference dataset
#' @export
wt_prop <- function(wt, proplist){
wt <- as.numeric(wt)
combo.list <- list()
for (i in 1:length(proplist)){
combo.list[[i]] <- proplist[[i]]*wt[i]
}
#Sum up each cluster value that was multiplied by the weight
combo.prop <- Reduce("+", combo.list)
return(combo.prop)
}
###############################################
#' Calculate cell type proportions by suggested weights
#' @description Calculate proportions by linear combination of a list of proportions
#' @name wt_y
#' @param wt a vector of weights for each reference dataset. Should be of the same length as the y.list.
#' @param y.list a matrix, with each column containing the vectorized estimated gene expressions from each reference dataset.
#' @export
wt_y <- function(wt, y.list = y.list){
wt <- as.numeric(wt)
combo.list <- list()
for (i in 1:ncol(y.list)){
combo.list[[i]] <- y.list[,i]*wt[i]
}
combo.y <- Reduce("+", combo.list)
return(combo.y)
}
###############################################
#' Create user-defined SCDC_prop object, as the SCDC_ENSEMBLE input
#' @description Create user-defined SCDC_prop object, as the SCDC_ENSEMBLE input
#' @name CreateSCDCpropObj
#' @param p the estimated proportion matrix. sample by cell type.
#' @param b the estimated signature/feature/basis matrix. gene by cell type.
#' @export
CreateSCDCpropObj <- function(p,b){
yhat <- b %*% t(p[,colnames(b)])
obj <- list(prop.est.mvw = p,
basis.mvw = b,
yhat = yhat)
return(obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.