############################
## pseudoBulk.R
## total column number smaller than 20 will be considered bulk
############################
pseudoPure <- function(accIDs,sN,cellMap,seqD,gCover=0.6){
if(is.null(names(accIDs))) names(accIDs) <- gsub("\\.rds","",basename(accIDs))
res <- BiocParallel::bplapply(setNames(names(accIDs),names(accIDs)),onePure,sN,cellMap,accIDs,seqD)
## filtering genes not common accross datasets for each cell types
resType <- base::lapply(names(res),function(x)return(unique(base::sapply(strsplit(colnames(res[[x]]),"\\|"),head,1))))
resGenes <- base::lapply(names(res),function(x)return(rownames(res[[x]])))
gNames <- base::lapply(unique(unlist(resType)),function(one){
ix <- base::sapply(resType,function(x)return(one%in%x))
gC <- table(unlist(resGenes[ix]))/sum(ix)
return(names(gC)[gC>gCover])
})
gNames <- unique(unlist(gNames))
message("gName in pseudoPure:",length(gNames))
bulk <- c()
genes <- list()
for(i in names(accIDs)){
message(i," in pseudoPure\n\treads ratio after removing the genes not detected in ")
genes[[i]] <- rownames(res[[i]])
totalReads <- apply(res[[i]],2,sum)
res[[i]] <- res[[i]][rownames(res[[i]])%in%gNames,]
cat("\t",apply(res[[i]],2,sum)/totalReads,"\n")
if(length(bulk)==0){
bulk <- res[[i]]
}else{
bulk <- merge(bulk,res[[i]],by="row.names",all=T,sort=F)
rownames(bulk) <- bulk[,1]
bulk <- bulk[,-1]
}
}
bulk[is.na(bulk)] <- 0
return(list(express=bulk,geneL=genes))
}
pseudoLogCPM <- function(X,normDep=1e6,grp=NULL,cutoffCPM=8,cutoffRatio=0.6){
## normalized to gene length
#X <- apply(X,2,function(x,gLen){return(x/gLen)},as.numeric(sapply(strsplit(rownames(X),"\\|"),tail,1)))
####
CPM <- apply(X,2,function(x){return(normDep/sum(x)*x)})
cName <- base::sapply(strsplit(colnames(CPM),"\\|"),head,1)
selGene <- rep(F,nrow(CPM))
for(i in unique(cName)){
selGene <- selGene | apply(CPM[,cName==i],1,function(x){return(sum(x>cutoffCPM)/length(x))})>cutoffRatio
}
fLogCPM <- log2(1+CPM[selGene,])
pheno <- data.frame(row.names=colnames(fLogCPM),
cType=cName,
cData=sapply(strsplit(colnames(fLogCPM),"\\|"),"[[",2))
cat("pure logCPM:",range(fLogCPM),"\n")
return(list(logCPM=fLogCPM,pheno=pheno))
}
#names of cellMap is the ones in the data, the value is the one used for doconvolution
pseudoMix <- function(accIDs,sN,cellMap,seqDep){
if(is.null(names(accIDs))) names(accIDs) <- gsub("\\.rds","",basename(accIDs))
mixR <- bulk <- list()
res <- BiocParallel::bplapply(setNames(names(accIDs),names(accIDs)),oneMix,sN,cellMap,accIDs,seqDep)
res[base::sapply(res,is.null)] <- NULL
return(res)
}
onePure <-function(dID,sampleN,cellMap,accPaths,seqD=2e6){
strData <- accPaths[dID]#paste("Data/",dID,".rds",sep="")
#dID <- gsub("\\.rds","",basename(dID))
if(!file.exists(strData)){
cat("Missing",strData,"\n")
return()
}
cat("Starting",strData,"\n")
X <- readRDS(strData)
bulk <- c()
cType <- base::sapply(strsplit(colnames(X),"\\|"),head,1)
if(is.null(cellMap)) cellMap <- setNames(unique(cType),unique(cType))
ix <- cType %in% names(cellMap)
cType[ix] <- cellMap[cType[ix]]
for(j in intersect(unique(cellMap),unique(cType))){
cat("\tworking on",j,sum(cType==j),":")
if(ncol(X)<30){# for bulk
one <- as.matrix(X[,cType==j])
colnames(one) <- base::sapply(strsplit(colnames(one),"\\|"),function(x){return(paste(c(j,x[-1],x[1]),collapse="|"))})
}else{
iPOS <- grep(j,cType)
ix <- rep(0,ceiling(length(iPOS)*0.5))
one <- c()
for(k in 1:sampleN){
ix <- sample(iPOS,length(ix),replace=T)
while(sum(X[,ix])>10*seqD && length(ix)>0.5*length(iPOS))
ix <- sample(iPOS,length(ix)-ceiling(length(iPOS)/20),replace=T)
while(sum(X[,ix])<seqD && length(ix)<2*length(iPOS))
ix <- sample(iPOS,length(ix)+ceiling(length(iPOS)/10),replace=T)
cat(length(ix),"; ",sep="")
one <- cbind(one,apply(X[,ix],1,sum))
}
colnames(one) <- paste(j,dID,1:sampleN,sep="|")
}
bulk <- cbind(bulk,one)
cat("\n")
}
rm(X)
## remove MT genes and Ribosomal genes and miRNA----
bulk <- bulk[!grepl("^MT|^RP|^MIR",rownames(bulk)),]
cat("\tTotal range:",range(bulk),"\n")
## ------
return(bulk)
}
extCellIndex <- function(nCell,cType){
ix <- c()
cTypeN <- table(cType)
iR <- setNames(sample(100,length(cTypeN)),names(cTypeN))
while(sum(iR/sum(iR)<0.05)>0) iR <- setNames(sample(100,length(cTypeN)),names(cTypeN))
iR <- round(nCell * iR/sum(iR))
for(i in names(iR)){
ix <- c(ix,sample(grep(paste("^",i,"$",sep=""),cType),iR[i],replace=T))
}
## select 10% of nCell for each type ----
# for(i in unique(cType)) ix <- c(ix,sample(grep(i,cType),ceiling(0.1*nCell),replace=T))
## select the result of nCell from the total ----
# ix <- c(ix,sample(length(cType),nCell-length(ix),replace=T))
## ----
return(ix)
}
oneMix <- function(dID,sampleN,cellMap,accPaths,seqD=2e6){
strData <- accPaths[dID]#paste("Data/",dID,".rds",sep="")
#dID <- gsub("\\.rds","",basename(dID))
if(!file.exists(strData)){
cat("Missing",strData,"\n")
return()
}
selType <- names(cellMap)
cat("Starting extracting mixture from",dID,"\n")
X <- readRDS(strData)
selIndex <- base::sapply(strsplit(colnames(X),"\\|"),head,1)%in%selType
if(sum(selIndex)<10) return(NULL)
X <- X[,selIndex]
cType <- cellMap[sapply(strsplit(colnames(X),"\\|"),head,1)]
print(table(cType))
## selecting ----
selCell <- bulk <- c()
nCell <- as.integer(ncol(X)*0.5)
for(i in 1:sampleN){
ix <- extCellIndex(nCell,cType)
while(sum(X[,ix])<seqD){
nCell <- nCell + as.integer(ncol(X)*0.1)
ix <- extCellIndex(nCell,cType)
}
bulk <- cbind(bulk,apply(X[,ix],1,sum,na.rm=T))
res <- table(cType[ix])
selCell <- merge(selCell,
setNames(data.frame(res,row.names=names(res))[,-1,drop=F],LETTERS[i]),
all=T,by="row.names")
rownames(selCell) <- selCell[,1]
selCell <- selCell[,-1,drop=F]
}
rm(list=c("X"))
selCell[is.na(selCell)] <- 0
colnames(selCell) <- colnames(bulk) <- paste(dID,1:sampleN,sep="|")
## remove MT genes and Ribosomal genes and miRNA----
bulk <- bulk[!grepl("^MT|^RP|^MIR",rownames(bulk)),]
## ------
print(selCell)
return(list(mBulk=bulk,mixR=selCell))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.