R/common.pseudoBulk.R

Defines functions oneMix extCellIndex onePure pseudoMix pseudoLogCPM pseudoPure

############################
## pseudoBulk.R
## total column number smaller than 20 will be considered bulk
############################

pseudoPure <- function(accIDs,sN,cellMap,seqD,gCover=0.6){
  
  if(is.null(names(accIDs))) names(accIDs) <- gsub("\\.rds","",basename(accIDs))
  res <- BiocParallel::bplapply(setNames(names(accIDs),names(accIDs)),onePure,sN,cellMap,accIDs,seqD)

  ## filtering genes not common accross datasets for each cell types
  resType <- base::lapply(names(res),function(x)return(unique(base::sapply(strsplit(colnames(res[[x]]),"\\|"),head,1))))
  resGenes <- base::lapply(names(res),function(x)return(rownames(res[[x]])))
  gNames <- base::lapply(unique(unlist(resType)),function(one){
    ix <- base::sapply(resType,function(x)return(one%in%x))
    gC <- table(unlist(resGenes[ix]))/sum(ix)
    return(names(gC)[gC>gCover])
  })
  gNames <- unique(unlist(gNames))
  message("gName in pseudoPure:",length(gNames))

  bulk <- c()
  genes <- list()
  for(i in names(accIDs)){
    message(i," in pseudoPure\n\treads ratio after removing the genes not detected in ")
    genes[[i]] <- rownames(res[[i]])
    totalReads <- apply(res[[i]],2,sum)
    res[[i]] <- res[[i]][rownames(res[[i]])%in%gNames,]
    cat("\t",apply(res[[i]],2,sum)/totalReads,"\n")
    if(length(bulk)==0){
      bulk <- res[[i]]
    }else{
      bulk <- merge(bulk,res[[i]],by="row.names",all=T,sort=F)
      rownames(bulk) <- bulk[,1]
      bulk <- bulk[,-1]
    }
  }
  bulk[is.na(bulk)] <- 0
  return(list(express=bulk,geneL=genes))
}

pseudoLogCPM <- function(X,normDep=1e6,grp=NULL,cutoffCPM=8,cutoffRatio=0.6){
  ## normalized to gene length 
  #X <- apply(X,2,function(x,gLen){return(x/gLen)},as.numeric(sapply(strsplit(rownames(X),"\\|"),tail,1)))
  ####
  CPM <- apply(X,2,function(x){return(normDep/sum(x)*x)})
  cName <- base::sapply(strsplit(colnames(CPM),"\\|"),head,1)
  selGene <- rep(F,nrow(CPM))
  for(i in unique(cName)){
    selGene <- selGene | apply(CPM[,cName==i],1,function(x){return(sum(x>cutoffCPM)/length(x))})>cutoffRatio
  }
  fLogCPM <- log2(1+CPM[selGene,])
  pheno <- data.frame(row.names=colnames(fLogCPM),
                      cType=cName,
                      cData=sapply(strsplit(colnames(fLogCPM),"\\|"),"[[",2))
  cat("pure logCPM:",range(fLogCPM),"\n")
  return(list(logCPM=fLogCPM,pheno=pheno))
}

#names of cellMap is the ones in the data, the value is the one used for doconvolution
pseudoMix <- function(accIDs,sN,cellMap,seqDep){ 
  if(is.null(names(accIDs))) names(accIDs) <- gsub("\\.rds","",basename(accIDs))
  mixR <- bulk <- list()
  res <- BiocParallel::bplapply(setNames(names(accIDs),names(accIDs)),oneMix,sN,cellMap,accIDs,seqDep)
  res[base::sapply(res,is.null)] <- NULL
  return(res)
}

onePure <-function(dID,sampleN,cellMap,accPaths,seqD=2e6){
  strData <- accPaths[dID]#paste("Data/",dID,".rds",sep="")
  #dID <- gsub("\\.rds","",basename(dID))
  if(!file.exists(strData)){
    cat("Missing",strData,"\n")
    return()
  }
  cat("Starting",strData,"\n")
  X <- readRDS(strData)

  bulk <- c()
  cType <- base::sapply(strsplit(colnames(X),"\\|"),head,1)
  if(is.null(cellMap)) cellMap <- setNames(unique(cType),unique(cType))
  ix <- cType %in% names(cellMap)
  cType[ix] <- cellMap[cType[ix]]
  for(j in intersect(unique(cellMap),unique(cType))){
    cat("\tworking on",j,sum(cType==j),":")
    if(ncol(X)<30){# for bulk
      one <- as.matrix(X[,cType==j])
      colnames(one) <- base::sapply(strsplit(colnames(one),"\\|"),function(x){return(paste(c(j,x[-1],x[1]),collapse="|"))})
    }else{
      iPOS <- grep(j,cType)
      ix <- rep(0,ceiling(length(iPOS)*0.5))
      one <- c()
      for(k in 1:sampleN){
        ix <- sample(iPOS,length(ix),replace=T)
        while(sum(X[,ix])>10*seqD && length(ix)>0.5*length(iPOS))
          ix <- sample(iPOS,length(ix)-ceiling(length(iPOS)/20),replace=T)
        while(sum(X[,ix])<seqD && length(ix)<2*length(iPOS))
          ix <- sample(iPOS,length(ix)+ceiling(length(iPOS)/10),replace=T)
        cat(length(ix),"; ",sep="")
        one <- cbind(one,apply(X[,ix],1,sum))
      }
      colnames(one) <- paste(j,dID,1:sampleN,sep="|")
    }
    bulk <- cbind(bulk,one)
    cat("\n")
  }
  rm(X)
  ## remove MT genes and Ribosomal genes and miRNA----
  bulk <- bulk[!grepl("^MT|^RP|^MIR",rownames(bulk)),]
  cat("\tTotal range:",range(bulk),"\n")
  ## ------
  return(bulk)
}

extCellIndex <- function(nCell,cType){
  ix <- c()
  cTypeN <- table(cType)
  iR <- setNames(sample(100,length(cTypeN)),names(cTypeN))
  while(sum(iR/sum(iR)<0.05)>0) iR <- setNames(sample(100,length(cTypeN)),names(cTypeN))
  iR <- round(nCell * iR/sum(iR))
  for(i in names(iR)){
    ix <- c(ix,sample(grep(paste("^",i,"$",sep=""),cType),iR[i],replace=T))
  }

  ## select 10% of nCell for each type ----
#  for(i in unique(cType)) ix <- c(ix,sample(grep(i,cType),ceiling(0.1*nCell),replace=T))
  ## select the result of nCell from the total ----
#  ix <- c(ix,sample(length(cType),nCell-length(ix),replace=T))
  ## ----
  return(ix)
}
oneMix <- function(dID,sampleN,cellMap,accPaths,seqD=2e6){
  strData <- accPaths[dID]#paste("Data/",dID,".rds",sep="")
  #dID <- gsub("\\.rds","",basename(dID))
  if(!file.exists(strData)){
    cat("Missing",strData,"\n")
    return()
  }
  selType <- names(cellMap)
  cat("Starting extracting mixture from",dID,"\n")
  X <- readRDS(strData)
  
  selIndex <- base::sapply(strsplit(colnames(X),"\\|"),head,1)%in%selType
  if(sum(selIndex)<10) return(NULL)
  X <- X[,selIndex]
  cType <- cellMap[sapply(strsplit(colnames(X),"\\|"),head,1)]
  print(table(cType))
  ## selecting ----
  selCell <- bulk <- c()
  nCell <- as.integer(ncol(X)*0.5)
  for(i in 1:sampleN){
    ix <- extCellIndex(nCell,cType)
    while(sum(X[,ix])<seqD){
      nCell <- nCell + as.integer(ncol(X)*0.1)
      ix <- extCellIndex(nCell,cType)
    }
    bulk <- cbind(bulk,apply(X[,ix],1,sum,na.rm=T))
    res <- table(cType[ix])
    selCell <- merge(selCell,
                     setNames(data.frame(res,row.names=names(res))[,-1,drop=F],LETTERS[i]),
                     all=T,by="row.names")
    rownames(selCell) <- selCell[,1]
    selCell <- selCell[,-1,drop=F]
  }
  rm(list=c("X"))
  selCell[is.na(selCell)] <- 0
  colnames(selCell) <- colnames(bulk) <- paste(dID,1:sampleN,sep="|")
  ## remove MT genes and Ribosomal genes and miRNA----
  bulk <- bulk[!grepl("^MT|^RP|^MIR",rownames(bulk)),]
  ## ------
  print(selCell)
  return(list(mBulk=bulk,mixR=selCell))
}
interactivereport/CellMap documentation built on March 17, 2024, 2:01 a.m.