R/common.featureSel.R

Defines functions featureSel

###########
## featureSel.R
##
#############
featureSel <- function(X,grp,method="voom",batch=NULL,para,strPrefix=NULL,candiG=NULL){
  cat("Finding features from",nrow(X),"for",length(unique(grp)),"groups by",method,"\n")
  print(table(grp))
  if(!is.null(batch))print(table(batch))
  features <- list()
  for(i in sort(unique(grp))) features[[i]] <- rownames(X)
  strDEG <- NULL
  if(!is.null(strPrefix)) strDEG <- paste(strPrefix,"DESeq.gene.txt",sep=".")
  if(method=="DESeq2"){# for deseq2
    features <- list()
    pheno <- data.frame(row.names=colnames(X),cType=grp)
    if(!is.null(batch)) pheno$batch <- batch
    Data <- DESeq2::DESeqDataSetFromMatrix(countData=matrix(as.integer(as.matrix(X)),nrow=nrow(X),dimnames=dimnames(X)),
                                   colData=pheno,
                                   design=formula(paste("~",paste(colnames(pheno),collapse = "+"))))
    #selG <- apply(X,1,function(x)return(sum(x>16))>2)
    dds <- DESeq2::DESeq(Data,betaPrior=TRUE,parallel=T)#,quiet=T
    oneGrp <- setNames(unique(as.character(grp)),unique(as.character(grp)))
    features<- BiocParallel::bplapply(oneGrp,function(i){#
      cat("working on",i,"\n")
      one <- data.frame(row.names=rownames(X))#matrix(0,nrow=nrow(X),ncol=0,dimnames=list(rownames(X),c()))
      for(j in unique(as.character(grp))){
        if(i==j) next
        res <- as.data.frame(DESeq2::results(dds,c("cType",i,j)))
        oneF <- rownames(res)[!is.na(res$padj) & res$padj<para$DEGqvalcut & res$log2FoldChange>para$DEGlogFCcut & res$baseMean>para$DEGbasemeancut]#&selG
        g <- intersect(rownames(one),oneF)
        one <- setNames(cbind(one[g,,drop=F],res[g,"log2FoldChange"]),c(colnames(one),j))
        cat("\tAfter intersect with",j,"DEG:",length(g),"\n")
        #if(length(one)<100) cat("WARNING:",i,"and",j,"are too similar (<100 DEG)\n")
      }
      #print(head(one))
      one <- one[order(apply(one,1,quantile,0.75),decreasing = T),]
      if(nrow(one)<5) cat(paste("Cannot find features for",i))
      return(rownames(one))
    })
    if(F){## one vs rest 
      for(i in unique(grp)){##"Neuron"[sample(9,9)]
        cat("\n\tDESeq2: working on",i,"\n")
        grp1 <- as.character(grp)
        grp1[grp1!=i] <- "others"
        desM1 <- data.frame(row.names=colnames(X),grp=grp1)
        if(!is.null(batch)) desM1 <- cbind(desM1,batch=gsub("\\-","_",batch))
        print(table(grp1))
        Data1 <- DESeqDataSetFromMatrix(countData=matrix(as.integer(as.matrix(X)),nrow=nrow(X),dimnames=dimnames(X)),#counts(dds),#
                                        colData=desM1,
                                        design=formula(paste("~",paste(colnames(desM1),collapse = "+"))))
        dds1 <- DESeq(Data1,betaPrior=TRUE,parallel=T)#,quiet=T
        res <- results(dds1,c("grp",i,"others"))
        rm(list=c("Data1","dds1"))
        if(sum(!is.na(res$padj)&res$padj<0.01&res$log2FoldChange>1)<3){
          cat("\tFew signature genes for",i,"\n")
          next
        }
        res <- res[!is.na(res$padj)&res$padj<0.01,]
        one <- rownames(res)[order(res$log2FoldChange,decreasing=T)[1:sum(res$log2FoldChange>1)]]#min(featureN,)
        cat("\tSignificante UP:",length(one),"\n")
        #cat(length(one),"features selected\n")
        ## remove DEGs among data sets of the same cell type
        cat("\t\tRemove DEG between data sets of the same cell type\n")
        selBatch <- intersect(unique(as.character(batch[grp1!="others"])),c("GSE76381","GSE103723","phs001836","GSE67835","GSE104276",#Brain related cell types
                                                                            "GSE120795","GSE86469","GSE83139"))#pancreatic
        if(length(selBatch)>1){
          for(j in 1:(length(selBatch)-1)){
            for(k in (j+1):length(selBatch)){
              selA <- grp1!="others" & batch==selBatch[j]
              selB <- grp1!="others" & batch==selBatch[k]
              cat("\t\t",selBatch[j],sum(selA),selBatch[k],sum(selB),"\n")
              D1 <- cbind(X[,selA],X[,selB])
              D1 <- D1[apply(D1,1,function(x){return(sum(x>4))})>0.8*min(sum(selA),sum(selB)),]
              compInfo <- data.frame(row.names = colnames(D1),grp=c(rep("A",sum(selA)),rep("B",sum(selB))))
              Data1 <- DESeqDataSetFromMatrix(countData=matrix(as.integer(as.matrix(D1)),nrow=nrow(D1),dimnames=dimnames(D1)),#counts(dds),#
                                              colData=compInfo,
                                              design=formula("~grp"))
              dds1 <- DESeq(Data1,betaPrior=TRUE)#,quiet=T
              res <- results(dds1)
              rm(list=c("D1","Data1","dds1"))
              res <- res[!is.na(res$padj)&res$padj<0.001,]
              one <- one[!one%in%rownames(res)[order(abs(res$log2FoldChange),decreasing = T)][1:min(para$selFeatureN,nrow(res))]]
              cat("\t\tSignificant UP:",length(one),"after compared",selBatch[j],selBatch[k],"\n")
            }
          }
        }
        features[[i]] <- one
      }
    }
    for(i in 1:length(features)){
      uniqueSel <- rep(T,length(features[[i]]))#!features[[i]]%in%unlist(features[-i])
      if(!is.null(candiG)) uniqueSel <- features[[i]]%in%candiG
      features[[i]] <- features[[i]][uniqueSel][1:min(para$selFeatureN,sum(uniqueSel))]
    }
  }else if(method=="Voom"){
    d0 <- edgeR::DGEList(matrix(as.integer(as.matrix(X)),nrow=nrow(X),dimnames=dimnames(X)))
    d0 <- edgeR::calcNormFactors(d0)
    d <- d0[apply(cpm(d0),1,function(x){return(sum(x>para$DEGbasemeancut))})>ncol(X)/2,]
    grp <- as.factor(grp)
    mm <- stats::model.matrix(~0 + grp)
    y <- limma::voom(d,mm,plot=T)
    fit <- limma::lmFit(y,mm)
    features <- list()
    for(i in levels(grp)){
      ## pair-wised comparisons ----
      one <- c()
      for(j in levels(grp)){
        if(i==j) next
        contr <- limma::makeContrasts(contrasts=paste("grp",i,"-","grp",j,sep=""), levels = colnames(coef(fit)))
        tmp <- limma::eBayes(limma::contrasts.fit(fit, contr))
        res <- limma::topTable(tmp, sort.by = "P", n = Inf,p.value=para$DEGqvalcut)
        if(sum(abs(res$logFC)>1)<100) cat("WARNING:",i,"and",j,"are too similar (<100 DEG)\n")
        one <- c(one,rownames(res)[order(res$logFC,decreasing = T)][1:min(20,sum(res$logFC>para$DEGlogFCcut))])
      }
      if(length(one)<5){
        cat(paste("Cannot find features for",i))
      }
      ## one vs all ----------
      grp1 <- as.character(grp)
      grp1[grp!=i] <- "others"
      grp1 <- as.factor(grp1)
      mm1 <- stats::model.matrix(~0 + grp1)
      y1 <- limma::voom(d,mm1,plot=T)
      fit1 <- limma::lmFit(y1,mm1)
      contr1 <- limma::makeContrasts(contrasts=paste("grp1",i,"-","grp1others",sep=""), levels = colnames(coef(fit1)))
      tmp <- limma::eBayes(limma::contrasts.fit(fit1, contr1))
      res <- limma::topTable(tmp, sort.by = "P", n = Inf,p.value=para$DEGqvalcut)
      one <- rownames(res)[order(res$logFC,decreasing = T)][1:min(para$selFeatureN,sum(res$logFC>para$DEGlogFCcut))]
      
      features[[i]] <- one#unique(c(features,one))
    }
  }else if(method=="Top"){
    features <- list()
    for(i in levels(grp)){
      medianG <- apply(X[,grp==i],1,median)
      features[[i]] <- rownames(X)[order(medianG,decreasing=T)[1:min(para$selFeatureN,sum(medianG>para$DEGbasemeancut))]]
    }
  }else if(method=="edgeR"){
    features <- list()
    dg <- edgeR::DGEList(2^X-1,group=grp)
    des <- stats::model.matrix(~factor(grp)-1,dg$samples)
    dg <- edgeR::estimateGLMTrendedDisp(dg,design=des)
    dg <- edgeR::estimateGLMTagwiseDisp(dg,design=des)
    fit <- edgeR::glmFit(dg,des)
    gType <- base::sapply(strsplit(colnames(des),"\\)"),tail,1)
    for(i in gType){
      cat("EdgeR for",i,"\n")
      logFC <- data.frame(row.names=rownames(X))
      for(j in gType){
        if(i==j) next
        one <- setNames(rep(0,length(gType)),gType)
        one[c(i,j)] <- c(1,-1)
        one <- edgeR::topTags(edgeR::glmLRT(fit,contrast=one),n=10000)$table
        one <- setNames(one[one$FDR<para$DEGqvalcut&one$logFC>para$DEGlogFCcut,"logFC",drop=F],j)
        logFC <- merge(one,logFC,by="row.names",sort=F)
        rownames(logFC) <- logFC[,1]
        logFC <- logFC[,-1,drop=F]
        cat("\tvs",j,nrow(logFC),"\n")
      }
      features[[i]] <- rownames(logFC)[order(apply(logFC,1,median),decreasing=T)][1:min(nrow(logFC),para$selFeatureN)]
    }
  }else{
    cat("\n\nUNKNOW METHODS: no feature selection\n\n")
  }
  return(features)
}
interactivereport/CellMap documentation built on March 17, 2024, 2:01 a.m.