R/GOCAT_Engine.R

Defines functions GSEPD_ProjectionProcessor

###
## Summer 2013 - 2014
## rgsepd-0.3.3 KStamm 
## a tool to perform linear vector projection in GO-Term space
###

GSEPD_ProjectionProcessor <- function(GSEPD) {
  
  GSEPD <- GSEPD_CheckCounts(GSEPD) # make sure normCounts is calculated!
  finalCounts <- GSEPD$normCounts
  C2T <- GSEPD$C2T
  sampleMeta <- GSEPD$sampleMeta
  
  Type1=GSEPD$Conditions[1]
  Type2=GSEPD$Conditions[2]
  
  DefinedGroup <- subset(sampleMeta, sampleMeta$Condition %in% c(Type1, Type2))
  ArcheTypes <- as.character(DefinedGroup$Sample)
  G1=ArcheTypes[DefinedGroup$Condition == Type1];
  G2=ArcheTypes[DefinedGroup$Condition == Type2]
  cols=rep(1,ncol(finalCounts))
  pchs=rep(3,ncol(finalCounts))
  
  if(length(GSEPD$COLORS)!=3){
    stop("Vector of GSEPD$COLORS should be three elements (Group1, middle, Group2).")
  }#doesnt this really belong elsewhere. 
  
  cols[colnames(finalCounts) %in% G1] <- GSEPD$COLORS[1]
  cols[colnames(finalCounts) %in% G2] <- GSEPD$COLORS[3]
  pchs[colnames(finalCounts) %in% G1] <- 1
  pchs[colnames(finalCounts) %in% G2] <- 2
  
  #pull up the merged doc and grab the minor GO term
  M.data <- read.csv(GSEPD_MFile(GSEPD), as.is=TRUE, header=TRUE,row.names=1) 

  AssembleGOSTAT <- function(GOTERM){
    x=list();
    x$Term=GOTERM;
    DEG=subset(M.data,M.data$category==GOTERM)
    x$Name=DEG$Term.x[1]
    x$DataProjection <- ExtractProjection(GSEPD, DEG$REFSEQ ,
                                          PRINTING = FALSE, DRAWING = FALSE)
    x
  }
  
  tcats <- table(M.data$category)
  cats <- unique( subset( M.data, M.data$over_represented_padj.x < GSEPD$LIMIT$GO_PVAL)$category)
  #omit those with too few or too many
  cats = setdiff(cats, names(tcats)[tcats<GSEPD$MinGenesInSet | tcats>GSEPD$MaxGenesInSet]  ) 

  if(length(cats)>1) {
    outfile=paste(GSEPD$Output_Folder, "/SCA.GSEPD.",C2T[1],".",C2T[2],".pdf",sep="")
    Message_Generate(outfile)
    pdf(outfile)
    for(i in seq(1,length(cats)-1,2)){
      #print(paste(cats[i],cats[i+1]))
      DIM_1=AssembleGOSTAT(cats[i])
      DIM_2=AssembleGOSTAT(cats[i+1])
      xs=(DIM_1$DataProjection$alpha) #GOSet 1 axis score
      ys=(DIM_2$DataProjection$alpha) #GOSet 2 axis score
      plot(x=xs, y=ys, main=paste(ncol(finalCounts), "Samples by 2 GO:Terms"),
           xlab=paste(Type2,DIM_1$Name,Type1, sep=" - "), ylab=paste(Type2,DIM_2$Name,Type1, sep=" - "),
           pch=pchs, col=cols )
      legend("topleft",legend=c(GSEPD$Conditions[1],GSEPD$Conditions[2],"Other"),pch=c(1,2,3),col=c(GSEPD$COLORS[1],GSEPD$COLORS[3],1))
    }
    dev.off()
  }else{
    warning("Skipping generation of SCA.GSEPD.pdf because there aren't enough enriched GO Categories.")
  }
  
  if(length(cats)>0) {
    message("Calculating Projections and Segregation Significance")
    OM=matrix(nrow=length(cats), ncol=ncol(finalCounts))
    WhiteOut <- OM;  G1OM <- OM;  G2OM <- OM;
    Segregation_PV=rep(1,nrow(OM)) # see GSEPD$LIMIT$Seg_P later.
    Segregation_Val=rep(1,nrow(OM)) # see GSEPD$LIMIT$Seg_P later.
    
    colnames(WhiteOut)<-colnames(finalCounts)
    rownames(OM)<-cats
    rownames(WhiteOut)<-cats
    rownames(G1OM)<-cats
    rownames(G2OM)<-cats
    names(Segregation_PV)<- cats
    names(Segregation_Val)<- cats
    
    for(i in 1:nrow(OM)){
      ExDP <- AssembleGOSTAT(cats[i])$DataProjection
      OM[i,] <-ExDP$alpha
      WhiteOut[i,]<-ExDP$beta
      G1OM[i,] <-ExDP$gamma1
      G2OM[i,] <-ExDP$gamma2
      #Segregation_P[i] <- Resampled_Significance.t(GSEPD,data.frame(alpha=ExDP$alpha, type=GSEPD$sampleMeta$Condition ), GSEPD$Conditions)
      O <- Resampled_Significance.k(GSEPD, 
                      ROI=M.data$REFSEQ[M.data$category==cats[i]],
                      SOI=ArcheTypes)
      Segregation_PV[i] <- O$PV
      Segregation_Val[i] <- O$Validity
    }
    colnames(OM)<-colnames(finalCounts)
    colnames(WhiteOut)<-colnames(finalCounts)
    colnames(G1OM)<-colnames(finalCounts)
    colnames(G2OM)<-colnames(finalCounts)
    SIGFIG=3
    write.csv(signif(OM,SIGFIG), paste(GSEPD$Output_Folder, "/GSEPD.Alpha.", C2T[1],".",C2T[2],".csv",sep=""))
    write.csv(signif(WhiteOut,SIGFIG), paste(GSEPD$Output_Folder, "/GSEPD.Beta.", C2T[1],".",C2T[2],".csv",sep=""))
    write.csv(cbind(Segregation_Val, Segregation_PV), GSEPD_Seg_File(GSEPD))
    write.csv(signif(G1OM,SIGFIG), GSEPD_HMG1CSV_File(GSEPD))
    write.csv(signif(G2OM,SIGFIG), GSEPD_HMG2CSV_File(GSEPD))
    
    GOPs <- rep(0,length(cats))
    GONames <- rep("",length(cats))
    for(i in 1:length(cats)) { 
      roi <- (M.data$category == cats[i]) #find the entry in Mdata
      GONames[i] <- sprintf("%s %s",M.data$Term.x[roi][1], M.data$category[roi][1]); 
      GOPs[i] <- M.data$over_represented_padj.x[roi][1];
      #if this GOSet fails to separate the classes, we can drop it from further processing
      if( Segregation_PV[i] < GSEPD$LIMIT$Seg_P){
        #if it segregates let's make a pairs plot to go with
        deg_ngenes <- sum(roi)
        if(deg_ngenes >= 2 && deg_ngenes <= 10){# only on those sets with a decent number of genes
          GSEPD_Pairs_Plot(GSEPD, M.data$REFSEQ[roi], GONames[i], GOPs[i], Segregation_PV[i])
        }
        pdf(paste(GSEPD$Output_SCGO,"/Scatter.",C2T[1],".",C2T[2],".GO",substring(cats[i],4),".pdf",sep=""))
          GSEPD_PCA_Spec(GSEPD=GSEPD,GOT=cats[i],MDATA=M.data)
          title(sub=sprintf("%d Genes / %d Samples / GOSeq p=%f / segregation p=%f",deg_ngenes, ncol(finalCounts), GOPs[i], Segregation_PV[i]))
        dev.off()
      }
    }
    
    #we'll use the combination to choose great sets
    GO_SumPs <- GOPs + Segregation_PV
    
    Message_Generate(GSEPD_HMA_File(GSEPD))
   
    if(length(G1)>1 && nrow(OM)>1)
      OM.A1 <- apply(OM[,G1], 1, mean)-min(OM)+1
    else
      OM.A1 <-OM[,G1] - min(OM)+1
    
    if(length(G2)>1 && nrow(OM)>1)
      OM.A2 <- apply(OM[,G2], 1, mean)-min(OM)+1
    else
      OM.A2 <-OM[,G2] - min(OM)+1
    
    OM.min <- apply(cbind(OM.A1,OM.A2),1,min)#*0.90
    OM.max <- apply(cbind(OM.A1,OM.A2),1,max)#*1.10
    #Z-scored OutMatrix
    zOM <- OM-min(OM)+1;
    for(i in 1:nrow(zOM)){
      zOM[i, zOM[i,]< OM.min[i] ] <- OM.min[i]
      zOM[i, zOM[i,]> OM.max[i] ] <- OM.max[i]
      zOM[i,] <- (zOM[i,] - OM.min[i]) / (OM.max[i] - OM.min[i])
    }#goes pretty much 0-1;

    #generate the asterisks you see in the heatmap
    #first lets get some Z-scores on the projection distance
    zWhiteOut <- t(scale(t(WhiteOut)))
    cellnote=matrix(" ", nrow=nrow(zOM),ncol=ncol(zOM))
    for(j in 1:nrow(zOM)){
     cellnote[j,  zWhiteOut[j,] > GSEPD$VECTOR_DISTANCE_ZTHRESH_Moderate] <- "." 
     cellnote[j,  zWhiteOut[j,] > GSEPD$VECTOR_DISTANCE_ZTHRESH_Severe ] <- rawToChar(as.raw(149)) #a fat circle bullet character
    }
    
    ColumnLabels <- colnames(zOM);
    for(j in 1:length(ColumnLabels))
      ColumnLabels[j] <- paste(sampleMeta$SHORTNAME[sampleMeta$Sample==ColumnLabels[j]],
                               sampleMeta$Condition[sampleMeta$Sample==ColumnLabels[j]])
    
    ColLabelColors <- rep("black",ncol(OM));
    names(ColLabelColors) <- colnames(OM)
    ColLabelColors[G1] <- GSEPD$COLORS[1];
    ColLabelColors[G2] <- GSEPD$COLORS[3];
    
    #which rows are worth looking at??
    sr=1:length(GO_SumPs) 
    sr=sort(GO_SumPs, decreasing=FALSE, index.return=TRUE)$ix[1:min(GSEPD$MAX_GOs_for_Heatmap,length(cats))]
    #and remove those non-significant
    sr <- sr[ GO_SumPs[sr] <= (GSEPD$LIMIT$GO_PVAL + GSEPD$LIMIT$Seg_P)] ;
    
    if(length(sr) > 1) {
      RowLabelColors <- rep("black",length(sr));
      names(RowLabelColors)<-cats[sr]
      sMdata<-subset(M.data, !duplicated(M.data$category), select=c("category","GOSEQ_DEG_Type"))
      for(j in 1:length(sr)){
        cj<-cats[sr[j]];DEGType<-sMdata$GOSEQ_DEG_Type[sMdata$category == cj];
        RowLabelColors[j] <- ifelse(DEGType == "Up",GSEPD$COLORS[3],
                                    ifelse(DEGType == "Down",GSEPD$COLORS[1],
                                           GSEPD$COLORS[2]))
      } ; rm(sMdata)
      
      pdf(GSEPD_HMA_File(GSEPD), height=6+length(sr)/7, width=6+ncol(zOM)*0.33)
      heatmap.2(zOM[sr,], labRow=GONames[sr],
            scale="none", trace="none", margins=c(10,35),
            cexRow=1.25, labCol=ColumnLabels, cellnote=cellnote[sr,],
            notecex=3, notecol="white",
            ColSideColors=ColLabelColors, RowSideColors=RowLabelColors,
            col=GSEPD$COLORFUNCTION);
      dev.off()
      
      #then plot HMG#####
      #it's overlaying the zOM
      #but for the HMG file now I need to make a new 0-1 score from the G1OM/G2OM matrices.
      zOM[,] <- 0.5 ; #default all to indeterminate/black
      zOM[G1OM < 0.45] <- 0.35 #dark green
      zOM[G1OM < (0.45*(2/3)) ] <- 0 # bright green
      zOM[G2OM < 0.45] <- 0.65 # dark red
      zOM[G2OM < (0.45*(2/3)) ] <- 1 # bright red
                   
      pdf(GSEPD_HMG_File(GSEPD), height=6+length(sr)/7, width=6+ncol(zOM)*0.33)
      heatmap.2(zOM[sr,], labRow=GONames[sr],
                scale="none", trace="none", margins=c(10,35),
                cexRow=1.25,labCol=ColumnLabels,
                ColSideColors=ColLabelColors, RowSideColors=RowLabelColors,
                col=GSEPD$COLORFUNCTION);
      dev.off()
      
    }else{
      warning("Not generating HMA/HMG files:  <2 significant rows. See your GOSEQ and Segregation tables for details.")
    }
    
    #for the rows seen in the heatmap...
    for(j in sr){ #make lots of scatter plots, one for each GO term with some enrichment
      thisDEG=subset(M.data, M.data$category==cats[j])
      if( nrow(thisDEG) >= GSEPD$MinGenesInSet ) { 
#thought this would be taken care of above, but the GO term whitelist could create entries with invalid gene counts.
        pdf(paste(GSEPD$Output_SCGO,"/GSEPD.", C2T[1],".",C2T[2],".GO",substring(cats[j],4),".pdf",sep=""))
        nGenes=length(unique(thisDEG$REFSEQ))
        if(nGenes>1){
          for(i in seq(1,nGenes-1,2))
              ExtractProjection(GSEPD, txids = unique(thisDEG$REFSEQ),
                                DRAWING=TRUE, GN=c(i,i+1),PRINTING=FALSE,
                                plotTitle=thisDEG$Term.x[1])
        }
        dev.off();
      }
    }

  }else{ #length(cats)<1
    warning("GSEPD : Can't make output heatmap images when zero GO-Categories enriched.")
  }

}  # end GOCATPROCESSOR for Type1, Type2

Try the rgsepd package in your browser

Any scripts or data that you put into this service are public.

rgsepd documentation built on Nov. 8, 2020, 4:58 p.m.