signeR: Empirical Bayesian approach to mutational signature discovery

################################################################################
# Classification:
################################################################################
setGeneric("ExposureClassify",
           def=function(signexp_obj=NA, labels, addata=NA, method="knn",
                        max_instances=200, k=3, weights=NA,
                        plot_to_file=FALSE, file="Classification_barplot.pdf",
                        colors=TRUE, min_agree=0.75,...){
               standardGeneric("ExposureClassify")
           }
)
setMethod("ExposureClassify",signature(signexp_obj="ANY",labels="character",
                                       addata="ANY", method="ANY", 
                                       max_instances="ANY", k="ANY", 
                                       weights="ANY", plot_to_file="ANY",
                                       file="ANY", colors="ANY", 
                                       min_agree="ANY"),
          function(signexp_obj, labels, addata, method, max_instances, k, 
                   weights, plot_to_file, file, colors, min_agree,...){
              SEinput<-FALSE
              Addinput<-FALSE
              if(class(signexp_obj)=="SignExp"){
                  if(!signexp_obj@normalized) signexp_obj<-Normalize(signexp_obj)
                  dp <- dim(signexp_obj@Sign) #[i,n,r]
                  de <- dim(signexp_obj@Exp) #[n,j,r]
                  i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
                  if(r > max_instances){
                    select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
                    signexp_obj@Exp <-signexp_obj@Exp[,,select_instances]
                    signexp_obj@Sign<-signexp_obj@Sign[,,select_instances]
                    r<-max_instances
                  }
                  SEinput<-TRUE
                  Exposures<-signexp_obj@Exp
              }else if(class(signexp_obj) %in% c("matrix","data.frame")){
                      n<-nrow(signexp_obj)
                      j<-ncol(signexp_obj)
                      r<-1
                      Exposures<-array(as.vector(signexp_obj),dim=c('n'=n,'j'=j,'r'=1))
              }else  if(class(signexp_obj)=="logical" & is.na(signexp_obj)){
                      r<-1
              }else{
                      stop("Provided signexp_obj should be of class SignExp or matrix.\n")
              }
              if(class(addata) %in% c("matrix","data.frame")){
                  Addinput<-TRUE
              }
              knearest<-k
              if(!length(labels)==j){
                  stop(paste("Labels should be provided for each sample.",
                             "Samples that will be classified should be",
                             "labeled with 'NA'.",sep=" "))
              }
              totest<-is.na(labels)
              totrain<-!totest
              ntest<-sum(totest)
              ntrain<-sum(totrain)
              if(ntest==0){
                  stop("All samples are already classified. There is nothing to be done.\n")
              }else{
                  if(identical(method,"knn") & ntrain<k){
                      stop(paste("The number of labeled samples is less then the number ",
                                 "of nearest neighbors used for classification, k=",k,
                                 ". Please run classification with k<=",ntrain,".",sep=""))
                  }
                  testsamples<-signexp_obj@samples[totest]
                  cl<-as.factor(labels[totrain])
                  classes<-as.vector(levels(as.factor(cl)))
                  nclass<-length(classes)
                  is.binary <- nclass<=2
                  Allprobs<-array(data = as.vector(future_apply(Exposures,3,function(D){
                      n0<-n
                      if(Addinput){ 
                          D<-rbind(D,t(addata))
                          n<-nrow(D)
                      }
                      if(all(is.na(weights))){
                          weights<-rep(1,n)
                      }
                      if (length(weights)<n){
                          weights<-rep(weights,n)[1:n]
                      }
                      if (any(weights == "standardize")){
                          cond <- which(weights == "standardize")
                          weights[cond]<-1/apply(D[cond,],1,sd)
                          weights<-as.numeric(weights)
                      }
                      D<-D*weights
                      rownames(D)<-c(signexp_obj@signames,colnames(addata))
                      D<-D/median(D[D>0]) #avoid too small values
                      Train<-t(D[,!is.na(labels),drop=FALSE])
                      Test<-t(D[,is.na(labels),drop=FALSE])
                      method<-tolower(method)
                      switch(method,
                             knn = {
                                 Classific<-kknn(cl ~ ., train=data.frame(Train,cl),
                                                 test=data.frame(Test),k=knearest,kernel="rectangular")
                                 signexp_objclass<-as.vector(Classific$fitted.values)
                                 Class_probs<-Classific$prob
                             }, #K-NearestNeighbors - pkg kknn
                             lvq = {
                                 Codebook<-lvqinit(Train,cl)
                                 LVQmod<-olvq1(Train,cl,Codebook)
                                 Classific<-kknn(cl ~ ., train=data.frame(LVQmod$x,cl=LVQmod$cl),
                                                 test=data.frame(Test),k=1,kernel="rectangular")
                                 #Classific<-lvqtest(LVQmod, Test)
                                 signexp_objclass<-as.vector(Classific$fitted.values)
                                 Class_probs<-Classific$prob
                             }, #LearningVectorQuantization - pkg class
                             logreg = {
                                 if(is.binary){
                                     lnMod <- glm(cl ~ ., data=data.frame(Train,cl), family=binomial(link="logit"))
                                     Classific<-predict(lnMod, data.frame(Test),type="response")
                                     signexp_objclass<-rep(classes[1],ntest)
                                     signexp_objclass[round(Classific)==1]<-classes[2]
                                     Class_probs<-data.frame(matrix(NA,ntest,2))
                                     colnames(Class_probs)<-classes
                                     rownames(Class_probs)<-testsamples
                                     Class_probs[,2]<-Classific
                                     Class_probs[,1]<-1-Classific
                                 }else{ 
                                     lnMod <- vglm(cl ~ ., data=data.frame(Train,cl), family=multinomial(refLevel = 1))
                                     Classific<-predict(lnMod, data.frame(Test),type="response")
                                     signexp_objclass<-apply(Classific,1,function(v){colnames(Classific)[which.max(v)]})
                                     Class_probs<-Classific
                                 }
                             }, #LogisticRegression - pkg VGAM
                             lda = {
                                 ldaMod <- lda(cl ~ ., data=data.frame(Train,cl))
                                 Classific<-predict(ldaMod, data.frame(Test))
                                 signexp_objclass<-as.vector(Classific$class)
                                 Class_probs<-Classific$posterior
                             }, #LinearDiscriminantAnalysis - pkg MASS
                             lasso = {
                                 if(is.binary){
                                     cvglm<-cv.glmnet(x=as.matrix(Train), y=cl, family="binomial")
                                     gn<-try(glmnet(x=as.matrix(Train), y=cl,
                                                    lambda=cvglm$lambda.min,family="binomial"),silent=TRUE)
                                     if(!class(gn)[1]=="try-error"){
                                         Classific<-predict(gn,as.matrix(Test),type="response")
                                         signexp_objclass<-rep(classes[1],ntest)
                                         signexp_objclass[round(Classific)==1]<-classes[2]
                                         Class_probs<-data.frame(matrix(NA,ntest,2))
                                         colnames(Class_probs)<-classes
                                         rownames(Class_probs)<-testsamples
                                         Class_probs[,2]<-Classific
                                         Class_probs[,1]<-1-Classific
                                     }else{
                                         signexp_objclass<-rep(classes[1],ntest)
                                         Class_probs<-data.frame(matrix(NA,ntest,2))
                                         colnames(Class_probs)<-classes
                                         rownames(Class_probs)<-testsamples
                                         Class_probs[,1]<-1
                                         Class_probs[,2]<-0
                                     }
                                 }else{
                                     cvglm<-cv.glmnet(x=as.matrix(Train), y=cl, family="multinomial")
                                     gn<-try(glmnet(x=as.matrix(Train), y=cl,
                                                    lambda=cvglm$lambda.min,family="multinomial"),silent=TRUE)
                                     if(!class(gn)[1]=="try-error"){
                                         Classific<-predict(gn,as.matrix(Test),type="class")
                                         signexp_objclass<-as.vector(Classific)
                                         Class_probs<-predict(gn,as.matrix(Test),type="response")
                                     }else{
                                         signexp_objclass<-rep(classes[1],ntest)
                                         Class_probs<-data.frame(matrix(NA,ntest,2))
                                         colnames(Class_probs)<-classes
                                         rownames(Class_probs)<-testsamples
                                         Class_probs[,1]<-1
                                         Class_probs[,2]<-0
                                     }
                                 }
                             }, #Lasso - pkg glmnet
                             nb = {
                                 NBmod<-naiveBayes(cl ~ ., data=data.frame(Train,cl))
                                 Classific<-predict(NBmod,data.frame(Test),type="class")
                                 signexp_objclass<-as.vector(Classific)
                                 Class_probs<-predict(NBmod,data.frame(Test),type="raw")
                             }, #NaiveBayes - pkg e1071
                             svm = {
                                 SVMmod<-svm(cl ~ ., data=data.frame(Train,cl),probability=T)
                                 Classific<-predict(SVMmod,data.frame(Test),probability=T)
                                 signexp_objclass<-as.vector(Classific)
                                 Class_probs<-attr(Classific,"probabilities")
                             }, #SupportVectorMachine - pkg e1071
                             rf = {
                                 RF<-randomForest(cl ~ ., data=data.frame(Train,cl))
                                 Classific<-predict(RF,data.frame(Test),type="response")
                                 signexp_objclass<-as.vector(Classific)
                                 Class_probs<-predict(RF,data.frame(Test),type="prob")
                             }, #RandomForest - pkg randomForest
                             ab = {
                                 if(is.binary){
                                     AdaMod <- ada(cl ~ ., data=data.frame(Train,cl), loss="logistic")
                                 }else{ 
                                     stop("Adaboost algorithm is implemented only for binary classificaton.\n")
                                 }
                                 Classific<-predict(AdaMod, data.frame(Test),type="both")
                                 signexp_objclass<-as.vector(Classific$class)
                                 Class_probs<-data.frame(Classific$probs)
                             }, #Adaboost - pkg ada
                             {
                                 Classific<-method(Train,Test,cl,...)
                                 signexp_objclass<-as.vector(Classific)
                                 Class_probs<-data.frame(matrix(0,ntest,nclass))
                                 colnames(Class_probs)<-classes
                                 rownames(Class_probs)<-testsamples
                                 Class_probs[signexp_objclass==classes[1],1]<-1
                                 Class_probs[signexp_objclass==classes[2],2]<-1
                             }#Other
                      )
                      return(as.vector(Class_probs))
                  })), dim=c('sample'=ntest,'class'=nclass,'rep'=r),
                  dimnames = list(testsamples,classes,paste("r",1:r,sep="_")))
                  FoundClasses<-future_apply(Allprobs,c(1,3),function(v){
                      resp<-rep(0,length(v))
                      resp[which.max(v)]<-1
                      return(resp)
                  })
                  Freqs<-future_apply(FoundClasses,c(1,2),sum)
                  rownames(Freqs)<-classes
                  #Summaries
                  result<-rep("",ntest)#Final classification
                  winner_freq<-rep(0,ntest)#Frequencies of classifications equal to final class
                  FinalProbs<-apply(Allprobs,c(1,2),mean)
                  for (k in 1:ntest){ #Frequencies and final classifications
                      counts<-Freqs[,k]
                      result[k] <- classes[which.max(counts)][1]
                      winner_freq[k]   <- max(counts)/sum(counts)
                  }
                  result_final<-result
                  result_final[winner_freq < min_agree] <- "undefined"
                  colnames(Freqs)<-paste(testsamples,result_final,sep="-")
                  names(result_final)<-testsamples
                  names(winner_freq)<-testsamples
                  #Plots:
                  if(is.na(colors[1])){
                      #cols<-rainbow(nclass,start=0.5,end=0.8)
                      cols<-terrain.colors(nclass+1,0.6)[1:nclass]
                  }else{
                      if(length(colors)==nclass){
                          cols<-colors
                      }else{
                          cols<-rep(colors,nclass)[1:nclass]
                      }
                  }
                  if(plot_to_file){
                      if(length(grep("\\.pdf$",file))==0){
                          file<-paste(file,"pdf",sep=".")
                      }
                      pdf(file,width=max(5,2*ntest),height=7)
                  }else{
                      if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                                names(dev.cur()),perl=TRUE)){
                          dev.new(width=max(5,2*ntest), height=7)
                      }
                  }
                  Mfreqs<-Mymelt(Freqs) #occurrence counts
                  colnames(Mfreqs)[3]<-"Frequency"
                  ####### Plotting - ggplot2
                  g<-ggplot(Mfreqs, aes(x=Col,y=Frequency,fill=Row)) + 
                      geom_col(position='stack') +  
                      theme(axis.text.x=element_text(angle=90,vjust=.5)) + 
                      coord_cartesian(expand=0) + 
                      labs(x="",fill="Class")
                  plot(g)
                  if(plot_to_file){
                      dev.off()
                      outmessage<-paste("Classification results were",
                                        "plotted to the file", file,
                                        "on the current directory.",sep=" ")
                      cat(outmessage,"\n")
                  }
                  colnames(Freqs)<-testsamples
                  return(list(class=result_final,freq=winner_freq,
                              allfreqs=Freqs,probs=t(FinalProbs)))
              }
          }
)

#Cross validation
setGeneric("ExposureClassifyCV",
           def=function(signexp_obj=NA, labels, addata=NA, method="knn", 
                        max_instances=200, k=3, 
                        weights=NA, plot_to_file=FALSE, 
                        file="Classification_CV_barplot.pdf",
                        colors=TRUE, min_agree=0.75,
                        fold=8,...){
             standardGeneric("ExposureClassifyCV")
           }
)
setMethod("ExposureClassifyCV",signature(signexp_obj="ANY",labels="character",
                                    addata="ANY",method="ANY", 
                                    max_instances="ANY", k="ANY", 
                                    weights="ANY", plot_to_file="ANY",
                                    file="ANY", colors="ANY", min_agree="ANY",
                                    fold="ANY"),
          function(signexp_obj, labels, addata, method, max_instances, k, 
                   weights, plot_to_file, file, colors, min_agree,fold,...){
            cond_NA<-is.na(labels)
            if(any(cond_NA)){
              labels<-labels[!cond_NA]
              signexp_obj<-Reorder_samples(signexp_obj,which(!cond_NA))
            }
            dp <- dim(signexp_obj@Sign) #[i,n,r]
            de <- dim(signexp_obj@Exp) #[n,j,r]
            i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
            if(r > max_instances){
              select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
              signexp_obj@Exp <-signexp_obj@Exp[,,select_instances]
              signexp_obj@Sign<-signexp_obj@Sign[,,select_instances]
              r<-max_instances
            }
            classes<-levels(as.factor(labels))
            counts<-sapply(classes,function(cl){sum(labels==cl)})
            ntest<-length(labels)
            sample_group<-rep(0,ntest)
            #split samples in fold subgroups.
            for(cl in 1:length(classes)){
              cond_class<-!is.na(labels) & labels==classes[cl]
              n_class<-sum(cond_class)
              thisgroups<-rep(1:fold,ceiling(n_class/fold))[1:n_class]
              #Random permutation
              thisgroups<-thisgroups[sample(c(1:n_class),n_class,replace=F)]
              sample_group[cond_class]<-thisgroups
            }
            Classifications<-future_lapply(as.list(1:fold),function(kk){
              thislabels<-labels
              thislabels[sample_group==kk]<-NA
              This_result<-ExposureClassify(signexp_obj, thislabels, addata, 
                                            method, max_instances, k, weights, 
                                            plot_to_file=TRUE, file="tmp.pdf", 
                                            colors, min_agree,...)
              return(This_result)              
            })
            found_class<-rep(NA,ntest)
            found_freqs<-rep(0,ntest)
            Allfound_freqs<-matrix(0,length(classes),ntest)
            colnames(Allfound_freqs)<-signexp_obj@samples
            Allfound_probs<-matrix(0,length(classes),ntest)
            colnames(Allfound_probs)<-signexp_obj@samples
            for(kk in 1:fold){
              thiscond<-sample_group==kk
              This_result<-Classifications[[kk]]
              found_class[thiscond]<-This_result$class
              found_freqs[thiscond]<-This_result$freq
              Allfound_freqs[,thiscond]<-This_result$allfreqs
              Allfound_probs[,thiscond]<-This_result$probs
            }
            rownames(Allfound_freqs)<-rownames(This_result$allfreqs)
            rownames(Allfound_probs)<-rownames(This_result$probs)
            Confusion<-data.frame(Truth=labels,Found=found_class)
            Tb<-table(Confusion)
            Mfreqs<-Mymelt(Allfound_freqs) #counts
            colnames(Mfreqs)[3]<-"Frequency"
            if(plot_to_file){
              if(length(grep("\\.pdf$",file))==0){
                file<-paste(file,"pdf",sep=".")
              }
              pdf(file,width=max(5,2*ntest),height=7)
            }else{
              if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                        names(dev.cur()),perl=TRUE)){
                dev.new(width=max(5,2*ntest), height=7)
              }
            }
            g<-ggplot(Mfreqs, aes(x=Col,y=Frequency,fill=Row)) + 
              geom_col(position='stack') +  
              theme(axis.text.x=element_blank(), axis.ticks.x = element_blank()) + 
              coord_cartesian(expand=0) + 
              labs(x="Samples",fill="Class")
            plot(g)
            if(plot_to_file){
              dev.off()
              outmessage<-paste("Crossvalidation results were",
                                "plotted to the file", file,
                                "on the current directory.",sep=" ")
              cat(outmessage,"\n")
            }
            return(list(confusion_matrix=Tb,
                        class=found_class,
                        freq=found_freqs, #winner frequency
                        allfreqs=Allfound_freqs,#all frequencies
                        probs=Allfound_probs))  #all probs found in classifiers
          }
)

Mymelt<-function(M){
    D<-data.frame(M)
    Di<-NROW(D)
    Dj<-NCOL(D)
    N<-data.frame(matrix(0.1,Di*Dj,3))
    colnames(N)<-c("Row","Col","value")
    for(i in 1:Di){
        for(j in 1:Dj){
            N[(j-1)*Di+i,1:2]<-c(rownames(D)[i],colnames(D)[j])
            N[(j-1)*Di+i,3]<-D[i,j]
        }
    }
    return(N)
}


################################################################################
# Exposure GLM multiv.:
################################################################################
setGeneric("ExposureGLM",
           def=function(Exposures, feature, max_instances=200,
                        cutoff_pvalue=0.05, quant=0.5,
                        plot_to_file=FALSE, 
                        file="ExposureGLM_plot.pdf",
                        colors=TRUE,...){
               standardGeneric("ExposureGLM")
           }
)

setMethod("ExposureGLM",signature(Exposures="matrix",feature="numeric",
                                  max_instances="ANY", cutoff_pvalue="ANY",
                                  plot_to_file="ANY", file="ANY",colors="ANY"),
          function(Exposures, feature, max_instances, cutoff_pvalue, 
                   plot_to_file, file, colors,...){
              de <- dim(Exposures) #[n,j]
              n<-de[[1]]; j<-de[[2]]
              Es <- data.frame(t(Exposures),target=feature)
              if(colors){ col1<-"darkgreen"; col2<-"red"; col3<-"blue"
              }else{ col1<-"black"; col2<-"black"; col3<-"black"  }
              gl<-glm("target~.", data=Es,...)
              thisTable<-summary(gl)$coefficients
              Pvalues<-thisTable[-1,4]
              #p-values boxplots
              signif_signatures <- rep(TRUE,n)
              if(!is.na(cutoff_pvalue)){
                  signif_signatures <- signif_signatures & 
                      !is.na(Pvalues) & 
                      Pvalues <= cutoff_pvalue  
              }
              Lpval <- -1*log(Pvalues)
              Lpval[Lpval>750]<-750 ### avoid Inf
              y.min<-min(Lpval)
              y.max<-max(Lpval)
              lcut <- -1*log(cutoff_pvalue)
              cor<-rep("black",n)
              cor[signif_signatures]<-col1
              bigexp<-rep(NA,n)
              if(!is.null(rownames(Exposures)[1])){
                  boxnames<-rownames(Exposures)
              }else{
                  boxnames<-paste("S",1:n,sep="")
              }
              boxlines<-rep(0.5,n)
              if(plot_to_file){
                  if(length(grep("\\.pdf$",file))==0){file<-paste(file,"pdf",sep=".")}
                  pdf(file,width=7,height=7)
                  par(mfrow=c(1,1),mar=c(3.1,4.2,2,2))
              }else{
                  if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                            names(dev.cur()),perl=TRUE)){
                      dev.new(width=7, height=7)
                  }
                  par(mfrow=c(1,1),mar=c(4.2,5.2,2,2))
              }
              ####### Plotting
              plot(1:n,rep(lcut,n),type="n",main="",xlab="",ylab="-log(pvalue)",
                   xlim=c(0.5,n+0.5),ylim=c(y.min,y.max),xaxt="n",cex.lab=1.2)
              boxplot(data.frame(t(Lpval)),at=1:n,add=TRUE,border=cor,
                      names=rep("",n),pch=45)
              mtext(boxnames,side=1,line=boxlines,at=1:n,cex=1,las=1)
              lines(x=c(0,n+1),y=rep(lcut,2),col=col2)
              for (k in 1:n){segments(k-0.39,Lpval[k],k+0.39,Lpval[k],col=col3,lwd=2)}
              ####### Plotting end
              if(plot_to_file){
                  dev.off()
                  cat(paste("Exposure GLM analysis results",
                            "were plotted to the file",file,
                            "on the current directory.",sep=" "),"\n")
              }
              signif<-as.integer(signif_signatures)
              return(list("Significance"=signif,
                          "Stats"=thisTable,
                          "Pvalues"=Pvalues))
          }
)

setMethod("ExposureGLM",signature(Exposures="SignExp",feature="numeric",
                                  max_instances="ANY", cutoff_pvalue="ANY",
                                  quant="ANY", plot_to_file="ANY", 
                                  file="ANY",colors="ANY"),
          function(Exposures, feature, max_instances, cutoff_pvalue, quant, 
                   plot_to_file, file, colors,...){
              if(!Exposures@normalized) Exposures<-Normalize(Exposures)
              dp <- dim(Exposures@Sign) #[i,n,r]
              de <- dim(Exposures@Exp) #[n,j,r]
              i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
              if(r > max_instances){
                select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
                Exposures@Exp <-Exposures@Exp[,,select_instances]
                Exposures@Sign<-Exposures@Sign[,,select_instances]
                r<-max_instances
              }
              Es <- Exposures@Exp
              if(colors){ col1<-"darkgreen"; col2<-"red"; col3<-"blue"
              }else{ col1<-"black"; col2<-"black"; col3<-"black"  }
              Stats<-array(as.vector(
                  future_apply(Es,3,function(D){
                      thisexposures<-data.frame(t(D),target=feature)
                      gl<-glm("target~.", data=thisexposures,...)
                      thisTable<-summary(gl)$coefficients
                      return(thisTable)
                  })
              ),dim=c((n+1),4,r))
              Pvalues.df<-Stats[-1,4,,drop=T] #2:(n+1)
              
              Pvalues_quant<-apply(Pvalues.df,1,function(v){quantile(v,probs=quant, na.rm=T)})
              #p-values boxplots
              signif_signatures <- rep(TRUE,n)
              if(!is.na(cutoff_pvalue)){
                  signif_signatures <- signif_signatures & 
                      !is.na(Pvalues_quant) & 
                      Pvalues_quant <= cutoff_pvalue  
              }
              Lpval <- -1*log(Pvalues.df)
              Lpval[Lpval>750]<-750 ### avoid Inf
              y.min<-min(Lpval)
              y.max<-max(Lpval)
              invquant<-1-quant
              Lp_quant<-apply(Lpval,1,quantile,invquant,na.rm=TRUE)
              lcut <- -1*log(cutoff_pvalue)
              cor<-rep("black",n)
              cor[signif_signatures]<-col1
              bigexp<-rep(NA,n)
              #boxnames<-Exposures@signames
              #boxlines<-rep(0.5,n)
              if(plot_to_file){
                  if(length(grep("\\.pdf$",file))==0){file<-paste(file,"pdf",sep=".")}
                  pdf(file,width=7,height=7)
                  par(mfrow=c(1,1),mar=c(3.1,4.2,2,2))
              }else{
                  if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                            names(dev.cur()),perl=TRUE)){
                      dev.new(width=7, height=7)
                  }
                  par(mfrow=c(1,1),mar=c(4.2,5.2,2,2))
              }
              ####### Plotting
              #plot(1:n,rep(lcut,n),type="n",main="",xlab="",ylab="-log(pvalue)",
              #     xlim=c(0.5,n+0.5),ylim=c(y.min,y.max),xaxt="n",cex.lab=1.2)
              #boxplot(data.frame(t(Lpval)),at=1:n,add=TRUE,border=cor,
              #        names=rep("",n),pch=45)
              #mtext(boxnames,side=1,line=boxlines,at=1:n,cex=1,las=1)
              #lines(x=c(0,n+1),y=rep(lcut,2),col=col2)
              #for (k in 1:n){segments(k-0.39,Lp_quant[k],k+0.39,Lp_quant[k],col=col3,lwd=2)}
              ####### Plotting end
              ####################### ggplot2
              signature_names<-Exposures@signames
              sig_order<-order(signature_names)
              md<-data.frame(Sig=rep(signature_names,each=r),
                             Pvalues=as.vector(t(Lpval)))
              ms = group_by(md, Sig) %>% summarize(q1=min(Pvalues),
                                                   q2=quantile(Pvalues,p=0.25),
                                                   q3=median(Pvalues),
                                                   q4=quantile(Pvalues,p=0.75),
                                                   q5=max(Pvalues))
              segments<-data.frame(Sig=signature_names[sig_order],x_bgn=c(1:n)-0.4,x_end=c(1:n)+0.4,
                                   y_bgn=Lp_quant[sig_order],
                                   y_end=Lp_quant[sig_order],
                                   q1=0,q2=0,q3=0,q4=0,q5=0)
              g1<-ggplot(ms, aes(x=Sig,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
                  geom_boxplot(stat='identity',show.legend = FALSE) +
                  geom_hline(yintercept=lcut,col=col2) +
                  geom_segment(aes(x = x_bgn, y = y_bgn, xend = x_end, yend = y_end), col = col3,
                               data = segments, show.legend = FALSE)+
                  theme_classic(base_size = 15) + #theme_bw()+
                  theme(axis.text.x=element_text(angle=90,vjust=.5,hjust=0,face="bold")) + 
                  labs(x="",y="-log(pvalue)")
              plot(g1)

              if(plot_to_file){
                  dev.off()
                  cat(paste("Exposure GLM analysis results",
                            "were plotted to the file",file,
                            "on the current directory.",sep=" "),"\n")
              }
              signif<-as.integer(signif_signatures)
              return(list("Significance"=signif,
                          "Stats"=Stats,
                          "Pvalues"=Pvalues.df))
          }
)

################################################################################
# Coxmodel multiv.:
################################################################################
setGeneric("ExposureSurvModel",
           def=function(Exposures=NA, 
                        surv, 
                        addata=NA,
                        max_instances=200,
                        quant=0.5, #quantile of statistics used to attribute significance. Higher means stricter.  
                        cutoff_pvalue=0.05,
                        cutoff_hr=NA,
                        plot_to_file=FALSE, 
                        file="ExposureSurvModel_plot.pdf",
                        colors=TRUE,...){
               standardGeneric("ExposureSurvModel")
           }
)

setMethod("ExposureSurvModel",signature(Exposures="matrix",surv="ANY",
                                        addata="ANY", max_instances="ANY",
                                        quant="ANY", cutoff_pvalue="ANY",
                                        cutoff_hr="ANY", plot_to_file="ANY", 
                                        file="ANY",colors="ANY"),
          function(Exposures, surv, addata, max_instances, quant, cutoff_pvalue,
                   cutoff_hr, plot_to_file, file, colors,...){
              if(is.Surv(surv)){
                  dtime <- surv[,1]
                  os <- surv[,2]
              }else{
                  if( is.matrix(surv) & all(c("time","status") %in% colnames(surv)) ){
                      dtime <- surv[,"time"] 
                      os <- surv[,"status"]
                  }else stop("'surv' should be a Surv object or a matrix ",
                             "with 'time' and 'status' columns.\n")
                  surv<-Surv(dtime,os)
              }
              de <- dim(Exposures) #[n,j]
              n<-de[[1]]; j<-de[[2]]
              Ehat<-t(Exposures)
              invquant<-1-quant
              if(is.null(colnames(Ehat))){
                  signature_names<-paste("Sig",1:n,sep="")
                  colnames(Ehat)<-signature_names
              }else{
                  signature_names<-colnames(Ehat)
              }
              if(colors){ col1<-"darkgreen"; col2<-"red"; col3<-"blue"
              }else{ col1<-"black"; col2<-"black"; col3<-"black"  }
              if(plot_to_file){
                  if(length(grep("\\.pdf$",file))==0){file<-paste(file,"pdf",sep=".")}
                  pdf(file,width=7,height=7)
                  par(mfrow=c(2,1),mar=c(3.1,4.2,2,2))
              }else{
                  if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                            names(dev.cur()),perl=TRUE)){
                      dev.new(width=7, height=7)
                  }
                  par(mfrow=c(2,1),mar=c(4.2,5.2,2,2))
              }
              const<-min(Ehat[Ehat>0])*1e-3
              model0<-!is.na(addata[[1]][1])
              if(model0){ 
                  cph0<-coxph(Surv(dtime,os)~., data=data.frame(dtime,os,addata))
                  #addata<-matrix(1,j,1)
                  #colnames(addata)<-"Intercept0"
                  thisdata<-data.frame(dtime,os,log2(Ehat+const),addata)
                  colnames(thisdata)<-c("time","os",signature_names,
                                        colnames(as.data.frame(addata)))
              }else{
                  thisdata<-data.frame(dtime,os,log2(Ehat+const))
                  colnames(thisdata)<-c("time","os",signature_names)
              }
              cph<-coxph(Surv(dtime,os)~., data=thisdata)
              coxz<-cox.zph(cph)
              Pvalues_prop<-round(coxz$table[,3],5)
              thisTable<-summary(cph)
              Pvalues<-round(thisTable$coefficients[,5],5)
              HR<-round(thisTable$coefficients[,2],5)
              multiv.tests<-cox_as_data_frame(thisTable)[,4:10]
              colnames(multiv.tests)[7]<-"P.value"
              if(model0){
                  thisAnova<-anova(cph0,cph)
                  pvalAnova<-thisAnova[[4]][2]
                  LpvalAnova <- -1*log(pvalAnova)
                  y.max_anova<-max(LpvalAnova)
                  y.min_anova<-min(LpvalAnova)
                  Lp_quant_Anova<-quantile(LpvalAnova,invquant,na.rm=TRUE)
              }
              signif_signatures <- rep(TRUE,n)
              if(!is.na(cutoff_pvalue)){
                  signif_signatures <- signif_signatures & 
                      !is.na(Pvalues) & 
                      Pvalues <= cutoff_pvalue  
              }
              if(!is.na(cutoff_hr)){
                  signif_signatures <- signif_signatures & 
                      !is.na(HR) & 
                      abs(log(HR)) >= log(cutoff_hr)  
              }
              Lpval <- -1*log(Pvalues)
              Lpval[Lpval>750]<-750 ### avoid Inf
              y.min<-min(Lpval)
              y.max<-max(Lpval)
              lcut <- -1*log(cutoff_pvalue)
              lcut <- -1*log(cutoff_pvalue)
              Lp_quant<-Lpval
              cor[signif_signatures]<-col1
              bigexp<-rep(NA,n)
              boxnames<-signature_names
              boxlines<-rep(0.5,n)
              pos=c(1:n)
              box_width=0.8
              data<-multiv.tests
              np <- ifelse((data$P.value < 0.05), ifelse((data$P.value < 0.01), paste0(round(data$P.value, 3)," **"), 
                                                         paste0(round(data$P.value, 3)," *")), round(data$P.value,3))
              hr.ic <- ifelse((!is.na(data$HR)), paste0(round(data$HR, 4),' [',round(data$Lower_CI,3),',',round(data$Upper_CI,3),']'), NA)
              tabletext <- cbind(c("Variable",rownames(data)), 
                                 c("Hazard Ratio [95% CI]",hr.ic),
                                 c("P-Value",np))
              ####### Plotting p-values boxplots & forestplots univariate
              ####################### ggplot2
              md<-data.frame(Sig=signature_names,
                             Pvalues=as.vector(Lpval))
              ms = group_by(md, Sig) %>% summarize(q1=min(Pvalues),
                                                   q2=quantile(Pvalues,p=0.25),
                                                   q3=median(Pvalues),
                                                   q4=quantile(Pvalues,p=0.75),
                                                   q5=max(Pvalues))
              sig_order<-order(signature_names)
              segments<-data.frame(Sig=signature_names[sig_order],x_bgn=c(1:n)-0.4,x_end=c(1:n)+0.4,
                                   y_bgn=Lp_quant[sig_order],
                                   y_end=Lp_quant[sig_order],
                                   q1=0,q2=0,q3=0,q4=0,q5=0)
              g1<-ggplot(ms, aes(x=Sig,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
                geom_boxplot(stat='identity',show.legend = FALSE,col=cor) +
                geom_hline(yintercept=lcut,col=col2) +
                geom_segment(aes(x = x_bgn, y = y_bgn, xend = x_end, yend = y_end), col = col3,
                             data = segments, show.legend = FALSE)+
                theme_bw()+
                theme(axis.text.x=element_text(angle=0,vjust=.5,hjust=0,face="bold")) + 
                labs(x="",y="-log(pvalue)")
              
              fp <- ggplot(multiv.tests, aes(x = HR, y = labels, xmin = Lower_CI, xmax = Upper_CI)) +
                geom_hline(aes(yintercept = labels, colour = colour), size = 7) + 
                geom_pointrange(shape = 22, fill = "black") +
                geom_vline(xintercept = 1, linetype = 3) +
                ylab("") +
                xlab("Hazard Ratio with 95% CI") +
                theme_classic() +
                scale_colour_identity() +
                scale_y_discrete(limits = rev(multiv.tests$labels)) +
                scale_x_log10(limits = c(0.25, 4), 
                              breaks = c(0.25, 0.5, 1, 2, 4), 
                              labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
                theme(axis.text.y = element_blank(), axis.title.y = element_blank())
              
              multiv.table<-data.frame(labels=multiv.tests[,"labels"],
                                       HR_CI=paste(round(multiv.tests$HR,3)," (",
                                                   round(multiv.tests$Lower_CI,3),"-",
                                                   round(multiv.tests$Upper_CI,3),")",sep=""),
                                       P.value=signif(multiv.tests$P.value,3),
                                       colour=multiv.tests$colour)
              
              ggtable <- ggplot(data = multiv.table, aes(y = labels)) +
                geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
                geom_text(aes(x = 0, label = labels), hjust = 0) +
                geom_text(aes(x = 3, label = HR_CI)) +
                geom_text(aes(x = 3, y=n+0.5, label = "HR(CI)")) +
                geom_text(aes(x = 7, label = P.value), hjust = 1) +
                geom_text(aes(x = 7, y=n+0.5, label = "P.value"), hjust = 1) +
                scale_colour_identity() +
                scale_y_discrete(limits = rev(multiv.tests$labels)) +
                theme_void() + 
                theme(plot.margin = margin(5, 0, 35, 0))
              
              forestplot <- ggarrange(plotlist = list(ggtable,fp),
                                      ncol = 2, nrow = 1)
              if(model0){
                md<-data.frame(Sig="Anova",
                               Pvalues=as.vector(LpvalAnova))
                ms = group_by(md, Sig) %>% summarize(q1=min(Pvalues),
                                                     q2=quantile(Pvalues,p=0.25),
                                                     q3=median(Pvalues),
                                                     q4=quantile(Pvalues,p=0.75),
                                                     q5=max(Pvalues))

                segments<-data.frame(Sig="Anova",x_bgn=0.6,x_end=1.4,
                                     y_bgn=Lp_quant_Anova,y_end=Lp_quant_Anova,
                                     q1=Lp_quant_Anova,q2=Lp_quant_Anova,
                                     q3=Lp_quant_Anova,q4=Lp_quant_Anova,
                                     q5=Lp_quant_Anova)
                g2<-ggplot(ms, aes(x=Sig,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
                  geom_boxplot(stat='identity',show.legend = FALSE,col=cor) +
                  geom_hline(yintercept=lcut,col=col2) +
                  geom_segment(aes(x = x_bgn, y = y_bgn, xend = x_end, yend = y_end), col = col3,
                               data = segments, show.legend = FALSE)+
                  theme_bw()+
                  theme(axis.text.x=element_text(angle=0,vjust=.5,hjust=0,face="bold")) + 
                  labs(x="",y="-log(pvalue)")
                final_figure <- ggarrange(g1,forestplot,g2,ncol = 1)
              }else{
                final_figure <- ggarrange(g1,forestplot,ncol = 1)
              } 
              plot(final_figure)
              
              ####### Plotting end
              if(plot_to_file){
                  dev.off()
                  cat(paste("Exposure Survival model results",
                            "were plotted to the file",file,
                            "on the current directory.",sep=" "),"\n")
              }
              signif<-as.integer(signif_signatures)
              return(list("Significance"=signif,
                          "Stats"=multiv.tests))
          })


setMethod("ExposureSurvModel",signature(Exposures="SignExp",surv="ANY",
                                        addata="ANY", max_instances="ANY",
                                        quant="ANY", cutoff_pvalue="ANY", 
                                        cutoff_hr="ANY", plot_to_file="ANY",
                                        file="ANY", colors="ANY"),
          function(Exposures, surv, addata, max_instances, quant, cutoff_pvalue,
                   cutoff_hr, plot_to_file, file, colors,...){
              if(!Exposures@normalized) Exposures<-Normalize(Exposures)
              if(is.Surv(surv)){
                  dtime <- surv[,1]
                  os <- surv[,2]
              }else{
                  if( is.matrix(surv) & all(c("time","status") %in% colnames(surv)) ){
                      dtime <- surv[,"time"] 
                      os <- surv[,"status"]
                  }else stop("'surv' should be a Surv object or a matrix ",
                             "with 'time' and 'status' columns.\n")
                  surv<-Surv(dtime,os)
              }
              de <- dim(Exposures@Exp) #[n,j,r]
              n<-de[[1]]; j<-de[[2]]; r<-de[[3]]
              if(r > max_instances){
                select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
                Exposures@Exp <-Exposures@Exp[,,select_instances]
                Exposures@Sign<-Exposures@Sign[,,select_instances]
                r<-max_instances
              }
              invquant<-1-quant
              Ehat<-Median_exp(Exposures)
              Es<-Exposures@Exp
              signature_names<-Exposures@signames
              rownames(Ehat)<-signature_names
              model0<-!is.na(addata[[1]][1])
              if(model0){ 
                  cph0<-coxph(Surv(dtime,os)~., data=data.frame(dtime,os,addata))
              }else{
                  addata<-matrix(0,j,0) # empty matrix
              }
              Allstats<-future_apply(Es,3,function(D){
                  E<-t(D)
                  const<-min(E[E>0])*1e-3
                  thisdata<-data.frame(dtime,os,log2(E+const),addata)
                  cph<-coxph(Surv(dtime,os)~., data=thisdata)
                  coxz<-cox.zph(cph)
                  pval_prop.df<-round(coxz$table[,3],5)[1:(n+1)] #<-output n+1 real
                  thisTable<-summary(cph)
                  pval.df<-round(thisTable$coefficients[,5],5)[1:n] #<-output n real
                  hr.df<-round(thisTable$coefficients[,2],5)[1:n] #<-output n real
                  mult.tests.ar<-as.vector(as.matrix(cox_as_data_frame(thisTable)[1:n,4:10])) #<-output nx7 real
                  if(model0){
                      thisAnova<-anova(cph0,cph)
                      pval.anova<-thisAnova[[4]][2] #<-output 1 real
                  }else{
                      pval.anova<-NA
                  }
                  return(c(pval.anova,pval_prop.df,pval.df,hr.df,mult.tests.ar))
              })
              pvalAnova<-as.vector(Allstats[1,])
              Pvalues_prop.df<-data.frame(Allstats[2:(n+2),])
              Pvalues.df<-data.frame(Allstats[(n+3):(2*n+2),])
              HR.df<-data.frame(Allstats[(2*n+3):(3*n+2),])
              multiv.tests.ar<-array(as.vector(Allstats[(3*n+3):(10*n+2),]),dim=c(n,7,r))
              if(model0){
                 LpvalAnova <- -1*log(pvalAnova)
                 y.max_anova<-max(LpvalAnova)
                 y.min_anova<-min(LpvalAnova)
                 Lp_quant_Anova<-quantile(LpvalAnova,invquant,na.rm=TRUE)
              }
              if(colors){ col1<-"darkgreen"; col2<-"red"; col3<-"blue"
              }else{ col1<-"black"; col2<-"black"; col3<-"black"  }
              if(plot_to_file){
                  if(length(grep("\\.pdf$",file))==0){file<-paste(file,"pdf",sep=".")}
                  pdf(file,width=7,height=7)
                  par(mfrow=c(n,2),mar=c(3.1,4.2,2,2))
              }else{
                  if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                            names(dev.cur()),perl=TRUE)){
                      dev.new(width=7, height=7)
                  }
                  par(mfrow=c(n,2),mar=c(4.2,5.2,2,2))
              }
              Pvalues_quant<-apply(Pvalues.df,1,function(v){quantile(v,probs=quant)})
              HR_quantiles=apply(multiv.tests.ar[,1,],1,function(v){quantile(v,probs=c(invquant,0.5,quant))})
              cond<-HR_quantiles[2,]>=1
              HR_quant<-ifelse(cond,HR_quantiles[1,],HR_quantiles[3,])#if HR>1, takes inferior quantile.
              signif_signatures <- rep(TRUE,n)
              if(!is.na(cutoff_pvalue)){
                  signif_signatures <- signif_signatures & 
                      !is.na(Pvalues_quant) & 
                      Pvalues_quant <= cutoff_pvalue  
              }
              if(!is.na(cutoff_hr)){
                  signif_signatures <- signif_signatures & 
                      !is.na(HR_quant) & 
                      abs(log(HR_quant)) >= log(cutoff_hr)  
              }
              Lpval <- -1*log(Pvalues.df)
              Lpval[Lpval>750]<-750 ### avoid Inf
              y.min<-min(Lpval)
              y.max<-max(Lpval)
              lcut <- -1*log(cutoff_pvalue)
              Lp_quant<-apply(Lpval,1,quantile,invquant,na.rm=TRUE)
              cor<-rep("black",n)
              cor[signif_signatures]<-col1
              bigexp<-rep(NA,n)
              boxnames<-signature_names
              boxlines<-rep(0.5,n)
              pos=c(1:n)
              box_width=0.8
              bplt<-boxplot(data.frame(t(Lpval)),at=1:n,plot=FALSE,
                            names=rep("",n),pch=45)
              boxplot_stats<-bplt$stats
              Lower_CI_quant=apply(multiv.tests.ar[,2,],1,function(v){quantile(v,probs=invquant)})
              Upper_CI_quant=apply(multiv.tests.ar[,3,],1,function(v){quantile(v,probs=quant)})
              Inv_HR_quantiles=apply(multiv.tests.ar[,4,],1,function(v){quantile(v,probs=c(invquant,quant))})
              Inv_HR_quant=ifelse(cond,Inv_HR_quantiles[2,],Inv_HR_quantiles[1,])
              Inv_Lower_CI_quant=apply(multiv.tests.ar[,5,],1,function(v){quantile(v,invquant)})
              Inv_Upper_CI_quant=apply(multiv.tests.ar[,6,],1,function(v){quantile(v,probs=quant)})
              P.value_quant=apply(multiv.tests.ar[,7,],1,function(v){quantile(v,probs=quant)})
              multiv.tests = data.frame(HR=HR_quant, 
                                        Lower_CI=Lower_CI_quant, 
                                        Upper_CI=Upper_CI_quant,
                                        Inv_HR=Inv_HR_quant,
                                        Inv_Lower_CI=Inv_Lower_CI_quant,
                                        Inv_Upper_CI=Inv_Upper_CI_quant,
                                        P.value=P.value_quant
              )
              rownames(multiv.tests)<-signature_names
              multiv.tests$labels<-signature_names
              multiv.tests$colour <- rep(c("white", "gray95"), ceiling(nrow(multiv.tests)/2))[1:nrow(multiv.tests)]
              ####### Plotting
              ####################### ggplot2
              md<-data.frame(Sig=rep(signature_names,each=r),
                             Pvalues=as.vector(t(Lpval)))
              ms = group_by(md, Sig) %>% summarize(q1=min(Pvalues),
                                                   q2=quantile(Pvalues,p=0.25),
                                                   q3=median(Pvalues),
                                                   q4=quantile(Pvalues,p=0.75),
                                                   q5=max(Pvalues))
              sig_order<-order(signature_names)
              segments<-data.frame(Sig=signature_names[sig_order],x_bgn=c(1:n)-0.4,x_end=c(1:n)+0.4,
                                   y_bgn=Lp_quant[sig_order],
                                   y_end=Lp_quant[sig_order],
                                   q1=0,q2=0,q3=0,q4=0,q5=0)
              g1<-ggplot(ms, aes(x=Sig,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
                geom_boxplot(stat='identity',show.legend = FALSE,col=cor) +
                geom_hline(yintercept=lcut,col=col2) +
                geom_segment(aes(x = x_bgn, y = y_bgn, xend = x_end, yend = y_end), col = col3,
                             data = segments, show.legend = FALSE)+
                theme_bw()+
                theme(axis.text.x=element_text(angle=0,vjust=.5,hjust=0,face="bold")) + 
                labs(x="",y="-log(pvalue)")
              
              fp <- ggplot(multiv.tests, aes(x = HR, y = labels, xmin = Lower_CI, xmax = Upper_CI)) +
                geom_hline(aes(yintercept = labels, colour = colour), size = 7) + 
                geom_pointrange(shape = 22, fill = "black") +
                geom_vline(xintercept = 1, linetype = 3) +
                ylab("") +
                xlab("Hazard Ratio with 95% CI") +
                theme_classic() +
                scale_colour_identity() +
                scale_y_discrete(limits = rev(multiv.tests$labels)) +
                scale_x_log10(limits = c(0.25, 4), 
                              breaks = c(0.25, 0.5, 1, 2, 4), 
                              labels = c("0.25", "0.5", "1", "2", "4"), expand = c(0,0)) +
                theme(axis.text.y = element_blank(), axis.title.y = element_blank())
              
              multiv.table<-data.frame(labels=multiv.tests[,"labels"],
                                       HR_CI=paste(round(multiv.tests$HR,3)," (",
                                                   round(multiv.tests$Lower_CI,3),"-",
                                                   round(multiv.tests$Upper_CI,3),")",sep=""),
                                       P.value=signif(multiv.tests$P.value,3),
                                       colour=multiv.tests$colour)
              
              ggtable <- ggplot(data = multiv.table, aes(y = labels)) +
                geom_hline(aes(yintercept = labels, colour = colour), size = 7) +
                geom_text(aes(x = 0, label = labels), hjust = 0) +
                geom_text(aes(x = 3, label = HR_CI)) +
                geom_text(aes(x = 3, y=n+0.5, label = "HR(CI)")) +
                geom_text(aes(x = 7, label = P.value), hjust = 1) +
                geom_text(aes(x = 7, y=n+0.5, label = "P.value"), hjust = 1) +
                scale_colour_identity() +
                theme_void() + 
                theme(plot.margin = margin(5, 0, 35, 0))
              
              forestplot <- ggarrange(plotlist = list(ggtable,fp),
                                  ncol = 2, nrow = 1)
              if(model0){
                md<-data.frame(Sig=rep("Anova",r),
                               Pvalues=as.vector(LpvalAnova))
                ms = group_by(md, Sig) %>% summarize(q1=min(Pvalues),
                                                     q2=quantile(Pvalues,p=0.25),
                                                     q3=median(Pvalues),
                                                     q4=quantile(Pvalues,p=0.75),
                                                     q5=max(Pvalues))
                sig_order<-order(signature_names)
                segments<-data.frame(Sig=signature_names[sig_order],x_bgn=0.6,x_end=1.4,
                                     y_bgn=Lp_quant_Anova[sig_order],
                                     y_end=Lp_quant_Anova[sig_order],
                                     q1=0,q2=0,q3=0,q4=0,q5=0)
                g2<-ggplot(ms, aes(x=Sig,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
                  geom_boxplot(stat='identity',show.legend = FALSE,col='black') +
                  geom_hline(yintercept=lcut,col=col2) +
                  geom_segment(aes(x = x_bgn, y = y_bgn, xend = x_end, yend = y_end), col = col3,
                               data = segments, show.legend = FALSE)+
                  theme_bw()+
                  theme(axis.text.x=element_text(angle=0,vjust=.5,hjust=0,face="bold")) + 
                  labs(x="",y="-log(pvalue)")
                final_figure <- ggarrange(g1,forestplot,g2,ncol = 1)
              }else{
                final_figure <- ggarrange(g1,forestplot,ncol = 1)
              } 
              plot(final_figure)
              ####### Plotting end
              if(plot_to_file){
                  dev.off()
                  cat(paste("Exposure Survival model results",
                            "were plotted to the file",file,
                            "on the current directory.",sep=" "),"\n")
              }
              signif<-as.integer(signif_signatures)
              return(list("Significance"=signif,
                          "Stats"=multiv.tests))
          })


################################################################################
# Fuzzy Clustering:
################################################################################
setGeneric("FuzzyClustExp",
    def=function(signexp_obj,  max_instances=200, Clim=NA_integer_, Med_exp=NA,
                 method.dist="euclidean", method.clust="fcm", relative=FALSE, 
                 m=2, plot_to_file=FALSE, file="FuzzyClustExp.pdf",colored=TRUE,
                 iseed=NA_integer_, try_all=FALSE, fast=TRUE,
                 parplan="multisession",...){
        standardGeneric("FuzzyClustExp")
    }
)
setMethod("FuzzyClustExp",signature(signexp_obj="SignExp",  max_instances="ANY",
                                    Clim="ANY", Med_exp="ANY",method.dist="ANY",
                                    method.clust="ANY", relative="ANY", m="ANY",
                                    plot_to_file="ANY", file="ANY",
                                    colored="ANY",iseed="ANY",try_all="ANY",
                                    fast="ANY", parplan="ANY"),
    function(signexp_obj,  max_instances, Clim, Med_exp, method.dist, 
             method.clust, relative, m, plot_to_file, file, colored, iseed, 
             try_all, fast, parplan,...){
        if(!is.na(iseed)) set.seed(seed = iseed)
        dp <- dim(signexp_obj@Sign) #[i,n,r]
        de <- dim(signexp_obj@Exp) #[n,j,r]
        i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
        if(r > max_instances){
          select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
          signexp_obj@Exp <-signexp_obj@Exp[,,select_instances]
          signexp_obj@Sign<-signexp_obj@Sign[,,select_instances]
          r<-max_instances
        }
        Es<-signexp_obj@Exp
        if (is.na(Clim[1])){
            Clim <- c(2,j-1)
        }else{
            if(length(Clim)==1) Clim<-rep(Clim,2)    
        } 
        Cmin<-Clim[1]
        Cmax<-Clim[2]
        if(try_all){
            step0 <- 1
        }else{
            step0 <- 2^max((floor(log2(Cmax-Cmin+1))-2),0)
        }
        if(is.na(Med_exp[1])){
            Med_exp<-as.matrix(Median_exp(signexp_obj))
        }
        if(fast){
            my_obj<-Med_exp
        }else{
            my_obj<-signexp_obj    
        }
        aseed<-iseed
        if(Cmin<Cmax){
            cat(paste("Evaluating models with the number of groups ranging from ",
                      Cmin," to ",Cmax,", please be patient.\n",sep=""))
            Ops<-Optimal_sigs(testfun=function(n){
                cat(paste("Testing ",n," groups.\n",sep=""))
                Cm<-CmeansExp(my_obj, Med_exp, C=n, method.dist, method.clust,
                              relative, iseed=aseed)
                U<-Cm[[1]]
                PBMF<-as.vector(apply(Cm[[4]],3,function(E){PBMFindex(U,Data=E,m)}))
                return(list(median(PBMF),PBMF))
            },
            liminf=Cmin,limsup=Cmax,step=step0,significance=FALSE,parplan=parplan
            )
            bestn<-as.numeric(Ops[[1]])
            rm(Ops)
            cat(paste("Optimum number of groups is ",bestn,
                      ". Performing final clustering.\n",sep=""))
        }else{
            bestn<-Cmin
            cat(paste("Performing clustering in ",bestn," groups.\n",sep=""))
        }
        Cm<-CmeansExp(signexp_obj, Med_exp, C=bestn, method.dist, method.clust,
            relative, iseed=aseed,parplan=parplan)
        if(plot_to_file){
            if(length(grep("\\.pdf$",file))==0){
                file<-paste(file,"pdf",sep=".")
            }
            pdf(file,width=7,height=7)
        }else{
            if(!grepl("pdf|postscript|cairo_|png|tiff|jpeg|bmp",
                      names(dev.cur()),perl=TRUE)){
                dev.new(width=max(5,2*j), height=7)
            }
        }
        if(colored){
            clr = colorRampPalette(brewer.pal(name="YlOrRd",n=9))(100)
        }else{
            clr=  colorRampPalette(brewer.pal(name="Greys",n=9))(100) 
        }
        # Heatmap
        MF<-Cm$Meanfuzzy
        colnames(MF) = paste("Cl",1:NCOL(MF),sep="")
        rownames(MF)<-signexp_obj@samples
        ph<-pheatmap(MF,border_color=NA, color=clr, 
                 clustering_method='ward.D2',
                 clustering_distance_cols='canberra',
                 cluster_cols = FALSE,
                 show_rownames=FALSE,
                 show_colnames=TRUE,
                 angle_col="90",
                 silent=TRUE)
        gt<-ph[[4]]
        sample_ord<-ph[[1]]$order
        AllF = Cm$AllFuzzy[sample_ord,,]
        rownames(AllF) = signexp_obj@samples[sample_ord]
        colnames(AllF) = paste("Cl",1:NCOL(MF),sep="")
        m = reshape2::melt(AllF)
        colnames(m)<-c("Sample","Cluster","r","value")
        m$Cluster<-as.factor(m$Cluster)
        m$Sample<-as.factor(m$Sample)
        ms = group_by(m, Sample, Cluster) %>% summarize(q1=min(value),
                                                           q2=quantile(value,p=0.25),
                                                           q3=median(value),
                                                           q4=quantile(value,p=0.75),
                                                           q5=max(value))
        ms_ord<-c()
        for(sp in rownames(AllF)){
          for(cl in colnames(AllF)){
              ms_ord<-c(ms_ord,which(ms$Sample==sp & ms$Cluster==cl))
          }
        }
        ms<-ms[ms_ord,]
        plot(ph[[4]])
        samples_per_page<-6
        ## ggplot2 boxplot
        for(bt in 1:ceiling(j/samples_per_page)){
            batch<-(samples_per_page*NCOL(MF)*(bt-1)+1):(samples_per_page*NCOL(MF)*bt)
            batch<-batch[batch<=NROW(ms)]
            g<-ggplot(ms[batch,], aes(x=Cluster,ymin=q1,lower=q2,middle=q3,upper=q4,ymax=q5)) + 
              geom_boxplot(stat='identity') + 
              facet_grid(Sample~.) + 
              theme_cowplot() +
              theme(axis.text.y=element_blank(), axis.ticks.y = element_blank()) +
              theme(strip.text.y=element_text(size=6))+
              labs(x="")
            plot(g)  
        }
        if(plot_to_file){
            dev.off()
        }
        return(Cm[1:3])
    }
)

#CmeansExp
setGeneric("CmeansExp",
    def=function(signexp_obj, Med_exp=NA, C, method.dist="euclidean", 
        method.clust="fcm", relative=FALSE, iseed=NA_integer_,
        parplan="multisession",...){
        standardGeneric("CmeansExp")
    }
)
#For matrix:
setMethod("CmeansExp",signature(signexp_obj="matrix", Med_exp="ANY", C="ANY",
                                method.dist="ANY", method.clust="ANY", 
                                relative="ANY", iseed="ANY",parplan="ANY"),
          function(signexp_obj, Med_exp, C, method.dist, method.clust,
                   relative, iseed, parplan,...){
            # initialize random seed
            if(!is.na(iseed)) set.seed(seed = iseed)
            de <- dim(signexp_obj) #[n,j]
            n<-de[[1]]; j<-de[[2]]
            if(is.na(Med_exp[1])){
              Med_exp<-signexp_obj
            }
            if(relative){ 
              signexp_obj<-t(t(signexp_obj)/colSums(signexp_obj))
              Med_exp<-t(t(Med_exp)/colSums(Med_exp))
              }
            colnames(Med_exp)<-colnames(signexp_obj)
            # if(method.clust=="km"){
            #     baseclust<-kmeans(t(Med_exp),centers=C)
            #     basefuzzy<-t(sapply(baseclust$cluster,function(n){
            #         as.numeric(c(1:C)==n)
            #     }))
            # }else{ 
            #     if(method.clust=="fcm"){
            #         baseclust<-ppclust::fcm(t(Med_exp),centers=C)
            #     }else if (method.clust=="pcm"){
            #         baseclust<-ppclust::pcm(t(Med_exp),centers=C)
            #     }else if (method.clust=="fpcm"){
            #         baseclust<-ppclust::fpcm(t(Med_exp),centers=C)
            #     }else stop("method.clust should be 'fcm', 'pcm' or 'fpcm'!\n")
            #     basefuzzy<-baseclust$u
            # }
            if(method.clust=="km"){
              m <- 1        
            }else{ 
              if(method.clust=="fcm"){
                m <- 2
              }else stop("method.clust should be 'km' or 'fcm'!\n")
            }
            Es<-array(0,dim=c("n"=n,"j"=j,"r"=1))
            Es[,,1]<-signexp_obj
            Fuzzy<-FuzzyClusterCpp(Es,Med_exp,C,m,0.01)
            #return(list(Meanfuzzy=basefuzzy,AllFuzzy=basefuzzy))
            Meanfuzzy<-apply(Fuzzy[[1]],c(1,2),mean)
            return(list(Meanfuzzy=Meanfuzzy,
                        AllFuzzy=Fuzzy[[1]],
                        Centroids=Fuzzy[[2]],
                        Es=Es))
          })
#For SignExp
setMethod("CmeansExp",signature(signexp_obj="SignExp", Med_exp="ANY", C="ANY",
                                method.dist="ANY", method.clust="ANY", 
                                relative="ANY",iseed="ANY",parplan="ANY"),
          function(signexp_obj, Med_exp, C, method.dist, method.clust,
                   relative, iseed, parplan,...){
            # initialize random seed
            if(!is.na(iseed)) set.seed(seed = iseed)
            if(!signexp_obj@normalized) signexp_obj<-Normalize(signexp_obj)
            dp <- dim(signexp_obj@Sign) #[i,n,r]
            de <- dim(signexp_obj@Exp) #[n,j,r]
            i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
            if(is.na(Med_exp[1])){
              Med_exp<-Median_exp(signexp_obj)
            }
            Es<-signexp_obj@Exp
            if(relative){ 
              Med_exp<-t(t(Med_exp)/colSums(Med_exp)) 
              colnames(Med_exp)<-signexp_obj@samples
              for(s in 1:r){
                thisE<-Es[,,s]
                Es[,,s]<-t(t(thisE)/colSums(thisE))
              }
            }
            if(method.clust=="km"){
              m <- 1        
            }else{ 
              if(method.clust=="fcm"){
                m <- 2
              }else stop("method.clust should be 'km' or 'fcm'!\n")
            }
            Fuzzy<-FuzzyClusterCpp(Es,Med_exp,C,m,0.01)
            Meanfuzzy<-apply(Fuzzy[[1]],c(1,2),mean)
            return(list(Meanfuzzy=Meanfuzzy,
                        AllFuzzy=Fuzzy[[1]],
                        Centroids=Fuzzy[[2]],
                        Es=Es))
          })
####
PBMFindex<-function(U,Data,m=2){
    U<-as.matrix(U)
    Data<-as.matrix(Data)
    k<-NCOL(U)
    n<-NROW(U)
    Um<-U^m
    Centroids<-t(t(Data%*%Um)/colSums(Um))
    v0<-rowMeans(Data)
    D0<-Data-v0
    e<-sum(sqrt(colSums(D0^2)))
    dk<-max(dist(t(Centroids)))
    Dc<-(as.matrix(dist(t(cbind(Data,Centroids))))[1:n,(n+1):(n+k)])^2
    #Dc<-(proxy::dist(t(Data),t(Centroids)))^2
    J<-sum(Um*Dc)
    PBMF<-((e*dk)/(k*J))^2
    return(PBMF)
}

################################################################################
# Hierarquical Clustering:
################################################################################
setGeneric("HClustExp",
    def=function(signexp_obj, Med_exp=NA,  max_instances=200, 
                 method.dist="euclidean", method.hclust="average", 
                 use.cor=FALSE, relative=FALSE, plot_to_file=FALSE, 
                 file="HClustExp_dendrogram.pdf", colored=TRUE,...){
        standardGeneric("HClustExp")
    }
)
setMethod("HClustExp",signature(signexp_obj="SignExp", Med_exp="ANY",  
                                max_instances="ANY", method.dist="ANY", 
                                method.hclust="ANY", use.cor="ANY", 
                                relative="ANY", plot_to_file="ANY",
                                file="ANY",colored="ANY"),
    function(signexp_obj, Med_exp,  max_instances, method.dist, method.hclust,
        use.cor, relative, plot_to_file, file, colored,...){
        if(!signexp_obj@normalized) signexp_obj<-Normalize(signexp_obj)
        dp <- dim(signexp_obj@Sign) #[i,n,r]
        de <- dim(signexp_obj@Exp) #[n,j,r]
        i<-dp[[1]]; n<-dp[[2]]; j<-de[[2]]; r<-de[[3]]
        if(r > max_instances){
          select_instances<-sort(sample(1:r,max_instances,replace=FALSE))
          signexp_obj@Exp <-signexp_obj@Exp[,,select_instances]
          signexp_obj@Sign<-signexp_obj@Sign[,,select_instances]
          r<-max_instances
        }
        if(is.na(Med_exp[1])){
            Med_exp<-Median_exp(signexp_obj)
        }
        if(relative){ Med_exp<-t(t(Med_exp)/colSums(Med_exp)) }
        # Med_exp: (n,p) matrix, n-samples, p-variables
        # Use custom distance function
        if(is.function(method.dist)) {
            distance <- method.dist(Med_exp)
        } else {
            distance<-dist.pvclust(Med_exp,method=method.dist,use.cor=use.cor)
        }
        
        # ward -> ward.D
        # only if R >= 3.1.0
        if(method.hclust == "ward" && getRversion() >= '3.1.0') {
            method.hclust <- "ward.D"
        }
        Med_exp.hclust <- hclust(distance, method=method.hclust)
        # multiscale bootstrap
        r <- list(1.0)
        mboot <-list(Exp.hclust(signexp_obj=signexp_obj, 
            object.hclust=Med_exp.hclust, method.dist=method.dist, 
            use.cor=use.cor, method.hclust=method.hclust, relative=relative)) 
        
        result<-Expclust.merge(Med_exp,object.hclust=Med_exp.hclust,mboot=mboot)
        if(plot_to_file){
            pdf(file)
        }
        ####### Plotting
        plot(result)  # plot.pvclust
        ####### Plotting end
        if(plot_to_file){
            dev.off()
        }
        return(result)
    }
    
)
Exp.hclust <- function(signexp_obj, object.hclust, method.dist, use.cor, 
    method.hclust, relative=FALSE)
{ 
    if(!signexp_obj@normalized){
        signexp_obj<-Normalize(signexp_obj)
    }
    Es<-signexp_obj@Exp
    n<-dim(Es)[[1]]
    j<-dim(Es)[[2]]
    r<-dim(Es)[[3]]
    
    pattern   <- hc2split(object.hclust)$pattern
    edges.cnt <- table(factor(pattern)) - table(factor(pattern))
    
    for(k in 1:r){
        Exposure <- Es[,,'r'=k]
        if(n==1) Exposure <- matrix(as.vector(Exposure),n,j)
        if(relative){ Exposure<-t(t(Exposure)/colSums(Exposure)) }
        if(j<=30){
          colnames(Exposure)<-signexp_obj@samples
        }else{
          colnames(Exposure)<-NULL
        }
        if(is.function(method.dist)) {
            suppressWarnings(distance  <- method.dist(Exposure))
        } else {
            suppressWarnings(distance  <- dist.pvclust(Exposure,
                method=method.dist,use.cor=use.cor))
        }
        if(all(is.finite(distance))) { # check if distance is valid
            x.hclust  <- hclust(distance,method=method.hclust)
            pattern.i <- hc2split(x.hclust)$pattern # split
            edges.cnt <- edges.cnt + table(factor(pattern.i,  levels=pattern))
        } else {
            x.hclust <- NULL
            warning(paste("inappropriate distance matrices",
                " are omitted in computation: r = ", k), call.=FALSE)
        }
    }
    boot <- list(edges.cnt=edges.cnt, method.dist=method.dist, use.cor=use.cor,
        method.hclust=method.hclust, nboot=r, size=n, r=1, store=NA)
    class(boot) <- "boot.hclust"
    return(boot)
}

Expclust.merge <- function(data, object.hclust, mboot){
    pattern <- hc2split(object.hclust)$pattern
    r     <- unlist(lapply(mboot,"[[","r"))
    nboot <- unlist(lapply(mboot,"[[","nboot"))
    store <- lapply(mboot,"[[", "store")
    rl <- length(mboot)
    ne <- length(pattern)
    edges.bp <- edges.cnt <- data.frame(matrix(rep(0,ne*rl),nrow=ne,ncol=rl))
    row.names(edges.bp) <- pattern
    names(edges.cnt) <- paste("r", 1:rl, sep="")
    for(j in 1:rl) {
        edges.cnt[,j] <- as.vector(mboot[[j]]$edges.cnt) 
        edges.bp[,j]  <- edges.cnt[,j] / nboot[j]
    }
    ms.fitted <- lapply(as.list(1:ne),
        function(x, edges.bp, r, nboot){
            msfit(as.vector(t(edges.bp[x,])), r, nboot)},
        edges.bp, r, nboot)
    class(ms.fitted) <- "mslist"
    p    <- lapply(ms.fitted,"[[","p")
    se   <- lapply(ms.fitted,"[[","se")
    coef <- lapply(ms.fitted,"[[","coef")
    au    <- unlist(lapply(p,"[[","au"))
    bp    <- unlist(lapply(p,"[[","bp"))
    se.au <- unlist(lapply(se,"[[","au"))
    se.bp <- unlist(lapply(se,"[[","bp"))
    v     <- unlist(lapply(coef,"[[","v"))
    cc    <- unlist(lapply(coef,"[[","c"))
    pchi  <- unlist(lapply(ms.fitted,"[[","pchi"))
    edges.pv <- data.frame(au=au, bp=bp, se.au=se.au, se.bp=se.bp,
        v=v, c=cc, pchi=pchi)
    row.names(edges.pv) <- row.names(edges.cnt) <- 1:ne
    result <- list(hclust=object.hclust, edges=edges.pv, count=edges.cnt,
        msfit=ms.fitted, nboot=nboot, r=r, store=store)
    class(result) <- "pvclust"
    return(result)
}

#Internal functions from package pvclust:

hc2split <- function(x)
{
    A <- x$merge # (n-1,n) matrix
    n <- nrow(A) + 1
    B <- list()
    
    for(i in 1:(n-1)){
        ai <- A[i,1]
        
        if(ai < 0)
            B[[i]] <- -ai
        else
            B[[i]] <- B[[ai]]        
        
        ai <- A[i,2]
        
        if(ai < 0)
            B[[i]] <- sort(c(B[[i]],-ai))
        else
            B[[i]] <- sort(c(B[[i]],B[[ai]]))
    }
    
    CC <- matrix(rep(0,n*(n-1)),nrow=(n-1),ncol=n)
    
    for(i in 1:(n-1)){
        bi <- B[[i]]
        m <- length(bi)
        for(j in 1:m)
            CC[i,bi[j]] <- 1
    }
    
    split <- list(pattern=apply(CC,1,paste,collapse=""), member=B)
    
    return(split)
}

dist.pvclust <- function(x, method="euclidean", use.cor="pairwise.complete.obs")
{
    if(!is.na(pmatch(method,"correlation"))){
        res <- as.dist(1 - cor(x, method="pearson", use=use.cor))
        attr(res,"method") <- "correlation"
        return(res)
    }
    else if(!is.na(pmatch(method,"abscor"))){
        res <- as.dist(1 - abs(cor(x,method="pearson",use=use.cor)))
        attr(res,"method") <- "abscor"
        return(res)
    }
    else if(!is.na(pmatch(method,"uncentered"))){
        if(sum(is.na(x)) > 0){
            x <- na.omit(x)
            warning("Rows including NAs were omitted")
        }
        x  <- as.matrix(x)
        P  <- crossprod(x)
        qq <- matrix(diag(P),ncol=ncol(P))
        Q  <- sqrt(crossprod(qq))
        res <- as.dist(1 - P/Q)
        attr(res,"method") <- "uncentered"
        return(res)
    }
    else
        dist(t(x),method)
}
rvalieris/signeR documentation built on April 20, 2024, 2:08 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
rvalieris/signeR
Empirical Bayesian approach to mutational signature discovery

R/SignExp_learning.R
In rvalieris/signeR: Empirical Bayesian approach to mutational signature discovery

Defines functions Mymelt

R Package Documentation

Browse R Packages

We want your feedback!

rvalieris/signeR Empirical Bayesian approach to mutational signature discovery

R/SignExp_learning.R In rvalieris/signeR: Empirical Bayesian approach to mutational signature discovery

Defines functions Mymelt

R Package Documentation

Browse R Packages

We want your feedback!

rvalieris/signeR
Empirical Bayesian approach to mutational signature discovery

R/SignExp_learning.R
In rvalieris/signeR: Empirical Bayesian approach to mutational signature discovery