R/metaAnalyses.R

Defines functions internalJaccardIndMatrix topgenes.corClusterDraw_log topgenes.corCluster_log topgenes.corClusterDraw topgenes.corCluster topCommonGenes comp_cor_log2 comp_ratio_sd extractDendroTissueDistance extractDendroFromPlotTD plotTissuesDistance tissuesDistance overlapSpeProtmRNA jaccardInd t_test.Mat_identical.vs.different completeDF.t_test.CorrMat_identical.vs.different t_test.CorrMat_identical.vs.different test_CorrMat_identical_different compute_tissue_corr_matrix

Documented in comp_cor_log2 completeDF.t_test.CorrMat_identical.vs.different comp_ratio_sd compute_tissue_corr_matrix extractDendroFromPlotTD extractDendroTissueDistance internalJaccardIndMatrix jaccardInd overlapSpeProtmRNA plotTissuesDistance test_CorrMat_identical_different tissuesDistance topCommonGenes topgenes.corCluster topgenes.corClusterDraw topgenes.corClusterDraw_log topgenes.corCluster_log t_test.CorrMat_identical.vs.different t_test.Mat_identical.vs.different

## Analyses based on correlation ------------------------------------------------

#' Compute the correlation between two matrices variables( alphabetical order)
#'
#' @param A numeric data.frame for the first study to compare
#' @param B numeric data.frame for the second study to compare
#' @param label.A character string. Name of the first study used in the report.
#' @param label.B character string. Name of the second study used in the report.
#' @param method character string. Method for base::cor, either "pearson", "spearman" or
#'              "kendall".
#' @param log2 logical. Default: TRUE. Whether the data should be transform with log2 or not
#' @param report boolean. Default:TRUE. Whether to output the result before returning the matrix.
#' @param Latex boolean. Default: FALSE. Whether to output in latex format.
#' @param Kable boolean. Default: TRUE. Whether to report with kable.
#' @param Grid boolean. Default: TRUE. Whether to create an image of the output.
#' @param headers boolean. Default: TRUE. Whether to use headers for the reporting.
#'
#' @return a matrix with the columns correlation of the two studies.
#' @export
#'
compute_tissue_corr_matrix<-function(A,B,label.A,label.B,method,log2=TRUE,
                                     report=TRUE,Latex=FALSE,Kable=TRUE,Grid=TRUE,
                                     headers=TRUE){

  if (report){
    if(headers)  cat(paste0("\n\n### Correlation between ",label.A," and ",label.B,":\n\n"))
  }
  A<-A[,sort(colnames(A))]
  B<-B[,sort(colnames(B))]
  if(log2){
  CorrMat<-data.frame(lapply(colnames(B),function(x){
    sapply(colnames(A),function(y){
      cor(log2(A[,y]+1),log2(B[,x]+1),method=method)})}))
  }else{
    CorrMat<-data.frame(lapply(colnames(B),function(x){
      sapply(colnames(A),function(y){
        cor(A[,y],B[,x],method=method)})}))
  }

  colnames(CorrMat)<-colnames(B)
  if(report){
    if(Latex) xtable::xtable(CorrMat)
    if(Kable) print(knitr::kable(CorrMat))
    if(Grid){
      gridExtra::grid.table(signif(CorrMat,digits=2))
      grid::grid.newpage()
    }
  }
  return(CorrMat)
}

#' Create the data.frame and can also test and plot
#' the comparison between the tissues of the first study with the second one
#'
#' @param CorrMat correlation matrix between two studies tissue expression (column correlation)
#' @param test boolean. Default: FALSE. Whether a (Welch's) t-test should be performed
#'             between the group where the same tissue from each study are compared together
#'             and the group where one tissue from the first study is compared to
#'             a random tissue of the second tissue.
#' @param plot boolean. Default: FALSE. Whether the \strong{Test} should be plotted as well.
#' @param publish boolean. Default:TRUE. Whether the theme theme_bw() should be applied to the plot
#' @param x numeric. Default:0.75. x-axis of where to center the p-value.
#' @param y numeric. Default:0.1. y-axis of where to center the p-value.
#' @param ym numeric. Default: 0.02. y-axis of where to write the medium value of the correlation
#' @param textaxis numeric. Default:15. Size of the text on the x-axis
#' @param titleaxis  numeric. Default:15. Size of the x-axis' title.
#' @param out character string. Allows to pick the output.
#'            "DFtest" (default):
#' @param sep character or character string. Default: '.'
#' @param remove.identity boolean. Default:TRUE.
#'        Whether the cases where the same tissue from the first study and the second one should be removed.
#' @param reorder boolean. Default: TRUE. Whether the data.frame should be order in decreasing order of the observed correlation
#'
#' @return depends on 'out'. Can be a data.frame, a plot, only the identical tissue comparisons,
#'         only the different tissue comparisons, ...
#' @export
#'
test_CorrMat_identical_different<-function(CorrMat,test=FALSE,plot=FALSE,publish=TRUE,
                                           x=0.75,y=0.1,ym=0.02,textaxis=15,titleaxis=15,
                                           out='DFtest',sep='.',remove.identity=TRUE,reorder=TRUE){

  DFtest<-as.data.frame(t(data.frame(utils::combn(colnames(CorrMat),2,simplify=FALSE))),stringsAsFactors = FALSE)
  rownames(DFtest)<-1:nrow(DFtest)
  colnames(DFtest)<-c('Pair1','Pair2')

  sep=paste0('[',sep,']')
  DFtest$Type<-sapply(1:nrow(DFtest),
                      function(x) ifelse(strsplit(DFtest$Pair1[x],sep)[[1]][1]==strsplit(DFtest$Pair2[x],sep)[[1]][1],
                                         'Same tissues','Different tissues'))
  DFtest$Correlation<-sapply(1:nrow(DFtest),function(x) CorrMat[DFtest[x,1],DFtest[x,2]])
  if(remove.identity) DFtest<-DFtest[DFtest$Pair1!=DFtest$Pair2,]
  if(reorder) DFtest<-DFtest[base::order(DFtest$Correlation,decreasing=TRUE),]

  if(test){
    Identical<-DFtest[DFtest$Type=="Same tissues",]
    Different<-DFtest[DFtest$Type=="Different tissues",]
    test<-t.test(Identical$Correlation,Different$Correlation)
    if(reorder) Identical<-Identical[base::order(Identical$Correlation,decreasing=TRUE),]
    estimateDF<-data.frame(Type=c("Same tissues","Different tissues"),
                           Correlation=test$estimate)
    if(plot){
      p <- ggplot2::ggplot(DFtest,aes(Type,Correlation))+ggplot2::geom_boxplot()+ggplot2::coord_cartesian(ylim=c(0,1))
      p <- p + ggplot2::annotate('text',label=paste('p-value=',signif(test$p.value,digits=8)),
                                 x=x,y=y)+ggplot2::geom_text(data=estimateDF,
                                                             aes(label=paste0('m = ',
                                                                              signif(Correlation,digits=4)),
                                                                 y=ym),
                                                             size=10)
      if(publish) p <- p + theme_bw()
      p <- p + ggplot2::theme(axis.text=element_text(size=textaxis),
                              axis.title=element_text(size=titleaxis),
                              legend.position='none')

      print(p)
      switch(out,
             'identical'= return(Identical),
             'different'= return(Different),
             'DFtest'= return(DFtest),
             'test' = return(test),
             'plot' = return(p),
             'test_identical'= return(list(test,identical)),
             'DFtest_identical'= return(list(DFtest,identical))
      )
    }
  }else{
    return(DFtest)
  }
}


#' Test with a Welch t-test between the group of similar tissues versus the group of different tissues.
#'
#' @param CorrMat correlation matrix between two studies tissue expression (column correlation)
#' @param label.A character string. Name of the first study on which the correlations are based
#' @param label.B character string. Name of the second study on which the correlations are based
#' @param report boolean. Default: TRUE. Whether to output a text report.
#' @param index character string. For the notebook, allows to create new sections.
#' @param output character string.
#'               'identical' to return only the pairs that comprise the same tissues,
#'               'different' to return only the pairs that comprise the different tissues,
#'               'DFtest' to return all the pairs,
#'               'test' returns only the test,
#'               'plot' returns only the plot,
#'               'test_identical' returns the test and only the pairs that comprise the same tissues,
#'               'DFtest_identical' returns all the pairs for the test and the identical tissues (as two different objects).
#' @param publish boolean. Default: TRUE. Whether to apply theme_bw()
#'
#' @return the result of the Welch t-test and a figure
#' @export
#'
t_test.CorrMat_identical.vs.different<-function(CorrMat,label.A,label.B,report=TRUE,index="####",
                                                output='DFtest_identical',publish=TRUE){

  if(report) cat(paste0("\n\n",index," Welch Two sample test between the identical tissues and different tissues of ",label.A,
                        " and ",label.B,"\n\n"))
  DFtest <-Reduce(rbind,lapply(colnames(CorrMat),function(x){
    rbind(cbind(DF1=x,
                DF2=reshape2::melt(CorrMat[grep(x,rownames(CorrMat),ignore.case=TRUE),
                                           colnames(CorrMat)[grep(x,colnames(CorrMat),ignore.case=TRUE)],drop=FALSE]),
                type='Same-tissue pairs'),
          cbind(DF1=x,
                DF2=reshape2::melt(CorrMat[grep(x,rownames(CorrMat),ignore.case=TRUE),
                                           !colnames(CorrMat) %in% colnames(CorrMat)[grep(x,colnames(CorrMat),ignore.case=TRUE)]]),
                type='Different tissues pairs')
    )
  }))
  colnames(DFtest)<-c('DF1','DF2','Correlation','Type')
  identical<-DFtest[DFtest$Type=="Same-tissue pairs",]
  different<-DFtest[DFtest$Type=="Different tissues pairs",]
  test<-t.test(identical$Correlation,different$Correlation)

  identical<-identical[order(identical$Correlation,decreasing=TRUE),]
  estimateDF=data.frame(Type=c('Same-tissue pairs','Different tissues pairs'),
                        Correlation=test$estimate)
  if(report) cat.html(test)

  p<-ggplot2::ggplot(DFtest,aes(Type,Correlation))+geom_boxplot()+ annotate('text',label=paste('p-value=',
                                                                                               signif(test$p.value,digits=8)),
                                                                            x=0.75,y=0.1)+geom_text(data=estimateDF,
                                                                                                    aes(label=paste0('m= ',signif(Correlation,digits=4)),
                                                                                                        y=0.02),size=10)# +ylab(paste(simpleCap(METHODCOR),'Correlation'))
  if(publish) p<- p +theme_bw()

  p <- p + ggplot2::theme(axis.text=element_text(size=15),
                          axis.title=element_text(size=15),
                          legend.position='none')

  print(p)
  switch(output,
         'identical'= return(identical),
         'different'= return(different),
         'DFtest'= return(DFtest),
         'test' = return(test),
         'plot' = return(p),
         'test_identical'= return(list(test,identical)),
         'DFtest_identical'= return(list(DFtest,identical)))

}


#' Apply t.test.CorrMat_identical.vs.different and complete the resulting data.frame
#'
#' @param CorrMat correlation matrix between two studies tissue expression (column correlation)
#' @param label.A character string. Name of the first study on which the correlations are based
#' @param label.B character string. Name of the second study on which the correlations are based
#' @param report boolean. Default: TRUE. Whether to output a text report.
#' @param index character string. For the notebook, allows to create new sections.
#' @param publish boolean. Default: TRUE. Whether to apply theme_bw()
#' @param QuantMet character string. Allows to fill in the quantification of the studies.
#' @param datasets Character string. Allows to fill in the names of the studies.
#' @param CorrelationMethod character string. Fill in the name of the correlation method
#'
#' @return a data.frame
#' @export
#'
completeDF.t_test.CorrMat_identical.vs.different<-function(CorrMat,label.A,label.B,report=TRUE,index="####",
                                                  publish=TRUE,QuantMet,datasets,CorrelationMethod){


  newDF<-t_test.CorrMat_identical.vs.different(CorrMat,label.A=label.A,label.B=label.B,
                                               report=report,index=index,output='DFtest',
                                               publish=publish)
  newDF$Tissue.Nb<-base::ncol(CorrMat)
  newDF$QuantMet<-QuantMet
  newDF$datasets<-datasets
  newDF$CorrelationMethod<-CorrelationMethod

  return(newDF)

}


#' Test with a Welch t-test between the group of similar tissues versus the group of different tissues.
#' Take the diagonal and test it versus the lower triangle of the matrix.
#'
#' @param Mat correlation matrix between two studies tissue expression (column correlation)
#' @param yLab character string. Title of the y-axis
#' @param label.diag character string. Label for the pairs in the diagonal of the matrix.
#' @param label.low.tri character string. Label for the pairs in the lower.triangle of the matrix.
#' @param xLab character string. Title of the x-axis. Default: 'Type'.
#' @param pval boolean. Default: TRUE. Whether to add the p-value on the plot.
#' @param showMean boolean. Default: TRUE. Whether to add the mean of each group on the plot.
#' @param output character string that allows to pick the output;
#'               'identical' for the same pairs of tissues,
#'               'different' for the different pairs of tissues,
#'               'test' for the test,
#'               'plot' for the plot and
#'               'test_identical' for a list comprising the identical tissues and the test.
#' @param publish boolean. Default: TRUE. Whether to apply theme_bw().
#'
#' @return depends on 'output'; it can be a data.frame, a figure or the result of the test.
#' @export
#'
t_test.Mat_identical.vs.different<-function(Mat,yLab,
                                            label.diag='Same-tissue pairs',
                                            label.low.tri='Different tissues pairs',
                                            xLab='Type',pval=TRUE,showMean=TRUE,
                                            output='test',publish=TRUE){

  identical<-diag(as.matrix(Mat))
  different<-as.matrix(Mat)[lower.tri(as.matrix(Mat), diag = FALSE)]
  test<-t.test(identical,different)

  estimateDF=data.frame(Type=c(label.diag,label.low.tri),
                        Value=test$estimate)

  DFtest<-rbind(data.frame(Value=identical,Type=label.diag),
                data.frame(Value=different,Type=label.low.tri))

  p <- ggplot2::ggplot(DFtest,aes(Type,Value))+ggplot2::geom_boxplot()
  if(pval) p <- p + ggplot2::annotate('text',label=paste('p-value=',
                                                         signif(test$p.value,digits=8)),
                                      x=0.75,y=0.1)
  if(showMean) p<- p+ ggplot2::geom_text(data=estimateDF,
                                         aes(label=paste0('m= ',signif(Value,digits=4)),
                                             y=0.02),size=10)
  if(publish) p<- p +theme_bw()

  p<-p+ggplot2::xlab(xLab)+ggplot2::ylab(yLab)
  p<-p+ggplot2::theme(axis.text=element_text(size=15),
                      axis.title=element_text(size=15),
                      legend.position='none')

  print(p)
  switch(output,
         'identical'= return(identical),
         'different'= return(different),
         'test' = return(test),
         'plot' = return(p),
         'test_identical'= return(list(test,identical)))
}


## Analyses based on tissue distance ----------------------

#' Compute Jaccard index
#'
#' @param list1 vector of gene names from a first element to compare
#' @param list2 vector of genes names from a second element to compare
#' @param universeSize positive integer. Number of elements in the universe
#' @param pvalue boolean. Default: FALSE. Whether a p-value should be computed
#' @param indexSignif positive integer. Default: 2. Number of significant digits to add for the Jaccard index on the figure.
#' @param pvalSig positive integer. Default: 3. Number of significant digits to add for the p-value on the figure.
#' @param figure boolean. default: FALSE. Whether a figure showing the overlap should be displayed
#' @param Col Colour palette: vector of two character strings. Colours to use to draw the two elements being compared in the venn diagram
#' @param categories vector of two character strings that are the names of the two elements being compared.
#' @param condition character string. Name of the condition compared between the two elements.
#' @param policeFont character string. Name of the font to use.
#' @param saveFile path (character string). Default: NA. Allows to save the figure (if figure=TRUE) at the given path.
#' @param outputType character string. Whether "index" to return the Jaccard index;
#'                                     "pval" for the p-value associated or
#'                                     "list" for a list comprising the index and the pvalue,
#'                                     "intersect" intersection of the two lists.
#' @param ... other parameters for grDevices::cairo_pdf
#'
#'
#' @return depends on outputType. Refer to that argument for the possible output.
#' @export
#'

jaccardInd<-function(list1,list2,universeSize,pvalue=FALSE,
                     indexSignif=2, pvalSig=3,
                     figure=FALSE,Col=c('coral4','darkorchid1'),
                     categories,condition,policeFont='Linux Libertine',
                     saveFile=NA,outputType="index",
                     ...){

  Inter<-length(intersect(list1,list2))
  Uni<-length(union(list1,list2))
  NbPossibleSuccesses<-length(list1)
  Nbdraws<-length(list2)

  jInd<-Inter/Uni

  if(figure)  pvalText<-paste('\nJaccard index = ',signif(jInd,digits=indexSignif))

  if (outputType %in% c("pval","list")) pvalue<-TRUE

  if(pvalue&&missing(universeSize)) {
    warning("Impossible to compute a p-value as the universe size is missing")
    pvalue<-FALSE
    outputType<-"index"
  }

  if(pvalue){
    #p-value to observe Inter or more (thus Inter-1)
    pval<-phyper(Inter-1,NbPossibleSuccesses,universeSize-NbPossibleSuccesses,Nbdraws,lower.tail = FALSE)
    if(figure)  pvalText<-paste(pvalText,'\np-value =',signif(pval,digits=pvalSig))
  }

  if(figure){
    p<-VennDiagram::draw.pairwise.venn(ind=FALSE,
                          fontfamily=policeFont,
                          cat.fontfamily=policeFont,
                          area1=NbPossibleSuccesses,
                          area2=Nbdraws,
                          cross.area=Inter,
                          category = c(categories[1],categories[2]),
                          cat.col= c(Col[1],Col[2]),
                          cat.pos = c(340,27),
                          cat.cex=rep(2,2),
                          cat.dist=c(0.04,0.04),
                          col = c(Col[1],Col[2]),
                          fill = c(Col[1],Col[2]),
                          cex=rep(2,3),
                          alpha = rep(0.4, 2),
                          euler.d=TRUE,
                          margin=0,
                          scaled=TRUE
    )
    grid.arrange(grobs=list(title=textGrob(condition,gp=gpar(fontsize=50)),
                            textGrob(pvalText,gp=gpar(fontsize=15)),gTree(children=p)),heights=c(1,0.5,5))

    if(!is.na(saveFile)){
      cairo_pdf(filename = saveFile, family= policeFont,...)
      grid.arrange(grobs=list(title=textGrob(condition,gp=gpar(fontsize=50)),
                              textGrob(pvalText,gp=gpar(fontsize=15)),gTree(children=p)),heights=c(1,0.5,5))
      dev.off()
    }

  }

  switch(outputType,
         "index"    = return(jInd),
         "pval"     = return(pval),
         "list"     = return(list("index"=jInd,"pval"=pval)),
         "intersect"= return(intersect(list1,list2))
         )
}

#' Overlap of the tissue-specific proteins with the most specific transcripts to that tissue.
#'
#' @param DFprot numeric data.frame of the protein expression data
#' @param DFmRNA numeric data.frame of the transcriptomic expression data
#' @param pretreatment boolean. Default: FALSE. Whether the considered tissues (columns) and genes (rows)
#'                     need to be filtered to the common set.
#'                     The function requires the same set of tissues/genes to be meaningful.
#'                     This step can be bypassed if it already happened previously.
#' @param mRNAmethod a function name. The function should allow to order the mRNAs based on their specificity rank.
#' @param cutoffProt numeric. Threshold to consider a protein is expressed.
#' @param complete.matrix boolean. Default:TRUE.
#'                        Whether the comparison between the two studies should be done for their whole data.frame
#' @param pvalue boolean. Default: TRUE.
#'               Whether the p-value of the jaccard index should be computed
#' @param ratio numeric (>0). Multiplying factor that
#'              allows increasing the number of transcripts compared with the proteins.
#' @param xprot character string. Default: NA. Needed when complete.matrix is false.
#'              Name of the tissue to consider for the proteomics.
#' @param yrna character string. Default: NA. Needed when complete.matrix is false.
#'              Name of the tissue to consider for the transcriptomics.
#' @param figure boolean. Default: TRUE. Whether the venn diagram should be drawn
#' @param ... more arguments for grDevices::cairo_pdf
#'
#' @return overlap between the proteomics and transcriptomics
#' @export
#'
overlapSpeProtmRNA<-function(DFprot,DFmRNA,pretreatment=FALSE,
                             mRNAmethod='ratioSpe',
                             cutoffProt=0,
                             complete.matrix=TRUE,
                             pvalue=TRUE,
                             ratio=1,
                             xprot=NA,
                             yrna=NA,
                             figure=TRUE,...){
  if(pretreatment){
    commonCond<-intersect(colnames(DFprot),colnames(DFmRNA))
    DFprot<-strip(DFprot[,commonCond])
    DFmRNA<-strip(DFmRNA[,commonCond])

    commonRows<-intersect(rownames(DFprot),rownames(DFmRNA))
    DFprot<-DFprot[commonRows,]
    DFmRNA<-DFmRNA[commonRows,]
  }

  nbGenes<-nrow(DFmRNA)
  #extract the uniquely expressed proteins
  DFprot<-computeBreadth(DFprot,omit.zero=TRUE,
                         threshold=cutoffProt,
                         typeR='unique_tissueList')

  #compute the specificity of RNA in each tissue
  DFmRNA<-eval(call(name=mRNAmethod,DFmRNA))
  conditionCnter<-colnames(DFmRNA)

  if(complete.matrix){
    if(!pvalue){
      resDF<-Reduce(rbind,
                    lapply(setNames(conditionCnter,conditionCnter),function(x){
                      #print(paste("x",x))
                      uniqueProt<-rownames(DFprot[DFprot$condition==x,])
                      #print("uniqueProt")
                      #print(dput(uniqueProt))
                      lg<-length(uniqueProt)
                      limit<-round(ratio*lg)
                      tmp<-as.data.frame(lapply(setNames(conditionCnter,conditionCnter), function(y){
                        #print(paste("y",y))
                        list2comp<-rownames(DFmRNA[order(DFmRNA[,y],decreasing = TRUE),])[1:limit]
                        #print("list2comp")
                        #print(dput(list2comp))
                        jaccardInd(uniqueProt,list2comp,universeSize=nbGenes,pvalue=FALSE,figure=FALSE)
                      }))
                    }))
      rownames(resDF)<-conditionCnter
    }else{
      resDF<-Reduce(rbind,lapply(setNames(conditionCnter,conditionCnter),
                                 function(x){
                                   uniqueProt<-rownames(DFprot[DFprot$condition==x,])
                                   lg<-length(uniqueProt)
                                   limit<-ratio*lg
                                   tmp<-as.data.frame(lapply(setNames(conditionCnter,conditionCnter), function(y){
                                     list2comp<-rownames(DFmRNA[order(DFmRNA[,y],decreasing = TRUE),])[1:limit]
                                     jaccardInd(uniqueProt,list2comp,universeSize=nbGenes,pvalue=TRUE,figure=FALSE,
                                                outputType="pval")
                                   }))
                                 }))
      rownames(resDF)<-conditionCnter
    }
    #row results correspond to the protein tissue samples
    #column results correspond to the mRNA tissue samples
    #example: if column[3]=='Heart' and row[4]=='Liver
    #the comparison is between the transcriptome of the Heart and the Liver proteome.
    return(resDF)
  }else{
    if(missing(yrna)) yrna<-xprot
    uniqueProt<-rownames(DFprot[DFprot$condition==xprot,])
    lg<-length(uniqueProt)
    limit<-ratio*lg
    list2comp<-rownames(DFmRNA[order(DFmRNA[,yrna],decreasing = TRUE),])[1:limit]
    if(figure){
      if(xprot!=yrna) {
        condition=paste(xprot,'~',yrna)
      }else{
        condition=xprot
      }
    }
    jaccardInd(uniqueProt,list2comp,universeSize=nbGenes,pvalue=pvalue,
               figure=figure,condition=condition,...)
  }
}



#' Compute the distance between the tissues within a study.
#' @description the distance is based on the number of genes that are uniquely shared by a number of given tissues only.
#'
#' @param DF numeric data.frame that contains
#' @param nbOfTissues integer (>0) number of tissues in which the genes have to be expressed to be considered
#' @param ColN vector of character strings. Names of the tissues to include in the analysis.
#' @param gid boolean. Default: FALSE. Whether to output the name of the concerned genes instead of their number.
#' @param plot boolean. Default: FALSE. Whether the heatmap should be printed or not
#' @param method character string, name of the agglomeration method to be used (hclust)
#'               This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
#'               "complete", "average" (= UPGMA), "mcquitty" (= WPGMA),
#'               "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param notecol character string. Name of the colour to use for the annotation within the heatmap.
#' @param col function to be used to define the colour palette for (gplots) heatmap.2
#'            default: colorRampPalette(c('aliceblue','darkcyan'))
#' @param threshold numeric. Default: 0
#' @param ... for labelling purposes
#'
#' @return a data.frame
#' @export
#'
tissuesDistance<-function(DF,nbOfTissues=2,ColN,gid=FALSE,plot=FALSE,
                          method='ward.D',notecol='gray37',
                          col=colorRampPalette(c('aliceblue','darkcyan')),
                          threshold=0,
                          ...){

  if(missing(ColN)) {
    if(colnames(DF)[ncol(DF)]=='nb.tissues'){
      ColN<-colnames(DF)[-ncol(DF)]
      DF.cleaned<-DF[DF$nb.tissues ==nbOfTissues,]
      DF.cleaned<-DF.cleaned[,ColN]
    }else{
      ColN<-colnames(DF)
      DF.cleaned<-DF[rowSums(DF>threshold)==nbOfTissues,]
    }}

  TissuesCombo<-data.frame(utils::combn(ColN,nbOfTissues,simplify=FALSE),stringsAsFactors=FALSE)

  res<-Reduce(rbind,lapply(TissuesCombo,function(x) {
    x<-as.character(x)
    # we consider rows that were the first element of x is true
    # as such need to make sure that there are indeed rows before the next step:
    if(nrow(DF.cleaned[DF.cleaned[,x[1]] >0,])>0){
      DF.tmp<-DF.cleaned[DF.cleaned[,x[1]] >0,]
      if(!gid){
        count<-nrow(DF.tmp[DF.tmp[,x[2]] >0,])
        #same thing with the second tissues
      }else{
        if(length(rownames(DF.tmp[DF.tmp[,x[2]] >0,]))>0){
          id<-Reduce(function(...) paste(...,sep=";"),rownames(DF.tmp[DF.tmp[,x[2]] >0,]))
        }else{
          id<-'none'
        }
      }
    }else{
      if(!gid){
        count<-0
      }else{
        id<-'none'
      }
    }
    if(!gid){
      return(data.frame(T1=x[1],T2=x[2],count=count))
    }else{
      return(data.frame(T1=x[1],T2=x[2],geneID=id))
    }
  })
  )

  SameTissue<-sapply(colnames(DF.cleaned),function(x){
    if("nb.tissues" %in% colnames(DF)){
      return(setNames(sum(DF[DF$nb.tissues==1,x]),x))
    }else{
      uniquelyExpressed<-DF[rowSums(DF>threshold)==1,]
      return(setNames(sum(uniquelyExpressed[,x]>threshold),x))
    }
  })

  if(!gid){
    res2<-data.frame(T1=res[,2],T2=res[,1],count=res[,3])
    identity<-data.frame(T1=colnames(DF.cleaned),T2=colnames(DF.cleaned),count=SameTissue)
    RES<-rbind(res,identity,res2)

    RES<-dcast(RES,T1 ~ T2,value.var='count')
    rownames(RES)<-RES$T1
    RES<-RES[,-1]

    RES<-RES[sort(rownames(RES)),]
    RES<-RES[,sort(colnames(RES))]

  }else{
    RES<-data.frame(T1=res[,2],T2=res[,1],geneID=res[,3])
  }

  ResToplot<-as.matrix(1/RES)
  diag(ResToplot)<-0
  ResToplot[ResToplot=='Inf']<-1

  if(plot){
    p<-heatmap.2(1-ResToplot,
                 hclustfun=function(x) hclust(x,method = method),
                 trace="none",margins = c(10,10),
                 key=FALSE,
                 #scale='both',
                 distfun=function(c) as.dist(1 - c),
                 cellnote=RES, notecol=notecol,
                 col=col,cexRow =1.4,cexCol = 1.4,
                 revC=TRUE,srtCol=45)
    p
  }

  return(RES)
}


#' Plot heatmap based on the result of tissueDistance
#'
#' @param DF numeric data.frame; output of tissueDistance
#' @param method character string, name of the agglomeration method to be used (hclust)
#'               This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
#'               "complete", "average" (= UPGMA), "mcquitty" (= WPGMA),
#'               "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param notecol character string. Name of the colour to use for the annotation within the heatmap.
#' @param col function to be used to define the colour palette for (gplots) heatmap.2
#'            default: colorRampPalette(c('aliceblue','darkcyan'))
#'
#' @return a heatmap (based on (gplots) heatmap.2)
#' @export
#'
plotTissuesDistance<-function(DF,method='ward.D',notecol='gray37',
                              col=colorRampPalette(c('aliceblue','darkcyan'))){

  Toplot<-as.matrix(1/DF)
  diag(Toplot)<-0
  Toplot[Toplot=='Inf']<-1

  heatmap.2(1-Toplot,
            hclustfun=function(x) hclust(x,method = method),
            trace="none",margins = c(10,10),
            key=FALSE,
            #scale='both',
            distfun=function(c) as.dist(1 - c),
            cellnote=as.matrix(DF), notecol=notecol,
            col=col,cexRow =1.4,cexCol = 1.4,
            revC=TRUE,srtCol=45)

}


#' Extract the dendrogram from the heatmap based on tissueDistance
#'
#' @param DF numeric data.frame; output of tissueDistance
#' @param method character string, name of the agglomeration method to be used (hclust)
#'               This should be (an unambiguous abbreviation of) one of "ward.D", "ward.D2", "single",
#'               "complete", "average" (= UPGMA), "mcquitty" (= WPGMA),
#'               "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param row boolean. Default: TRUE. Whether to extract the dendrogram associated to the rows of the heatmap.
#'
#' @return a dendrogram
#' @export
#'
extractDendroFromPlotTD<-function(DF,method='ward.D',row=TRUE){

  Toplot<-as.matrix(1/DF)
  diag(Toplot)<-0
  Toplot[Toplot=='Inf']<-1

  tmp<-heatmap.2(1-Toplot,
                 hclustfun=function(x) hclust(x,method = method),
                 trace="none",
                 key=FALSE,
                 distfun=function(c) as.dist(1 - c),
                 revC=TRUE)

  if(row){
    return(tmp$rowDendrogram)
  }else{
    return(tmp$colDendrogram)
  }
}



#' Create a dendrogram from a tissueDistance data.frame
#'
#' @param DF1 numeric data.frame; output of tissueDistance
#' @param trans boolean. Default: TRUE
#' @param plot boolean. Default: TRUE
#' @param type a character string specifying the type of phylogeny to be drawn;
#'             Default: "radial"
#'             can also be "phylogram", "cladogram", "fan", "unrooted".
#'
#' @return a dendrogram (phylo object)
#' @export
#'
extractDendroTissueDistance<-function(DF1,trans=TRUE,plot=TRUE,type){
  if(trans){
    #dend.DF1<-nj(as.dist(1-DF1))
    dend.DF1<-ape::as.phylo(hclust(as.dist(revCustomMatrixDist(DF1)),method='ward.D'))
  }
  if(plot){
    if(missing(type)){
      print("Default type value: radial.")
      print("Other options: phylogram, cladogram, fan, unrooted")
      type<-'radial'
    }
    plot(dend.DF1,type=type)
  }
  return(dend.DF1)
}


# Misc and depricated ----------------------------------------------------------
# might contain problems in the designed purposes or anything.


#' Calculate the ratio of standard deviation between two data.frames
#'
#' @param DF1 numeric data.frame
#' @param DF2 numeric data.frame
#' @param option1 boolean. Default TRUE. Whether to use the first option to compute the ratio sd:
#' first option: log2(DF1)-log2(DF2); other option: log2(DF1)/log2(DF2)
#' @param pseudocount numeric. Default: 1. What to add to avoid problems with log2(0)
#'
#'
#' @return a numeric data.frame
#' @export
#'
comp_ratio_sd<-function(DF1,DF2,option1=TRUE, pseudocount=1){
  if(option1){
    DF<-log2(DF1+pseudocount)-log2(DF2+pseudocount)
  }else{
    DF<-log2(DF1+pseudocount)/log2(DF2+pseudocount)
  }
  DF$average<-rowMeans(DF)
  DF$sd<-apply(DF[,-ncol(DF)],1,sd)
  DF$avg.sd<-DF$average/DF$sd

  DF<-DF[order(DF$avg.sd,decreasing=TRUE),]

  return(DF)
}


#' Compute the correlation between two studies after their transposition
#' and their transformation with log2(x+1)
#'
#' @param df1 numeric data.frame for the first study
#' @param df2 numeric data.frame for the second study
#'
#' @return a named vector with the name of the first study
#'         and the correlation between the two datasets.
#' @export
#'
comp_cor_log2<-function(df1,df2){
  tdf1<-t(df1)
  tdf2<-t(df2)
  sapply(colnames(tdf1),function(x){
    cor(log2(tdf1[,x] +1),log2(tdf2[,x] +1))
  })
}

# (Depricated) Analyses based on the top genes -------------------

#' Gives the number or the refenrences of shared genes
#' by both studies out of a given number of highest genes to query.
#'
#' @param a data.frame with expression data for the first study
#' @param b data.frame with expression data for the second study
#' @param top positive integer. Number of genes to consider for the analysis;
#'            if 0 (default), all genes are considered.
#' @param filta character vector or positive integer vector.
#'              rownames or indices of the rows to consider for the first study.
#' @param filtb character vector or positive integer vector.
#'              rownames or indices of the rows to consider for the second study.
#' @param cutoff numeric. cutoff under which the expression is considered as noise
#' @param LG Boolean, topCommonGenes return the length if TRUE, else the list of genes is returned
#'
#' @return depends on LG. Number or the list of genes that are shared for the highest expressed
#' @export
#'
topCommonGenes <- function(a,b,top=0,filta,filtb,cutoff,LG=TRUE){

  if(!missing(cutoff)){#only the genes expressed at least once above cutoff are kept
    a<-df.cutoff(a,cutoff)
    b<-df.cutoff(b,cutoff)
  }

  if (top==0){#whole datasets are considered
    common.genes<-base::intersect(row.names(a),row.names(b))
  }
  else# only the 'top' genes are considered
  {
    a.order<-a[base::order(filta,decreasing=TRUE),]
    b.order<-b[base::order(filtb,decreasing=TRUE),]
    if (length(row.names(a.order))>=top){
      a.ind<-row.names(a.order)[1:top]
    }else{
      a.ind<-row.names(a.order)
      message("The first dataset has less genes than the given number of top genes to be considered")
    }
    if (length(row.names(b.order))>=top){
      b.ind<-row.names(b.order)[1:top]
    }else{
      b.ind<-row.names(b.order)
      message("The second dataset has less genes than the given number of top genes to be considered")
    }
    common.genes<-base::intersect(a.ind,b.ind)
  }
  if (LG==TRUE) {
    return(length(common.genes))
  }else{
    return(common.genes)
  }
}

#' Compute the correlation matrix for the correlation of the two studies' tissues (columns)
#'
#' @param a data.frame with expression data for the first study
#' @param b data.frame with expression data for the second study
#' @param filta character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the first study.
#' @param same boolean. Default:FALSE. Whether the tissues (columns)
#'             from the second study should be identical to the first study.
#' @param filtb character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the second study.
#' @param top  positive integer. Number of genes to consider for the analysis;
#'            if 0 (default), all genes are considered.
#' @param use character string expliciting how missing values should be handled.
#'            can be either  "everything", "all.obs", "complete.obs", "na.or.complete",
#'            or "pairwise.complete.obs" (default).
#'            To fasten the process if there is no missing value, use "everything".
#' @param method character string. Method for base::cor, either "pearson", "spearman" or
#'              "kendall".
#' @param nameA character string to use for identifying the first study
#' @param nameB character string to use for identifying the second study
#'
#' @return a matrix with the correlation of the columns of the two data.frames
#' @export
#'
topgenes.corCluster <- function(a,b,filta=NA,same=FALSE,filtb=NA,top=0,
                                use="pairwise.complete.obs",method="pearson",
                                nameA=deparse(substitute(a)),nameB=deparse(substitute(b)))
{


  if (top==0){
    common.genes<-base::intersect(row.names(a),row.names(b))
  }else{

    if (!is.na(filta[1])){	## <==> if a filter has been provided
      a.order<-a[base::order(a[,filta],decreasing=TRUE),]
    }else{
      a.tmp<-a[base::order(a,decreasing=TRUE),]
      a.order<-na.omit(a.tmp)#some lines with NA are added, they're deleted
    }
    if(same){
      filtb<-filta
      printDebug(filtb)
    }
    if (!is.na(filtb[1])){
      ## <==> filtb has been provided (call or with 'same')
      b.order<-b[base::order(b[,filtb],decreasing=TRUE),]
    }else{
      b.tmp<-b[base::order(b,decreasing=TRUE),]
      b.order<-na.omit(b.tmp)#some lines with NA are added, they're deleted
    }
    a.ind <- row.names(a.order)[1:top]
    b.ind <- row.names(b.order)[1:top]
    common.genes<-base::intersect(a.ind,b.ind)
  }


  if (length(common.genes)>2) {

    #creation of one dataset (with all the data)
    new.all<-cross.selectCond(a,b,name1=nameA,name2=nameB)
    #calcul of the correlation
    data.cor=cor(new.all,use=use, method=method)
    return(data.cor)

  }else #there is no genes in common
  {
    vide<-matrix(data=NA,nrow=length(colnames(a)),ncol=length(colnames(b)),dimnames=list(colnames(a),colnames(b)))
    return(vide)
  }
}


#' Draw (after computation) the correlation matrix for the correlation of the two studies' tissues (columns)
#'
#' @param a data.frame with expression data for the first study
#' @param b data.frame with expression data for the second study
#' @param filta character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the first study.
#' @param same boolean. Default:FALSE. Whether the tissues (columns)
#'             from the second study should be identical to the first study.
#' @param filtb character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the second study.
#' @param top positive integer. Number of genes to consider for the analysis;
#'            if 0 (default), all genes are considered.
#' @param use character string expliciting how missing values should be handled.
#'            can be either  "everything", "all.obs", "complete.obs", "na.or.complete",
#'            or "pairwise.complete.obs" (default).
#'            To fasten the process if there is no missing value, use "everything".
#' @param method character string. Method for base::cor, either "pearson", "spearman" or
#'              "kendall".
#' @param nameA character string to use for identifying the first study
#' @param nameB character string to use for identifying the second study
#' @param png boolean. Default:TRUE. Whether to save the figure as a png
#'            instead of the standard output
#'
#' @return a figure, either on the standard output or as png
#' @export
#'
topgenes.corClusterDraw <- function(a,b,filta=NA,same=FALSE,filtb=NA,
                                    top=0,use="pairwise.complete.obs",
                                    method="pearson",
                                    nameA=deparse(substitute(a)),
                                    nameB=deparse(substitute(b)),png=TRUE){

  new=topgenes.corCluster(a,b,filta=filta,same=same,filtb=filtb,top=top,nameA=nameA,nameB=nameB,method=method,use=use)
  col=colorRampPalette(c('aliceblue','darkcyan'))
  if (png){
    tmp=paste(nameA,'_',sep="") #initialization of the filename
    if(!is.na(filta)){
      tmp=paste(tmp,filta,sep="")
    }
    tmp=paste(tmp,paste('_',nameB,sep=""),sep="")
    if (same) filtb <- filta
    if(!is.na(filtb)){
      tmp<-paste(tmp,paste('_',filtb,sep=""),sep="")
    }
    tmp<-paste(tmp,'_',sep="")
    png(file=paste(tmp,paste(method,"_CorClust.png",sep=""),sep=""), bg="transparent")
    graphics::par(oma=c(2.5,2.5,2.5,2.5))
    heatmap.2(new, distfun=function(c) as.dist(1 - c), trace="none", dendrogram="row", col=col)
    dev.off()
  }else{
    graphics::par(oma=c(2.5,2.5,2.5,2.5))
    heatmap.2(new, distfun=function(c) as.dist(1 - c), trace="none", dendrogram="row", col=col)
  }
  return(new)
}


#' Compute the correlation matrix for the correlation of the two studies' tissues (columns)
#' after they have been transformed with log
#'
#' @param a data.frame with expression data for the first study
#' @param b data.frame with expression data for the second study
#' @param filta character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the first study.
#' @param same boolean. Default:FALSE. Whether the tissues (columns)
#'             from the second study should be identical to the first study.
#' @param filtb character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the second study.
#' @param top positive integer. Number of genes to consider for the analysis;
#'            if 0 (default), all genes are considered.
#' @param use character string expliciting how missing values should be handled.
#'            can be either  "everything", "all.obs", "complete.obs", "na.or.complete",
#'            or "pairwise.complete.obs" (default).
#'            To fasten the process if there is no missing value, use "everything".
#' @param method character string. Method for base::cor, either "pearson", "spearman" or
#'              "kendall".
#' @param nameA character string to use for identifying the first study
#' @param nameB character string to use for identifying the second study
#' @param cleanInfinite boolean. Default: TRUE. whether to use clean.infinite to handle log(0)
#' @param pseudocount numeric. Default: 1. Pseudocount in case for log(x+pseudocount) when clean.infinite is not used for log(0)
#' @return a matrix with the correlation of the columns of the two data.frames once log2-transformed
#' @export
#'
topgenes.corCluster_log<- function(a,b,filta=NA,same=FALSE,filtb=NA,top=0,
                                   use="pairwise.complete.obs",method="pearson",
                                   nameA=deparse(substitute(a)),
                                   nameB=deparse(substitute(b)),
                                   cleanInfinite=TRUE,
                                   pseudocount=1){

  if (top==0){
    common.genes<-base::intersect(row.names(a),row.names(b))
  }else{

    if (!is.na(filta[1])){	## <==> if a filter has been provided
      a.order<-a[base::order(a[,filta],decreasing=TRUE),]
    }else{
      a.tmp<-a[base::order(a,decreasing=TRUE),]
      a.order<-na.omit(a.tmp)#some lines with NA are added, they're deleted
    }
    if(same){
      filtb<-filta
      printDebug(filtb)
    }
    if (!is.na(filtb[1])){
      ## <==> filtb has been provided (call or with 'same')
      b.order<-b[base::order(b[,filtb],decreasing=TRUE),]
    }else{
      b.tmp<-b[base::order(b,decreasing=TRUE),]
      b.order<-na.omit(b.tmp)#some lines with NA are added, they're deleted
    }
    a.ind<-row.names(a.order)[1:top]
    b.ind<-row.names(b.order)[1:top]
    common.genes<-base::intersect(a.ind,b.ind)
  }


  if (length(common.genes)>2) {

    #creation of one dataset (with all the data)
    new.all<-cross.selectCond(a,b,name1=nameA,name2=nameB)
    #calcul of the correlation
    if(cleanInfinite){
      data.cor<-cor(clean.infinite(log(new.all)),use=use, method=method)
    }else{
      data.cor<-cor(log(new.all+pseudocount),use=use, method=method)
    }
    return(data.cor)

  }else #there is no genes in common
  {
    vide<-matrix(data=NA,nrow=length(colnames(a)),ncol=length(colnames(b)),dimnames=list(colnames(a),colnames(b)))
    return(vide)
  }
}


#' Draw (after computation) the correlation matrix for the correlation of the two studies' tissues (columns)
#' after they have been transformed with log
#'
#' @param a data.frame with expression data for the first study
#' @param b data.frame with expression data for the second study
#' @param filta character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the first study.
#' @param same boolean. Default:FALSE. Whether the tissues (columns)
#'             from the second study should be identical to the first study.
#' @param filtb character vector or positive integer vector.
#'              colnames or indices of the columns to consider for the second study.
#' @param top  positive integer. Number of genes to consider for the analysis;
#'            if 0 (default), all genes are considered.
#' @param use character string expliciting how missing values should be handled.
#'            can be either  "everything", "all.obs", "complete.obs", "na.or.complete",
#'            or "pairwise.complete.obs" (default).
#'            To fasten the process if there is no missing value, use "everything".
#' @param method character string. Method for base::cor, either "pearson", "spearman" or
#'              "kendall".
#' @param nameA character string to use for identifying the first study
#' @param nameB character string to use for identifying the second study
#' @param cleanInfinite boolean. Default: TRUE. whether to use clean.infinite to handle log(0)
#' @param pseudocount numeric. Default: 1. Pseudocount in case for log(x+pseudocount) when clean.infinite is not used for log(0)
#' @param png boolean. Default:TRUE. Whether to save the figure as a png
#'            instead of the standard output
#'
#' @return a heatmap of the correlation matrix on the log-transformed expression data of two studies
#' @export
#'
topgenes.corClusterDraw_log<-function(a,b,filta=NA,same=FALSE,filtb=NA,top=0,
                                      use="pairwise.complete.obs",method="pearson",
                                      nameA=deparse(substitute(a)),nameB=deparse(substitute(b)),
                                      png=TRUE,
                                      cleanInfinite=TRUE,
                                      pseudocount=1){

  new=topgenes.corCluster_log(a,b,filta=filta,same=same,filtb=filtb,top=top,nameA=nameA,nameB=nameB,method=method,use=use)
  col=colorRampPalette(c('aliceblue','darkcyan'))
  if (png){
    tmp=paste(nameA,'_',sep="") #initialization of the filename
    if(!is.na(filta)){
      tmp=paste(tmp,filta,sep="")
    }#endif !is.na(filta)
    tmp=paste(tmp,paste('_',nameB,sep=""),sep="")
    if (same) filtb <- filta
    if(!is.na(filtb)){
      tmp<-paste(tmp,paste('_',filtb,sep=""),sep="")
    }
    tmp<-paste(tmp,'_',sep="")
    png(file=paste(tmp,paste(method,"_CorClust_log2.png",sep=""),sep=""), bg="transparent")
    graphics::par(oma=c(2.5,2.5,2.5,2.5))
    heatmap.2(new, distfun=function(c) as.dist(1 - c), trace="none", dendrogram="row", col=col)
    dev.off()
  }else{
    graphics::par(oma=c(2.5,2.5,2.5,2.5))
    heatmap.2(new, distfun=function(c) as.dist(1 - c), trace="none", dendrogram="row", col=col)
  }
  return(new)
}


# Based on the breadth of expression of the genes as a first step

#' Create matrix of jaccard indices based on the output of ExpressedGeneInTissue
#'
#' @param DFa data.frame
#' @param geneList list of the genes found in tissue
#' @param ... other parameters for ExpressedGeneInTissue
#'
#' @return a matrix
#' @export
#'
internalJaccardIndMatrix<-function(DFa,geneList, ...){
  if(missing(geneList)) geneList<-ExpressedGeneInTissue(DFa, ...)
  res1<-as.data.frame(t(utils::combn(colnames(DFa),2)),stringsAsFactors = FALSE)
  res1$val<-sapply(1:nrow(res1),function(x){
    return(jaccardInd(unlist(geneList[res1[x,1]]),unlist(geneList[res1[x,2]])))
    })
  res2<-data.frame("V1"=res1$V2,"V2"=res1$V1,val=res1$val,stringsAsFactors = FALSE)
  res3<-data.frame("V1"=colnames(DFa),"V2"=colnames(DFa),val=1,stringsAsFactors = FALSE)
  res<-rbind(res1,res2,res3)
  return(as.matrix(suppressMessages(reshape2::acast(res, V1~V2))))
}
barzine/barzinePhdR documentation built on Nov. 23, 2024, 8:54 p.m.