R/TNI-annotation.R

Defines functions .preprocess.genesets .annotate.regulons.gsea2 .jaccard.genesets .hypergeo.genesets .annotate.regulons.jaccard .annotate.regulons.hypergeo .annotate.samples.gsea

################################################################################
##################### TNI Functional Annotation Methods ########################
################################################################################

##------------------------------------------------------------------------------
setMethod(
  "tni.annotate.samples",
  "TNI",
  function(object, geneSetList, minGeneSetSize = 15, exponent = 1, 
           verbose = TRUE){
    
    #--- check compatibility
    object <- upgradeTNI(object)
    
    #-- Basic checks
    if(object@status["Preprocess"]!="[x]")
      stop("input 'object' needs preprocessing!")
    if(missing(geneSetList))
      stop("missing 'geneSetList'.")
    tnai.checks("geneSetList", geneSetList)
    tnai.checks("minGeneSetSize", minGeneSetSize)
    tnai.checks("exponent", exponent)
    tnai.checks("verbose",verbose)
    
    #--- start preprocessing
    if(verbose)cat("-Preprocessing for input data...\n")
    
    #--- check geneSetList
    if(is.null(names(geneSetList)))
      stop("'geneSetList' should be named (unique names)!")
    if(any(duplicated(names(geneSetList))))
      stop("'geneSetList' should have unique names!")
    
    #--- check geneSetList
    rowAnnotation <- tni.get(object, 'rowAnnotation')
    geneSetList <- .preprocess.genesets(object, geneSetList, 
                                        minGeneSetSize, verbose)
    
    if(verbose) cat("--Checking log space... ")
    gexp <- tni.get(object, "gexp")
    if(.isUnloggedData(gexp)){
      if(verbose) cat("applying log2 transformation!\n")
      gexp <- .log2transform(gexp)
    } else {
      if(verbose)cat("OK!\n")
    }
    
    results <- .annotate.samples.gsea(geneSetList, gexp, exponent, verbose)
    
    return(results)
    
  }
)

##------------------------------------------------------------------------------
setMethod(
  "tni.annotate.regulons",
  "TNI",
  function(object, geneSetList, regulatoryElements = NULL, 
           minGeneSetSize = 15, sizeFilterMethod="posORneg",
           exponent = 1, verbose = TRUE){
    
    #-- Basic checks
    if(object@status["DPI.filter"]!="[x]")
      stop("input 'object' needs dpi analysis!")
    if(missing(geneSetList))
      stop("missing 'geneSetList'.")
    tnai.checks("geneSetList", geneSetList)
    tnai.checks("regulatoryElements",regulatoryElements)
    tnai.checks("minGeneSetSize", minGeneSetSize)
    tnai.checks("sizeFilterMethod", sizeFilterMethod)
    tnai.checks("exponent", exponent)
    tnai.checks("verbose",verbose)
  
    #--- check compatibility
    object <- upgradeTNI(object)
    
    #--- start preprocessing
    if(verbose)cat("-Preprocessing for input data...\n")
    
    #--- check geneSetList
    if(is.null(names(geneSetList)))
      stop("'geneSetList' should be named (unique names)!")
    if(any(duplicated(names(geneSetList))))
      stop("'geneSetList' should have unique names!")
    
    #--- check regulatoryElements
    if(!is.null(regulatoryElements)){
      regnames <- tni.get(object, "regulatoryElements")
      if(sum(regulatoryElements%in%regnames) > 
         sum(regulatoryElements%in%names(regnames))){
        regulatoryElements <- regnames[regnames%in%regulatoryElements]
      } else {
        regulatoryElements <- regnames[names(regnames)%in%regulatoryElements]
      }
      if(length(regulatoryElements)==0)
        stop("'regulatoryElements' argument has no valid names!")
    } else {
      regulatoryElements <- tni.get(object, "regulatoryElements")
    }
    
    #--- check geneSetList
    rowAnnotation <- tni.get(object, 'rowAnnotation')
    geneSetList <- .preprocess.genesets(object, geneSetList, 
                                        minGeneSetSize, verbose)
    listOfRegulonsAndMode <- tni.get(object, 'regulons.and.mode')
    listOfRegulonsAndMode <- listOfRegulonsAndMode[regulatoryElements]
    
    ##-----check regulon size
    regcounts <- .regulonCounts(listOfRegulonsAndMode)
    if(sizeFilterMethod=="posANDneg"){
      idx <- regcounts$Positive >= minGeneSetSize & 
        regcounts$Negative >= minGeneSetSize
    } else if(sizeFilterMethod=="posORneg"){
      idx <- regcounts$Positive >= minGeneSetSize | 
        regcounts$Negative >= minGeneSetSize
    } else {
      idx <- regcounts$Size >= minGeneSetSize
    }
    regulatoryElements <- regulatoryElements[
      regulatoryElements%in%rownames(regcounts)[idx]]
    listOfRegulonsAndMode <- listOfRegulonsAndMode[regulatoryElements]
    
    ##-----stop when no regulon passes the size requirement
    if(length(listOfRegulonsAndMode)==0){
      stop("no regulon passed the 'minGeneSetSize' requirement!")
    }
    
    if(verbose) cat("--Checking log space... ")
    gexp <- tni.get(object, "gexp")
    if(.isUnloggedData(gexp)){
      if(verbose) cat("applying log2 transformation!\n")
      gexp <- .log2transform(gexp)
    } else {
      if(verbose)cat("OK!\n")
    }
    
    results <- .annotate.regulons.gsea2(listOfRegulonsAndMode, geneSetList, 
                                        gexp, regulatoryElements, 
                                        exponent, verbose)
    results <- t(results$differential)
    
    return(results)
    
  }
)

##------------------------------------------------------------------------------
setMethod(
  "tni.overlap.genesets",
  "TNI",
  function(object, geneSetList, regulatoryElements = NULL, 
           minGeneSetSize = 15, sizeFilterMethod="posORneg",
           method = c("HT","JC"), pValueCutoff = 0.05, 
           pAdjustMethod = "BH", verbose = TRUE){
    
    #-- Basic checks
    if(object@status["DPI.filter"]!="[x]")
      stop("input 'object' needs dpi analysis!")
    if(missing(geneSetList))
      stop("missing 'geneSetList'.")
    tnai.checks("geneSetList", geneSetList)
    tnai.checks("regulatoryElements",regulatoryElements)
    method <- match.arg(method)
    tnai.checks("minGeneSetSize", minGeneSetSize)
    tnai.checks("sizeFilterMethod", sizeFilterMethod)
    tnai.checks("pValueCutoff", pValueCutoff)
    tnai.checks("pAdjustMethod", pAdjustMethod)
    tnai.checks("verbose",verbose)
    
    #--- check compatibility
    object <- upgradeTNI(object)
    
    #--- start preprocessing
    if(verbose)cat("-Preprocessing for input data...\n")
    
    #--- check geneSetList
    if(is.null(names(geneSetList)))
      stop("'geneSetList' should be named (unique names)!")
    if(any(duplicated(names(geneSetList))))
      stop("'geneSetList' should have unique names!")
    
    #--- check regulatoryElements
    if(!is.null(regulatoryElements)){
      regnames <- tni.get(object, "regulatoryElements")
      if(sum(regulatoryElements%in%regnames) > 
         sum(regulatoryElements%in%names(regnames))){
        regulatoryElements <- regnames[regnames%in%regulatoryElements]
      } else {
        regulatoryElements <- regnames[names(regnames)%in%regulatoryElements]
      }
      if(length(regulatoryElements)==0)
        stop("'regulatoryElements' argument has no valid names!")
    } else {
      regulatoryElements <- tni.get(object, "regulatoryElements")
    }
    
    #--- check geneSetList
    rowAnnotation <- tni.get(object, 'rowAnnotation')
    geneSetList <- .preprocess.genesets(object, geneSetList, 
                                        minGeneSetSize, verbose)
    listOfRegulonsAndMode <- tni.get(object, 'regulons.and.mode')
    listOfRegulonsAndMode <- listOfRegulonsAndMode[regulatoryElements]
    
    ##-----check regulon size
    regcounts <- .regulonCounts(listOfRegulonsAndMode)
    if(sizeFilterMethod=="posANDneg"){
      idx <- regcounts$Positive >= minGeneSetSize & 
        regcounts$Negative >= minGeneSetSize
    } else if(sizeFilterMethod=="posORneg"){
      idx <- regcounts$Positive >= minGeneSetSize | 
        regcounts$Negative >= minGeneSetSize
    } else {
      idx <- regcounts$Size >= minGeneSetSize
    }
    regulatoryElements <- regulatoryElements[
      regulatoryElements%in%rownames(regcounts)[idx]]
    listOfRegulonsAndMode <- listOfRegulonsAndMode[regulatoryElements]
    
    ##-----stop when no regulon passes the size requirement
    if(length(listOfRegulonsAndMode)==0){
      stop("no regulon passed the 'minGeneSetSize' requirement!")
    }
    
    # Get regulons' targets
    listOfRegulons <- lapply(names(listOfRegulonsAndMode), function(reg){
      names(listOfRegulonsAndMode[[reg]])
    })
    names(listOfRegulons) <- names(regulatoryElements)
    
    if(method=="JC"){
      results <- .annotate.regulons.jaccard(geneSetList, listOfRegulons, 
                                            verbose=verbose)
      
      jmat <- matrix(0, length(geneSetList), length(listOfRegulons), 
                     dimnames = list(names(geneSetList), names(listOfRegulons)))
      jmat[as.matrix(
        results[c("GeneSet1", "GeneSet2")])] <- results[["Jaccard"]]
      results <- t(jmat)
    } else {
      results <- .annotate.regulons.hypergeo(geneSetList, listOfRegulons, 
                                             universe=rownames(rowAnnotation), 
                                             pAdjustMethod=pAdjustMethod,
                                             verbose=verbose)
      results <- results[results[["Adjusted.Pvalue"]]<pValueCutoff,]
    }
    
    return(results)
    
  }
)

##------------------------------------------------------------------------------
.annotate.samples.gsea <- function(geneSetList, gexp, exponent, verbose){
  
  #-----get phenotypes
  phenotypes <- gexp-apply(gexp,1,mean)
  
  #-----reset names to integer values
  for(i in names(geneSetList)){
    gs <- geneSetList[[i]]
    geneSetList[[i]] <- match(gs,rownames(phenotypes))
  }
  rnames_phenotypes <- rownames(phenotypes)
  rownames(phenotypes)<-1:nrow(phenotypes)
  
  ##-----get ranked phenotypes
  phenoranks <- apply(-phenotypes, 2, rank)
  colnames(phenoranks) <- colnames(phenotypes)
  rownames(phenoranks) <- rownames(phenotypes)
  genesets <- names(geneSetList)
  samples <- colnames(phenotypes)
  
  if(verbose)cat("-Performing single-sample GSEA...\n")
  if(verbose)cat("--For", length(genesets), "gene set(s) and",
                 length(samples),'sample(s)...\n')
  if(verbose)pb <- txtProgressBar(style=3)
  ES <- NULL
  for(i in 1:length(samples)){
    res <- .run.tni.gsea1.alternative(
      geneSetList=geneSetList,
      phenotype=phenotypes[, samples[i]],
      phenorank=phenoranks[, samples[i]],
      exponent=exponent,
      alternative="two.sided"
    )
    ES <- rbind(ES,res)
    if(verbose) setTxtProgressBar(pb, i/length(samples))
  }
  if(verbose) close(pb)
  rownames(ES) <- samples
  colnames(ES) <- genesets
  return(ES)
}

##------------------------------------------------------------------------
##This function applies a hypergeometric test over two lists with sets of
##genes, and returns a data frame with overlaping statistics
.annotate.regulons.hypergeo <- function(geneSetList, listOfRegulons, universe,
                                        pAdjustMethod = "BH", verbose = TRUE) {
  
  if(verbose) cat("-Performing overlap analysis...\n")
  if(verbose) cat("--For", length(listOfRegulons), 
                  "regulon(s) and", 
                  length(geneSetList),'gene set(s)...\n')
  
  #--- Get geneset pairs
  gspairs <- expand.grid(GeneSet1=names(geneSetList), 
                         GeneSet2=names(listOfRegulons), 
                         stringsAsFactors=FALSE)
  gspairs <- gspairs[gspairs$GeneSet1!=gspairs$GeneSet2, ]
  
  if(nrow(gspairs) > 0){
    
    #--- Run .hypergeo
    if(verbose) pb <- txtProgressBar(style=3)
    reshg <- t(sapply(1:nrow(gspairs),function(i){
      if(verbose) setTxtProgressBar(pb, i/nrow(gspairs))
      .hypergeo.genesets(geneSetList[[gspairs$GeneSet1[i]]],
                         listOfRegulons[[gspairs$GeneSet2[i]]],
                         universe)
    }))
    if(verbose) close(pb)
    results <- cbind(gspairs, reshg)
    
    ##Adjust pvalues
    adjPvals <- p.adjust(results[, "Pvalue"], method = pAdjustMethod)
    results <- cbind(results, 'Adjusted.Pvalue'=adjPvals)
  } else {
    results <- matrix(NA, nrow=0, ncol=10)
  }
  colnames(results) <- c("GeneSet","Regulon","Universe.Size", 
                         "GeneSet.Size", "Regulon.Size", 
                         "Expected.Overlap", "Observed.Overlap", 
                         "Pvalue", "Adjusted.Pvalue")
  return(results)
}

##------------------------------------------------------------------------
##This function returns a data frame with Jaccard statistics
.annotate.regulons.jaccard <- function(geneSetList, listOfRegulons, 
                                       verbose = TRUE) {
  
  #--- Get geneset pairs
  gspairs <- expand.grid(GeneSet1=names(geneSetList), 
                         GeneSet2=names(listOfRegulons), 
                         stringsAsFactors=FALSE)
  gspairs <- gspairs[gspairs$GeneSet1!=gspairs$GeneSet2, ]
  
  #--- Run .jaccard
  if(verbose) pb <- txtProgressBar(style=3)
  results <- sapply(1:nrow(gspairs),function(i){
    if(verbose) setTxtProgressBar(pb, i/nrow(gspairs))
    .jaccard.genesets(geneSetList[[gspairs$GeneSet1[i]]],
                      listOfRegulons[[gspairs$GeneSet2[i]]])
  })
  if(verbose) close(pb)
  results <- cbind(gspairs, Jaccard=results)
  return(results)
}


##------------------------------------------------------------------------
##This function takes two gene sets (e.g. regulons), a vector 
##containing the size of the gene universe, and compute the number of 
##genes expected to occur in both gene sets, the actual observed overlap, 
##and the pvalue from a hypergeometric test.
.hypergeo.genesets <- function(geneSet1, geneSet2, universe) {
  ##number of genes in universe
  N <- length(universe)			
  ##remove genes from gene set that are not in universe			
  geneSet1 <- intersect(geneSet1, universe) 
  geneSet2 <- intersect(geneSet2, universe)
  ##size of gene set	
  m <- length(geneSet1) 							
  Nm <- N-m	
  ##geneSet2 in gene set
  overlap <- intersect(geneSet1, geneSet2) 	
  ##number of hits between regulons		
  k <- length(overlap) 							
  n <- length(geneSet2)	
  HGTresults <- phyper(k-1, m, Nm, n, lower.tail = FALSE)
  ex <- (n/N)*m
  if(m == 0 | n == 0) HGTresults <- 1
  hyp.vec <- c(N, m, n, ex, k, HGTresults)
  names(hyp.vec) <- c("Universe.Size", "GeneSet1.Size", "GeneSet2.Size", 
                      "Expected.Overlap", "Observed.Overlap", "Pvalue")
  return(hyp.vec)
}

##------------------------------------------------------------------------
.jaccard.genesets <- function(geneSet1, geneSet2){
  length(intersect(geneSet1,geneSet2))/length(union(geneSet1,geneSet2))
}

##------------------------------------------------------------------------------
.annotate.regulons.gsea2 <- function(listOfRegulonsAndMode, geneSetList, gexp,
                                     regulatoryElements, exponent, verbose){
 
  #-----get phenotypes
  genesets <- names(geneSetList)
  phenotypes <- sapply(genesets, function(gs){
    gs <- geneSetList[[gs]]
    samps <- apply(gexp[gs,], 2, mean)
    sp1 <- names(samps[samps>median(samps)])
    sp2 <- names(samps[samps<median(samps)])
    apply(gexp[,sp1], 1, mean) - apply(gexp[,sp2], 1, mean)
  })
  
  #-----reset names to integer values
  listOfRegulons <- lapply(listOfRegulonsAndMode, names)
  for(i in names(listOfRegulonsAndMode)){
    reg <- listOfRegulonsAndMode[[i]]
    names(listOfRegulonsAndMode[[i]]) <- match(names(reg),rownames(phenotypes))
  }
  rownames(phenotypes)<-1:nrow(phenotypes)
  
  ##-----get ranked phenotypes
  phenoranks <- apply(-phenotypes, 2, rank)
  colnames(phenoranks) <- colnames(phenotypes)
  rownames(phenoranks) <- rownames(phenotypes)
  
  if(verbose)cat("-Performing two-tailed GSEA...\n")
  if(verbose)cat("--For", length(listOfRegulonsAndMode), "regulon(s) and",
                 length(genesets),'gene set(s)...\n')
  if(verbose)pb <- txtProgressBar(style=3)
  regulonActivity<-list()
  for(i in 1:length(genesets)){
    res <- .run.tni.gsea2.alternative(
      listOfRegulonsAndMode=listOfRegulonsAndMode,
      phenotype=phenotypes[, genesets[i]],
      phenorank=phenoranks[, genesets[i]],
      exponent=exponent,
      alternative="two.sided"
    )
    regulonActivity$differential<-rbind(regulonActivity$differential,
                                        res$differential[regulatoryElements])
    regulonActivity$positive<-rbind(regulonActivity$positive,
                                    res$positive[regulatoryElements])
    regulonActivity$negative<-rbind(regulonActivity$negative,
                                    res$negative[regulatoryElements])
    if(verbose) setTxtProgressBar(pb, i/length(genesets))
  }
  if(verbose) close(pb)
  rownames(regulonActivity$differential) <- genesets
  rownames(regulonActivity$positive) <- genesets
  rownames(regulonActivity$negative) <- genesets
  colnames(regulonActivity$differential)<-names(regulatoryElements)
  colnames(regulonActivity$positive)<-names(regulatoryElements)
  colnames(regulonActivity$negative)<-names(regulatoryElements)
  regulonActivity$regulatoryElements <- regulatoryElements
  return(regulonActivity)
}

##------------------------------------------------------------------------------
.preprocess.genesets <- function(object, geneSetList, minGeneSetSize, verbose){
  ##----Checking geneSetList
  ids <- unique(unlist(geneSetList, use.names = FALSE))
  rowAnnotation <- tni.get(object, 'rowAnnotation')
  if(verbose) cat("--Checking agreement between 'geneSetList' and regulons...")
  refcol <- sapply(1:ncol(rowAnnotation),function(i){
    sum(ids%in%rowAnnotation[,i],na.rm=TRUE)
  })
  agreement<-max(refcol)/length(ids)*100
  if(verbose)cat(paste(round(agreement,1),"% ! \n",sep=""))
  if(agreement<30){
    idiff<-round(100-agreement,1)
    tp<-paste("NOTE: ",idiff,
              "% of 'geneSetList' IDs not represented in the 'rowAnnotation'!",
              sep="")
    stop(tp,call.=FALSE)
  } else if(agreement<50){
    idiff<-round(100-agreement,1)
    tp<-paste("NOTE: ",idiff,
              "% of 'geneSetList' IDs not represented in the 'rowAnnotation'!",
              sep="")
    warning(tp,call.=FALSE)
  }
  refcol <- which(refcol==max(refcol))[1]
  geneSetList <- lapply(geneSetList, function(gs){
    idx <- rowAnnotation[[refcol]] %in% gs
    rownames(rowAnnotation)[idx]
  })
  sz <- unlist(lapply(geneSetList, length))
  geneSetList <- geneSetList[sz>=minGeneSetSize]
  if(length(geneSetList)==0)
    stop("input sets in the 'geneSetList' contains no useful data!\n", 
         call.=FALSE)
  return(geneSetList)
}

Try the RTN package in your browser

Any scripts or data that you put into this service are public.

RTN documentation built on Nov. 12, 2020, 2:02 a.m.