R/geneannot.R

Defines functions genAnnotationCellType cellTypeByModule getFriendlyNameForColumnCategories getCategoriesForGene getFriendlyNameForCategories plotGOZScoresUserList getICReport loadICsGOSim getGProfilerOnNet annotate UserGOenrichment getPreservationStatistics getZsummary10MicTissues1Module getZSummary reportOnModule functionalReportOnModule reportOnGenesMultipleTissue reportOnGenes globalReportOnGenes reportOnGeneIDsGlobal reportOnGeneIDs reportOnFETGenes reportAdd reportGet reportExists

Documented in reportOnGeneIDs reportOnGeneIDsGlobal

reportExists = function(experiment,tissue,module){
  if(exists("coexp.report.go.all"))
    return(!is.null(coexp.report.go.all[[experiment]][[tissue]][[module]]))
  return(FALSE)
}

reportGet = function(experiment,tissue,module){
  if(reportExists(experiment,tissue,module))
    return(coexp.report.go.all[[experiment]][[tissue]][[module]])
  return(NULL)
}

reportAdd = function(experiment,tissue,module,report){
  if(!exists("coexp.report.go.all")){
    coexp.report.go.all <<- NULL
    coexp.report.go.all[[experiment]] <<- NULL
    coexp.report.go.all[[experiment]][[tissue]] <<- NULL
    coexp.report.go.all[[experiment]][[tissue]][[module]] <<- report
  }else{
    if(is.null(coexp.report.go.all)){
      coexp.report.go.all[[experiment]] <<- NULL
      coexp.report.go.all[[experiment]][[tissue]] <<- NULL
      coexp.report.go.all[[experiment]][[tissue]][[module]] <<- report
    }else if(is.null(coexp.report.go.all[[experiment]])){
      coexp.report.go.all[[experiment]][[tissue]] <<- NULL
      coexp.report.go.all[[experiment]][[tissue]][[module]] <<- report
    }else #if(is.null(report.go.all[[experiment]][[tissue]])){
      coexp.report.go.all[[experiment]][[tissue]][[module]] <<- report
  }
}

reportOnFETGenes = function(tissue,genes,which.one,net){

  #Everything will be turned into Ensembl
  if(is.null(net))
    net = getNetworkFromTissue(tissue=tissue,which.one=which.one)
  names(net$moduleColors) =  fromAny2Ensembl(names(net$moduleColors))
  en.genes = fromAny2Ensembl(genes)

  #Modules within genes
  mods = net$moduleColors[match(en.genes,names(net$moduleColors))]
  mods[is.na(mods)] = "void"
  final.report = NULL
  mult.mod.genes = NULL

  p.val.mods = rep(1,length(mods))
  table.mods = table(mods)
  table.mods = table.mods[names(table.mods) != "void"]
  total.specific = sum(mods != "void")
  total.net = length(net$moduleColors)
  for(module in names(table.mods)){
    n.module = sum(net$moduleColors == module)
    n.module.and.specific = table.mods[[module]]
    p.val = testGeneSet(n.module,n.module.and.specific,total.specific,total.net)
    p.val.mods[mods == module] = signif(p.val$p.value,4)
  }

  the.table = cbind(rep(tissue,length(genes)),rep(which.one,length(genes)),
                    genes,en.genes,mods,p.val.mods)
  colnames(the.table) = c("tissue","which.one","gene","ensgene","module","p.val.mods")
  rownames(the.table) = genes
  return(the.table)
}

#' Get known genes in a network
#'
#' This method will return a vector of logical values with the same length as `genes`.
#' It will get the IDs of tne network in `tissue` and `which.one` convert both genes and ids
#' to Ensembl and check for matches. TRUE means the gene can be used in the analysis with
#' that network. FALSE means either the gene is not used in that network or that it could be
#' there with a different name or id format.
#'
#' @param tissue c
#' @param which.one The category the network belongs to
#' @param genes The list of gene IDs you want to use. They can be as gene symbols or Ensembl IDs
#'
#' @return A vector of logical values
#' @export
#'
#' @examples
#'
reportOnGeneIDs = function(tissue,
                           which.one,
                           genes){

  #Translate the genes to Ensemble
  ids = fromAny2Ensembl(genes)
  nids = fromAny2Ensembl(getGenesFromModule(tissue=tissue,which.one=which.one,module=NULL))
  return(ids %in% nids)

}

#' Get known genes in a set of networks
#'
#' This method will return a matrix of logical values with the same columns as number of ids in `genes`.
#' For each network, it will call `reportOnGenIDs()`` as many times as different networks.
#'
#' @param tissues The networks as they appear in the network database
#' @param which.one The categories the networks belong to
#' @param genes The list of gene IDs you want to use. They can be as gene symbols or Ensembl IDs
#'
#' @return A matrix of logical values (genes in columns and networks in rows)
#' @export
#'
#' @examples
reportOnGeneIDsGlobal = function(tissues,
                               categories,
                               ids){
  stopifnot(length(tissues) > 0)
  stopifnot(length(tissues) == length(categories))
  stopifnot(length(ids) > 0)

  allreport = NULL
  i = 1
  for(tissue in tissues){
    cat("Working on",tissue," and ",categories[i],"\n")
    allreport = rbind(allreport,reportOnGeneIDs(tissue=tissue,
                                         genes=ids,
                                         which.one=categories[i]))
    i = i + 1
  }
  colnames(allreport) = ids
  rownames(allreport) = tissues
  allreport
}



globalReportOnGenes = function(tissues,
                               categories,
                               genes){
  stopifnot(length(tissues) > 0)
  stopifnot(length(tissues) == length(categories))

  allreport = NULL
  singcats = unique(categories)
  for(category in singcats){
    cat("Working on",category,"\n")
    mask = categories == category
    localtissues = tissues[mask]
    cat("Working on",paste0(localtissues,collapse=", "),"\n")
    report = reportOnGenesMultipleTissue(tissues=localtissues,
                                         genes=genes,
                                         which.one=category)$report

    allreport = rbind(allreport,report)
  }

  #Get the bonferroni factor
  ntests = 0
  pvals = NULL
  cats = tiss = mods = NULL
  for(category in singcats){
    mask = categories == category
    localtissues = tissues[mask]
    for(ltissue in localtissues){
      localreport = allreport[allreport$category == category & allreport$tissue == ltissue,]
      modules = unique(localreport$module)
      for(module in modules){
        cats = c(cats,category)
        tiss = c(tiss,ltissue)
        mods = c(mods,module)
        pvals = c(pvals,unique(allreport[allreport$category == category &
                                    allreport$tissue == ltissue &
                                    allreport$module == module,]$fisher))
      }
    }
  }

  fdrpvalues = p.adjust(as.numeric(pvals),method = "fdr")
  bonfpvalues = p.adjust(as.numeric(pvals),method = "bonferroni")
  allreport = cbind(allreport,rep(0,nrow(allreport)),rep(0,nrow(allreport)))

  mask = (ncol(allreport) - 1):ncol(allreport)
  colnames(allreport)[mask] = c("FDR","Bonferroni")
  for(mytest in 1:length(cats)){
    indexes = which(allreport$category == cats[mytest] & allreport$tissue == tiss[mytest] &
                                allreport$module == mods[mytest])
    newdata = matrix(nrow=length(indexes),ncol=2,data=c(fdrpvalues[mytest],bonfpvalues[mytest]),byrow=T)
    allreport[indexes,mask] = newdata
  }
  return(allreport)

}



reportOnGenes = function(tissue,
                         genes,
                         silent=F,
                         which.one="signedrnaseq",
                         alt.probes=NULL,
                         ens.correction=NULL,
                         gwases=NULL,
                         alt.expr.data=NULL){

  genes = unique(genes)

  #Everything will be turned into Ensembl
  net = getNetworkFromTissue(tissue=tissue,which.one=which.one)
  netf = getNetworkFromTissue(tissue=tissue,which.one=which.one,only.file = T)
  names(net$moduleColors) =  fromAny2Ensembl(names(net$moduleColors))
  if(which.one == "new"){
    if(is.null(alt.expr.data))
      stop("We need the expression data rds object at alt.expr.data\n")
    expr.dataf = getExprDataFromTissue(tissue=alt.expr.data,
                                         which.one=which.one,
                                         only.file = T)
  }else
    expr.dataf = getExprDataFromTissue(tissue=tissue,which.one=which.one,only.file = T)

  if(expr.dataf != "Nonspecified"){
    expr.data = readRDS(expr.dataf)
    print(head(colnames(expr.data)))
    colnames(expr.data) = fromAny2Ensembl(colnames(expr.data))
  }else
    expr.data = NULL

  en.genes = fromAny2Ensembl(genes)
  gene.names = fromAny2GeneName(genes)
  print(en.genes)
  if(!is.null(ens.correction)){
    for(i in seq(from=1,to=length(ens.correction),by=2)){
      mask = which(en.genes == ens.correction[i])
      if(sum(mask) > 0){
        en.genes[mask] = ens.correction[i+1]
        cat("Substituting", ens.correction[i],"by",ens.correction[i+1],"\n")
      }
    }
  }

  final.report = NULL
  mult.mod.genes = NULL
  mms = getMM(net=netf,
              expr.data.file=expr.data,
              tissue=tissue,
              genes=en.genes)
  if(length(mms) == 0)
    return(NULL)

  modules = na.omit(unique(getModuleFromGenes(which.one=which.one,tissue=tissue,genes=en.genes)))

  for(i in 1:nrow(mms)){
    gene = mms[i,2]
    mod = mms[i,3]

    cat("Working on gene",gene,"\n")

    #mod = unique(mod)
    if(length(mod) >= 1){
      cat("Gene found in module",mod,"in ",tissue,"\n")
      report = reportOnModule(tissue,mod,
                                which.one=which.one)
        mm = mms[i,4]
        report = c(gene = gene,
                   ensgene = mms[i,1],
                   mm = signif(as.numeric(mm),4),
                   report)

        if(!silent){
          print(paste0("Gene ",gene, " is in ",tissue," in ",mod, " (MM ",
                       mm,")"))
          print(paste0("Report for module ",mod))
          print(report)
        }

        #Get the evidence for cell specific genes based on WGCNA userLists
        wgcna.cell.cats = getFriendlyNameForCategories(getCategoriesForGene(gene))
        if(length(wgcna.cell.cats) == 0){
          wgcna.cell.cats = "void"
        }

        if(sum(mms[,1] == gene) > 1){
          j = sum(mms[1:i,1] == gene)
          final.report[[paste0(gene,"_version_",j)]] = report

        }

        else
          final.report[[gene]] = report
    }else{
      cat("Gene not found in",tissue," gene set",which.one,"\n")
      mod = "void"
      mm = 0
      report = NULL
    }
  }
  #And now we add the fisher exact test results

  #But before that we convert to dataframe
  the.table = NULL
  genes.with.report = names(final.report[unlist(lapply(final.report,function(x){
    return(!is.null(x))}))])
  for(gene in genes.with.report){
    report = final.report[[gene]]
    report$go.report = paste0(report$go.report,collapse=", ")
    report$pd.genes = paste0(report$pd.genes,collapse=", ")
    report.t = as.data.frame(report,stringsAsFactors=F)
    the.table = rbind(the.table,report.t)
  }

  #Now we add the VEGAS results if any
  if(!is.null(gwases)){
    modules = the.table$module
    vegastoadd = NULL
    for(module in modules){
      addbymodule = NULL
      for(gwas in gwases){
        newsignal = gsa.getSignal(which.one=which.one,tissue=tissue,gwas=gwas,module=module)
        if(!is.null(newsignal)){
          pvalue = newsignal$pvalue
          addbymodule = cbind(addbymodule,pvalue)
          colnames(addbymodule)[ncol(addbymodule)] = paste0(gwas,"vegaspvalue")
        }

      }
      if(!is.null(addbymodule)){
        vegastoadd = rbind(vegastoadd,addbymodule)
      }

    }
    if(!is.null(vegastoadd))
      the.table = cbind(the.table,vegastoadd)
  }

  total.specific = length(genes.with.report)
  total.net = length(net$moduleColors)
  mods = the.table$module

  #It could be that there are more than 1 modules per gene
  mult.mod.index = grep(",",mods)
  for(single.index in mult.mod.index){
    #Get the first module
    few.modules = mods[single.index]
    few.modules = str_split(few.modules,", ")
    mods[single.index] = few.modules[[1]][[1]]
  }

  if(!is.null(mods)){
    p.val.mods = vector(mode="numeric",length=length(mods))
    names(p.val.mods) = mods
    table.mods = table(mods)
    table.mods = table.mods[names(table.mods) != "void"]
    for(module in names(table.mods)){
      n.module = sum(net$moduleColors == module)
      n.module.and.specific = table.mods[[module]]
      p.val = testGeneSet(n.module,n.module.and.specific,total.specific,total.net)
      p.val.mods[names(p.val.mods) == module] = signif(p.val$p.value,4)
    }
    the.table = cbind(the.table,p.val.mods)
    the.table = as.data.frame(the.table,stringsAsFactors=F)
    rownames(the.table) = names(final.report[genes.with.report])
    colnames(the.table)[ncol(the.table)] = "fisher"

  }
  cat("These genes are found in more than 1 module:",mult.mod.genes,"\n")
  return(list(report=the.table,mult.genes=mult.mod.genes))
}

reportOnGenesMultipleTissue = function(tissues,genes,silent=F,
                                       which.one="signedrnaseq",
                                       alt.probes=NULL,
                                       out.file=NULL,
                                       gwases=NULL){

  out.table = NULL
  problematic.genes = NULL
  for(tissue in tissues){
    tissue.report = reportOnGenes(tissue=tissue,
                                  genes=genes,
                                  silent=silent,
                                  which.one=which.one,
                                  alt.probes=alt.probes,
                                  gwases=gwases)

    if(!is.null(tissue.report$report)){
      problematic.genes[[tissue]] = tissue.report$mult.genes
      tissue.report = tissue.report$report
      tissue.report.ready = cbind(rep(tissue,nrow(tissue.report)),tissue.report)
      colnames(tissue.report.ready) = c("tissue",colnames(tissue.report))
      if(is.null(out.table)){
        out.table = tissue.report.ready

      }else{
        out.table = rbind(out.table,tissue.report.ready)
      }
    }
  }

  #We have to correct p.val.mods here!
  # if(mod.level.correct){
  #   modules = 0
  #   for(tissue in tissues)
  #     modules = modules + length(getModulesFromTissue(tissue=tissue,
  #                                                     which.one=which.one))
  #   bonf = out.table$fisher * modules
  #   bonf[bonf > 1] = 1
  #   out.table = cbind(out.table,bonf)
  #
  #   colnames(out.table)[ncol(out.table)] = "bonfpval"
  # }
  if(is.null(out.table))
    return(list(report=out.table,mult.genes=NULL))
  out.table = cbind(rep(which.one,nrow(out.table)),out.table)
  colnames(out.table)[1] = "category"


  if(!is.null(out.file))
    write.csv(out.table,out.file)
  return(list(report=out.table,mult.genes=problematic.genes))
}

functionalReportOnModule = function(tissue="SNIG",module,which.one="rnaseq"){

  gprof.file = findGO(which.one=which.one,tissue=tissue)

  go <- read.csv(gprof.file,stringsAsFactors=FALSE)
  go$X = NULL
  return(go[go$query.number == module,])
}

reportOnModule = function(tissue="SNIG",
                          module,
                          which.one="rnaseq",
                          how.many=5,
                          simple.cell.types=F,
                          cell.type.threshold=0.05,
                          ctcollapse=T){

  tmp.report = reportGet(experiment=which.one,tissue=tissue,module=module)
  if(!is.null(tmp.report))
    return(tmp.report)

  include.pd = T
  pd.genes = "void"
  if(include.pd){
    pd.genes = read.table(paste0(system.file("", "", package = "CoExpNets"),"/pd_genes.txt"),
                          stringsAsFactors=F,header=F)$V1
    genes.in.module = getGenesFromModule(tissue=tissue,which.one=which.one,module=module)
    stopifnot(!is.null(genes.in.module))
    if(substr(genes.in.module[1],1,4) == "ENSG")
      genes.in.module = fromEnsembl2GeneName(genes.in.module)
    mask = pd.genes %in% genes.in.module
    if(sum(mask) > 0)
      pd.genes = pd.genes[mask]
    else
      pd.genes = "void"
  }
  go = getGOFromTissue(which.one=which.one,tissue=tissue)
  cat("done GO\n")

  p.values <- go$p.value[go$query.number == module & (go$domain %in% "BP")]
  terms <- go$term.name[go$query.number == module & (go$domain %in% "BP")]
  term.ids = go$term.id[go$query.number == module & (go$domain %in% "BP")]

  if("IC" %in% colnames(go)){
    ics = go$IC[go$query.number == module & (go$domain %in% "BP")]
    p.values = p.values[!is.na(ics)]
    terms = terms[!is.na(ics)]
    ics = na.omit(ics)
  }

  else
    ics = NULL
  go.report = ""
  if(length(p.values) > 0){
    go.count = length(p.values)
    if(how.many > go.count)
      how.many = go.count
    the.order <- order(-log10(p.values),decreasing=TRUE)
    if(is.null(ics))
      go.report = paste0(terms[the.order][1:how.many]," ",term.ids[the.order][1:how.many],
                         " (p-value ",signif(p.values[the.order][1:how.many],4),")")
    else{
      p.values = p.values[the.order]
      terms = terms[the.order]
      term.ids = term.ids[the.order]
      ics = ics[the.order]

      p.values.f = p.values[ics > 2.5]
      terms.f = terms[ics > 2.5]
      term.ids.f = term.ids[ics > 2.5]

      if(length(p.values.f) > 0){
        if(length(p.values.f) < how.many)
          how.many = length(p.values.f)
        go.report = paste0(terms.f[1:how.many]," ",term.ids.f[1:how.many],
                           " (p-value ",signif(p.values.f[1:how.many],4),")")
      }else
        go.report = paste0(terms[1:how.many], " ",term.ids.f[1:how.many],
                           " (p-value ",signif(p.values[1:how.many],4),")")
    }
    go.report.size = go.count
    go.report.signif = sum(-log10(p.values))
  }else{
    go.report = "void"
    go.report.size = go.report.signif = 0
  }

  cat("Now cell type")
  #We include cell types here
  cell.types = getCellTypeFromTissue(which.one=which.one,tissue=tissue)


  cell.type.report = "void"
  if(any(colnames(cell.types) %in% module)){
    cell.types = cell.types[order(cell.types[,module],decreasing=F),]
    cell.type.row = rownames(cell.types)[which(cell.types[,module] <= cell.type.threshold)]
    if(length(cell.type.row) > 0){
      cell.type.report = NULL
      if(!simple.cell.types){

        for(i in 1:length(cell.type.row)){
          if(ctcollapse)
            cell.type.report = paste0(cell.type.report," ",
                                      getFriendlyNameForColumnCategories(cell.type.row[i]),
                                      " (p-value ",signif(cell.types[cell.type.row[i],module],4),"). ")
          else
            cell.type.report = c(cell.type.report,
                                 paste0(getFriendlyNameForColumnCategories(cell.type.row[i]),
                                        " (p-value ",signif(cell.types[cell.type.row[i],module],4),")"))

        }
      }else{
        cell.type.report = unique(getFriendlyNameForColumnCategories(names(cell.type.row)))
      }
    }
  }
  #preservation = getZSummary(tissue=tissue,module=module,which.one=which.one)
  preservation = NA
  if(is.na(preservation))
    preservation = "void"

  to.return = list(module=module,
                   size=length(getGenesFromModule(tissue=tissue,which.one=which.one,
                                                  module=module)),
                   go.report=go.report,
                   pd.genes=pd.genes,
                   preservation=preservation,
                   cell.type.pred=cell.type.report)
  reportAdd(which.one,tissue,module,to.return)
  return(to.return)


}


getZSummary = function(tissue="SNIG",
                       modules=c("red"),
                       which.one="signedrnaseq"){

  if(is.null(modules))
    modules = unique(getNetworkFromTissue(tissue=tissue,which.one=which.one)$moduleColors)

  source = 1
  target = 2
  if(which.one == "micro19K")
  {
    source = 2
    target = 1
    pr.file = paste0("supplementary/rdsnets/micro19K/PUTM.mic.net.19K.rds_vs_SNIG.mic.net.19K.rds_preserv.rds")
  }else if(which.one == "exonic"){
    pr.file = paste0("supplementary/rdsnets/exonic/SNIG_vs_PUTM_preserv.rds")
  }else if(which.one == "exonic.vs.gtex"){
    pr.file = paste0("supplementary/rdsnets/gtexv6/",
                     tissue,".UKBEC_vs_",tissue,".GTEx_preserv.rds")
    stats = readRDS(pr.file)
    loc.modules = rownames(stats$preservation$Z[[1]][[2]])
    return(stats$preservation$Z[[1]][[2]][match(modules,loc.modules),"Zsummary.pres"])
  }else if(which.one == "exonic.vs.micro"){
    pr.file = paste0("supplementary/rdsnets/exonic/",
                     tissue,"-RNAseq_vs_",tissue,"-microarray_preserv.rds")
    stats = readRDS(pr.file)
    loc.modules = rownames(stats$preservation$Z[[1]][[2]])
    return(stats$preservation$Z[[1]][[2]][match(modules,loc.modules),"Zsummary.pres"])
  }else if(which.one == "gtexv6"){
    return(NA)
  }else if(which.one == "rosmap"){
    return(NA)
  }else{
    stop(paste0("In getZSummary which.one=",which.one," not known"))
  }

  stats = readRDS(pr.file)

  if(tissue == "SNIG"){
    loc.modules = rownames(stats$preservation$Z[[source]][[target]])
    return(stats$preservation$Z[[source]][[target]][match(modules,loc.modules),"Zsummary.pres"])
  }else if (tissue == "PUTM"){
    loc.modules = rownames(stats$preservation$Z[[target]][[source]])
    return(stats$preservation$Z[[target]][[source]][match(modules,loc.modules),"Zsummary.pres"])
  }else return(NA)
}


getZsummary10MicTissues1Module = function(tissue,module){
  tissues = getMicTissues()
  i = which(tissues == tissue)
  if(i == 1)
    lefttissues = NULL
  else
    lefttissues = tissues[1:(i - 1)]
  if(i == 10)
    righttissues = NULL
  else
    righttissues = tissues[(i+1):10]
  prvals = NULL

  tnet = getNetworkFromTissue(tissue=tissue,which.one="micro19K")
  outtissues = NULL
  for(othertissue in lefttissues){
    othernet = getNetworkFromTissue(tissue=othertissue,which.one="micro19K")
    file.in = paste0("supplementary/rdsnets/micro19K/",
                     othertissue,".mic.net.19K.rds_vs_",tissue,
                     ".mic.net.19K.rds_preserv.rds","Zsummary.csv")
    if(!file.exists(file.in))
      getPreservationStatistics(networks=list(othernet,tnet),
                                tissues=c(othertissue,tissue),
                                paste0("supplementary/rdsnets/micro19K/",othertissue,".mic.net.19K.rds_vs_",tissue,
                                       ".mic.net.19K.rds_preserv.rds"))

    pr = read.csv(file=file.in,stringsAsFactors=F)
    if(sum(pr$tissue == tissue & pr$color == module) == 0)
      stop(paste0("When checking preservation of ",module," from tissue ",tissue,
                  " at tissue ",othertissue," we cant find the module"))
    prvals = c(prvals,pr[pr$tissue == tissue & pr$color == module,othertissue])
    outtissues = c(outtissues,othertissue)
  }

  for(othertissue in righttissues){
    othernet = getNetworkFromTissue(tissue=othertissue,which.one="micro19K")
    file.in = paste0("supplementary/rdsnets/micro19K/",
                     tissue,".mic.net.19K.rds_vs_",othertissue,
                     ".mic.net.19K.rds_preserv.rds","Zsummary.csv")
    if(!file.exists(file.in))
      getPreservationStatistics(networks=list(tnet,othernet),
                                tissues=c(tissue,othertissue),
                                paste0("supplementary/rdsnets/micro19K/",
                                       tissue,".mic.net.19K.rds_vs_",othertissue,
                                       ".mic.net.19K.rds_preserv.rds"))
    pr = read.csv(file=file.in,stringsAsFactors=F)
    if(sum(pr$tissue == tissue & pr$color == module) == 0)
      stop(paste0("When checking preservation of ",module," from tissue ",tissue,
                  " at tissue ",othertissue," we cant find the module"))
    prvals = c(prvals,pr[pr$tissue == tissue & pr$color == module,othertissue])
    outtissues = c(outtissues,othertissue)
  }
  names(prvals) = c(lefttissues,righttissues)
  return(list(min=min(prvals),max=max(prvals),mean=mean(prvals),all=prvals,tissues=outtissues))
}

getPreservationStatistics = function(networks,tissues,prev.data.file){

  net.objects <- list()

  if(typeof(networks[[1]]) == "character"){
    for(index in 1:length(networks)){
      print(paste0("Loading network ",networks[index]," for tissue ",tissues[index]))
      net.objects[[tissues[index]]] <- readRDS(networks[[index]])
    }
  }else{
    for(index in 1:length(networks)){
      print(paste0("Loading network for tissue ",tissues[index]))
      net.objects[[tissues[index]]] <- networks[[index]]
    }

  }



  Zsummary <- NULL
  MedianRank <- NULL

  for(tissue1 in tissues){
    Z.tmp.list <- list(NULL)
    MR.tmp.list <- list(NULL)
    for(tissue2 in tissues){
      ## loop through columns
      cat(tissue1, "\t", tissue2, "\n")
      if(tissue2 == tissue1) next() ## skip self-self comparison
      if( tissue1 < tissue2 ){
        fn	<- paste(tissue1, "vs", tissue2,sep="")
        ref <- 1
        test <- 2
      } else {
        ## swap around
        fn <- paste(tissue2, "vs", tissue1,sep="")
        ref <- 2
        test <- 1
      }
      mp = readRDS(prev.data.file)
      ### other statistics can be exstracted here
      #these statistics for instance can be exstracted:
      #Zsummary.pres","Zconnectivity.pres","Zdensity.pres","Z.cor.kIM"

      Z.tmp <- mp$preservation$Z[[ref]][[test]][,"Zsummary.pres",drop=F]
      rownames(Z.tmp) <- paste(tissue1, "_", rownames(Z.tmp),sep="")
      colnames(Z.tmp) <- tissue2
      Z.tmp.list[[ tissue2 ]] <- Z.tmp

      MR.tmp <- mp$preservation$observed [[ ref ]][[ test ]][ , "medianRank.pres", drop=F]
      rownames(MR.tmp) <- paste(tissue1, "_", rownames(MR.tmp),sep="")
      colnames(MR.tmp) <- tissue2
      MR.tmp.list[[ tissue2 ]] <- MR.tmp
      rm(mp, Z.tmp, MR.tmp, tissue2, fn, ref, test)
    }

    Z.tmp.list <- Z.tmp.list[ which(names(Z.tmp.list) != "") ]
    Z.tmp.list <- cbind( do.call("cbind", Z.tmp.list), tissue1=NA )
    colnames(Z.tmp.list)[ colnames(Z.tmp.list)=="tissue1" ] <- tissue1

    Zsummary <- rbind(Zsummary, Z.tmp.list[ , tissues])
    MR.tmp.list <- MR.tmp.list[ which(names(MR.tmp.list) != "") ]
    MR.tmp.list <- cbind( do.call("cbind", MR.tmp.list), tissue1=NA )
    colnames(MR.tmp.list)[ colnames(MR.tmp.list)=="tissue1" ] <- tissue1
    MedianRank <- rbind(MedianRank, MR.tmp.list[ , tissues])
  }

  # 230 i.e. ten extra
  # gold module consists of 1000 randomly selected genes
  print(dim(Zsummary))
  w <- grep("gold", rownames(Zsummary))
  round( Zsummary[w, ], 2 )

  MedianRank[w, ]
  Zsummary <- Zsummary[ -w, ]
  MedianRank <- MedianRank[ -w, ]

  # get the real module sizes
  moduleColors <- sapply(net.objects, function(obj) obj$moduleColors)
  mat = NULL
  mat[[tissues[1]]] = table(moduleColors[,1])
  mat[[tissues[2]]] = table(moduleColors[,2])

  #mat <- apply( moduleColors[, 1:length(tissues)], 2, function(x) table(x) )
  mat <- data.frame(as.matrix(unlist(mat)))
  rownames(mat) <- gsub("[.]","_",rownames(mat))
  kk <- match(rownames(mat),rownames(Zsummary))
  Zsummary <- Zsummary[kk,]
  kk <- match(rownames(mat),rownames(MedianRank))
  MedianRank <- MedianRank[kk,]

  Zsummary <- cbind( mat[,1],Zsummary)
  colors.col <- NULL
  tissue.col <- NULL
  for(index in 1:length(rownames(mat))){
    values <- str_split(rownames(mat)[index],"_")
    colors.col[index] <- values[[1]][2]
    tissue.col[index] <- values[[1]][1]
  }
  Zsummary <- cbind(tissue.col,colors.col,Zsummary)
  colnames( Zsummary ) <- c("tissue","color","size",tissues)

  MedianRank <- cbind( mat[,1],MedianRank )
  colnames( MedianRank ) <- c("size",tissues)
  write.csv( Zsummary, file=paste0(prev.data.file,"Zsummary.csv"), quote=F )
  write.csv( MedianRank, file=paste0(prev.data.file,"MedianRank.csv"), quote=F )

}


UserGOenrichment <- function(net.file,net.name=NULL,which.one="exonic",do.plot=F){
  #Variable initialization
  if(typeof(net.file) == "character")
    net <- readRDS(net.file)
  else
    net = net.file
  if(is.null(net.name))
    net.name <- str_extract(net.file,"network[0-9]+.+rds$")
  out.file <- paste0(net.name,".USER_terms.csv")

  cat("Passing",length(net$moduleColors)," gene symbols to user enrichment\n")

  cat("Writting everything to",out.file,"\n")

  if(which.one != "braineac")
    genes = fromAny2GeneName(names(net$moduleColors))
  else
    genes = newmic.fromtID2GeneName(names(net$moduleColors))

  enrichments = WGCNA::userListEnrichment(genes,
                                          net$moduleColors,
                                          fnIn=NULL,
                                          useBrainLists = TRUE,
                                          useBrainRegionMarkers=T,
                                          useImmunePathwayLists=T,
                                          useBloodAtlases=T,
                                          useStemCellLists=T,
                                          usePalazzoloWang=T,
                                          nameOut=out.file,
                                          outputCorrectedPvalues=TRUE)



  if(do.plot){
    pdf(paste0(net.file,".go_user_zscores.pdf"),width=8,height=6)
    plotGOZScoresUserList(enrichments,net,
                          title=paste0("User enrichment. Sum z-s (p-val < 10e-2), ",
                                       net.name))
    dev.off()
  }
  write.csv(enrichments$sigOverlaps,out.file)
  #saveGOTermDefinition(enrichments,out.file)

}



annotate = function(net,subnets=T,organism="hsapiens",ensembl=F){
  if(typeof(net) == "character")
    net = readRDS(net)
  if(is.null(names(net$moduleColors)))
    names(net$moduleColors) = names(net$adjacency)
  cat("Annotating network\n")
  net$go = getGProfilerOnNet(net.file=net,out.file=NULL,ensembl=ensembl)
  notHuman = ifelse(organism == "hsapiens",F,T)
  net$ct = genAnnotationCellType(net.in=net,return.processed=F,which.one="new",notHuman=notHuman)
  # if(subnets){
  #   subnetsgo = NULL
  #   subnets = net$subnets
  #   for(job in names(subnets)){
  #     cat("Annotating",job,"net\n")
  #     localnet = NULL
  #     nparts = length(subnets[[job]]$partitions)
  #     localnet$moduleColors = subnets[[job]]$partitions[[nparts]]
  #     subnetsgo[[job]] = getGProfilerOnNet(net.file=net,...)
  #   }
  # }
  net
}


#This function generates a csv file with the Gene Ontology enrichment
#signals. It is based on the gProfileR R package (see documentation for the
#package). Main parameters are
#net.file			: full path name for the net file
#filter				: ontologies to test for enrichemtn (see gProfileR docs)
#exclude.iea		: do not use Inferred electronic annotations (see gProfileR docs)
#correction.method	: method for multiple testing correction (see gProfileR docs)
#out.file			: full name for the csv file where to store results
#incluster			: ignore that one
getGProfilerOnNet <- function(net.file,
                              filter=c("GO","KEGG","REAC"),
                              ensembl=TRUE,
                              exclude.iea=T,
                              organism = "hsapiens",
                              correction.method="gSCS",
                              out.file=NULL,...){

  if(typeof(net.file) == "character")
    net <- readRDS(net.file)
  else
    net = net.file

  modules = unique(net$moduleColors)

  background <- names(net$moduleColors)
  if(ensembl)
    background <- fromEnsembl2GeneName(background)

  all.genes <- NULL
  for(module in modules){

    genes <- names(net$moduleColors)[net$moduleColors == module]
    if(ensembl)
      genes <- fromEnsembl2GeneName(genes)
    all.genes[[module]] <- genes
  }

  go <- gProfileR::gprofiler(all.genes,
                             correction_method=correction.method,
                             #custom_bg=background,
                             src_filter=filter,
                             organism=organism,
                             exclude_iea=exclude.iea)
  if(nrow(go) == 0)
    return(go)
  #png_fn = paste0(out.file,".png"),
  #no_isects=T)
  go.out = cbind(go,rep(NA,nrow(go)))
  colnames(go.out) = c(colnames(go),"IC")

  #	#Now we will add our own column with the information content of each term
  #	#So it is possible to evaluate them
  cat("Generating IC-BP")
  loadICsGOSim("BP")
  mask = go$domain == "BP"
  bp.terms.go = go$term.id[mask]
  bp.ic.go = IC[bp.terms.go]
  go.out$IC[mask] = bp.ic.go

  cat("Generating IC-MF")
  loadICsGOSim("MF")
  mask = go$domain == "MF"
  mf.terms.go = go$term.id[mask]
  mf.ic.go = IC[mf.terms.go]
  go.out$IC[mask] = mf.ic.go

  cat("Generating IC-CC")
  loadICsGOSim("CC")
  mask = go$domain == "CC"
  cc.terms.go = go$term.id[mask]
  cc.ic.go = IC[cc.terms.go]
  go.out$IC[mask] = cc.ic.go
  #
  #	data("ICsMFhumanall")
  #	data("ICsCChumanall")
  if(!is.null(out.file)){
    write.csv(go.out,out.file)
    cat("Obtained",nrow(go),"terms saved at",out.file,"\n")
  }else{
    cat("Obtained",nrow(go),"terms\n")
  }
  return(go.out)
}

loadICsGOSim = function(onto){
  stopifnot(onto %in% c("BP","MF","CC"))
  if(onto == "BP")
    load(system.file("", "ICsBPhumanall.rda", package = "CoExpNets"),.GlobalEnv)
  else if(onto == "MF")
    load(system.file("", "ICsMFhumanall.rda", package = "CoExpNets"),.GlobalEnv)
  else
    load(system.file("", "ICsCChumanall.rda", package = "CoExpNets"),.GlobalEnv)
}

getICReport = function(gprof){
  ontologies = c("BP","CC","MF")
  ics = NULL
  for(onto in ontologies){
    loadICsGOSim(onto)
    terms = gprof$term.id[gprof$domain == onto]
    ics[[onto]] = IC[terms]
    names(ics[[onto]]) = terms
  }
  return(ics)
}

plotGOZScoresUserList <- function(goresult,net,title="Sum z-scores "){
  #Transforming p values in log10 scale
  logs <- -log10(goresult$sigOverlaps$CorrectedPvalues)
  modules <- unique(goresult$sigOverlaps$InputCategories)

  mod.colors <- as.character(modules)

  print(modules)
  z.scores <- NULL
  n.terms <- NULL
  mod.labels <- NULL
  c.modules <- 0


  for(module in modules){
    indexes <- which(goresult$sigOverlaps$InputCategories == module)
    print(indexes)
    good.p.values <- logs[indexes]
    c.modules <- c.modules + 1
    z.scores[c.modules] <- sum(good.p.values)
    n.terms[c.modules] <- length(good.p.values)
    mod.labels[c.modules] <- paste0(n.terms[c.modules],"/",
                                    table(net$moduleColors == module)[2])
  }

  data.to.plot <- as.data.frame(cbind(1:c.modules,z.scores,mod.colors,mod.labels,n.terms),
                                stringsAsFactors=FALSE)
  data.to.plot <- data.to.plot[order(-z.scores,-n.terms),]

  x <- barplot(height=as.numeric(data.to.plot$z.scores),names.arg=data.to.plot$mod.labels,
               width=as.numeric(data.to.plot$n.terms),col=data.to.plot$mod.colors,
               main=title, xaxt="n",
               ylab="-log10(pval)")
  text(cex=1, x=x-2, y=-1.25, pos=1, offset=2, data.to.plot$mod.labels, xpd=TRUE, srt=45)
  legend("topright", title="Module colors",
         legend=data.to.plot$mod.colors,
         fill=data.to.plot$mod.colors,cex=0.5)
}

getFriendlyNameForCategories = function(cats){
  friendly.cats = cats
  existing.cats = cats[cats %in% coexp.cell.categories]
  friendly.cats[cats %in% coexp.cell.categories] =
    coexp.long.friendly.cell.categories[match(existing.cats,coexp.cell.categories)]
  return(friendly.cats)
}

getCategoriesForGene = function(gene){
  #Load the gene lists
  if (!(exists("BrainLists")))
    BrainLists = NULL
  data("BrainLists", envir = sys.frame(sys.nframe()))
  return(BrainLists[,2][BrainLists[,1] == gene])
}

getFriendlyNameForColumnCategories = function(cats){
  friendly.cats = cats
  existing.cats = cats[cats %in% coexp.cell.types]

  friendly.cats[cats %in% coexp.cell.types] =
    coexp.long.friendly.cell.categories[match(existing.cats,coexp.cell.types)]
  return(friendly.cats)
}

#This method is useful for generating cell type enrichment per module from a network
#It will generate a csv ending with celltype.csv with a squared matrix (modules in rows
#and tested cell type markers in columns)
#All can be left to its default value except done for the net.in parameter which
#must have a full path for the network RDS file.
#The csv will be generated at the same folder than the network
#The pdf file with the signals will be generated at plot.file location (can be used as a prefix)
#for the pdf file as in the default
cellTypeByModule = function(tissue="None",
                            do.plot=T,
                            which.one="new",
                            plot.file=NULL,
                            net.in=NULL,
                            legend=NULL,
                            threshold=20,
                            return.processed=T,
                            use.grey=F,
                            display.cats=NULL){ #This last flag FALSE when you want the raw p-values

  cat("Entering cellTypeByModule with",which.one,"\n")
  #if(do.plot & is.null(plot.file))
  #  stop(paste0("You have to specify the plot file if you want a heatmap plotted\n"))


  if(is.null(net.in)){
    net = getNetworkFromTissue(tissue=tissue,which.one=which.one)
    modules = getModulesFromTissue(tissue,which.one)
    enrichment = getCellTypeFromTissue(which.one=which.one,tissue=tissue)
    enrf = findUserCT(which.one=which.one,tissue=tissue)
    userEnrichment = data.frame()
    if(!is.null(enrf)){
      if(file.exists(enrf))
        userEnrichment = read.csv(enrf)
    }
    is.any.network = F
  }else{
    if(typeof(net.in) == "character")
      net = readRDS(net.in)
    else
      net = net.in
    modules = unique(net$moduleColors)
    is.any.network = T
    file.name = paste0(net.in,".USER_terms.csv")
    #if(!file.exists(file.name)){
    cat("Generating new user enrichment into ",file.name,"\n")
    UserGOenrichment(net.file=net,
                     net.name=net.in)
    cat("Done generating new User enrichment\n")
    #}
    userEnrichment = read.csv(file.name,stringsAsFactors=F)

  }
  if(!use.grey)
    modules = modules[modules != "grey"]


  external.ref = read.csv(paste0(system.file("", "", package = "CoExpNets"),
                                 "/cell_type_TableS1.csv"),stringsAsFactors=F)

  all.cell.types = c(coexp.cell.types,
                     coexp.ukbec.cell.types,
                     paste0("External-",colnames(external.ref)))
  cell.type.data = matrix(ncol=length(modules),nrow=length(all.cell.types))
  cell.type.data[,] = 1
  rownames(cell.type.data) = c(getFriendlyNameForColumnCategories(coexp.cell.types),
                               coexp.ukbec.cell.types,colnames(external.ref))
  colnames(cell.type.data) = modules

  #WGCNA enrichment
  if(nrow(userEnrichment) > 0){
    for(module in modules){
      i = match(module,modules)
      for(cell.type in coexp.cell.types){
        j = match(cell.type,coexp.cell.types)
        p.val = as.numeric(userEnrichment$CorrectedPvalues[userEnrichment$InputCategories %in% module &
                                                             userEnrichment$UserDefinedCategories %in% coexp.cell.categories[j]])
        if(length(p.val) > 0)
          cell.type.data[j,i] = p.val #-log10(p.val)
      }
    }
  }

  input.file.names = c(paste0(system.file("", "cell_type_TableS1.csv", package = "CoExpNets"),"/",
                              coexp.ukbec.cell.files),
                       paste0(system.file("", "cell_type_TableS1.csv", package = "CoExpNets"),"/",
                              "cell_type_TableS1.csv.",colnames(external.ref),".txt"))
  cell.types.i.f = c(coexp.ukbec.cell.types,colnames(external.ref))

  last.cell.types = c(coexp.ukbec.cell.types,colnames(external.ref))

  if(nrow(userEnrichment) > 0)
    for(i in 1:nrow(userEnrichment)){
      #print(enrichment)
      module = userEnrichment$InputCategories[i]
      #print(module)
      for(j in 1:length(input.file.names)){
        if(grepl(input.file.names[j],userEnrichment$UserDefinedCategories[i]))
          break
      }
      category = cell.types.i.f[j]
      cell.type.data[category,module] = userEnrichment$CorrectedPvalues[i]
    }


  #But before, rename them
  rownames(cell.type.data) = c(getFriendlyNameForColumnCategories(coexp.cell.types),
                               coexp.ukbec.cell.types,paste0(colnames(external.ref),"-External"))
  unprocessed.cell.type.data = cell.type.data
  cell.type.data = cell.type.data[,apply(cell.type.data,2,function(x){ any(x < 1)}),drop=FALSE]
  cell.type.data = -log10(cell.type.data)
  cell.type.data[is.infinite(cell.type.data)] = max(cell.type.data[!is.infinite(cell.type.data)])

  if(threshold > 0)
    cell.type.data[cell.type.data > threshold] = threshold
  #Order by cell type

  cell.type.data = cell.type.data[order(rownames(cell.type.data)),,drop=FALSE]

  #Now filter
  if(!is.null(display.cats)){
    cell.type.data = cell.type.data[rownames(cell.type.data) %in% display.cats,]
  }


  if(do.plot & (ncol(cell.type.data) >= 2)){
    if(is.null(legend))
      legend = tissue
    if(!is.null(plot.file))
      pdf(plot.file,width=15,height=8)
    gplots::heatmap.2(cell.type.data[apply(cell.type.data,1,function(x){ any(x > 2)}),],
                      trace="none",
                      col=heat.colors(100)[100:1],
                      cexCol=0.7,
                      cexRow=0.7,
                      Rowv=F,
                      Colv=T,
                      main=paste0("Cell type enrichment for ",legend),
                      key=F,srtCol=45,dendrogram="none",
                      margins=c(6,20))

    if(!is.null(plot.file))
      dev.off()
  }
  if(return.processed)
    return(cell.type.data)
  return(unprocessed.cell.type.data)
}

genAnnotationCellType = function(tissue="None",
                                 which.one="new",
                                 markerspath=system.file("ctall", "",
                                                         package = "CoExpNets"),
                                 net.in=NULL,
                                 legend=NULL,
                                 doheatmap=F,
                                 notHuman=F,
                                 plot.file=NULL,
                                 threshold=20,
                                 return.processed=T,
                                 getMarkerSize=F,
                                 getOnlyOverlap=F,
                                 getMyOwnTest=F,
                                 applyFDR=F,
                                 display.cats=NULL){ #This last flag FALSE when you want the raw p-values

  cat("Entering genAnnotationCellType with",which.one,"\n")

  if(which.one == "new"){
    if(typeof(net.in) == "character")
      net = readRDS(net.in)
    else
      net = net.in
  }else{
    net = getNetworkFromTissue(tissue=tissue,which.one=which.one)
  }
  modules = unique(net$moduleColors)
  if(notHuman)
    names(net$moduleColors) = toupper(names(net$moduleColors))
  #So the 1st heatmap
  #will have a column for each cell.types element and a row for
  #each module and we will show -log10(p-values) in a scale

  files = list.files(path=markerspath,full.names = T)
  files = files[grep(pattern=".txt$",files)]
  if(getMarkerSize){
    sizes = lapply(files,function(x){
      nrow(read.delim(x,header=T))
    })
    return(cbind(markerset=files,genes=sizes))
  }


  markernames = gsub(".txt","",basename(files))


  ctypes = markernames
  ctypedata = matrix(ncol=length(modules),nrow=length(files))
  ctypedata[,] = 1
  rownames(ctypedata) = ctypes
  colnames(ctypedata) = modules

  if(getOnlyOverlap){
    myfunction = Vectorize(function(m,f){
      submarkers = read.delim(f,stringsAsFactors=F,header=T)[,1]
      #cat("Module is",m,"file is",f,"\n")
      #print(names(net$moduleColors)[net$moduleColors == m])
      #print(submarkers)
      return(sum(submarkers %in% names(net$moduleColors)[net$moduleColors == m]))
    })

    overlap = outer(modules,files,myfunction)
    colnames(overlap) = gsub(".txt","",basename(files))
    rownames(overlap) = modules
    if(applyFDF){
      overlap = p.adjust(overlap,method="fdr")
    }
    return(overlap)

  }

  if(getMyOwnTest){
    myfunction = Vectorize(function(m,f){
      submarkers = read.delim(f,stringsAsFactors=F,header=T)[,1]
      ov = sum(submarkers %in% names(net$moduleColors)[net$moduleColors == m])
      nm = sum(net$moduleColors == m)

      #cat("Module is",m,"file is",f,"\n")
      #print(names(net$moduleColors)[net$moduleColors == m])
      #print(submarkers)
      return(testGeneSet(n.module=nm,total.specific=length(submarkers),
                         total.net=length(net$moduleColors),n.module.and.specific=ov)$p.value)
    })

    overlap = outer(modules,files,myfunction)
    colnames(overlap) = gsub(".txt","",basename(files))
    rownames(overlap) = modules
    return(overlap)

  }



  all.gene.names = names(net$moduleColors)
  all.gene.names = fromAny2GeneName(names(net$moduleColors))

  tmp.enr.f = paste0(markerspath,"enrichment_tmp.csv")
  if(file.exists(tmp.enr.f))
    file.remove(tmp.enr.f)
  ukbec.en = WGCNA::userListEnrichment(all.gene.names,net$moduleColors,files,
                                       nameOut=tmp.enr.f)
  if(file.exists(tmp.enr.f)){
    enrichment = read.csv(tmp.enr.f,
                          stringsAsFactors=F)

    for(i in 1:nrow(enrichment)){
      module = enrichment$InputCategories[i]
      for(j in 1:length(files)){
        if(grepl(ctypes[j],enrichment$UserDefinedCategories[i]))
          break
      }
      category = ctypes[j]
      ctypedata[category,module] = enrichment$CorrectedPvalues[i]
    }
  }
  rownames(ctypedata) = gsub("cell_type_TableS1.csv.","",rownames(ctypedata))
  uctypedata = ctypedata
  ctypedata = ctypedata[,apply(ctypedata,2,function(x){ any(x < 1)}),drop=FALSE]
  ctypedata = -log10(ctypedata)
  ctypedata[is.infinite(ctypedata)] = max(ctypedata[!is.infinite(ctypedata)])

  if(threshold > 0)
    ctypedata[ctypedata > threshold] = threshold
  #Order by cell type

  ctypedata = ctypedata[order(rownames(ctypedata)),,drop=FALSE]

  if(doheatmap){
    if((ncol(ctypedata) >= 2)){
      if(is.null(legend))
        legend = tissue
      if(!is.null(plot.file))
        pdf(plot.file,width=15,height=8)
      heatmap.2(ctypedata[apply(ctypedata,1,function(x){ any(x > 2)}),],
                trace="none",
                col=heat.colors(100)[100:1],
                cexCol=0.7,
                cexRow=0.7,
                Rowv=F,
                Colv=T,
                main=paste0("Cell type enrichment for ",legend),
                key=F,srtCol=45,dendrogram="none",
                margins=c(6,20))

      if(!is.null(plot.file))
        dev.off()
    }
  }

  if(return.processed)
    return(ctypedata)
  return(uctypedata)
}
juanbot/CoExpNets documentation built on May 15, 2021, 12:20 p.m.