R/cluster-go-terms.R

Defines functions exclude_go_terms_by_counts guess_best_sets_map groupByCluster_map guess_best_sets groupByCluster levSim cluster.goTerms remove_words sigTable_filter_by_clusters_map sigTable_filter_by_clusters

Documented in sigTable_filter_by_clusters sigTable_filter_by_clusters_map

#' Filter significance table pickeing 1 term for cluster
#'
#' @param sigTable the table with significant results
#' @param clusters the GO organized in clusters
#' @param lowerCut excludes the GO with fewer than lowerCut significant times
#' @param upperCut excludes the GO with more than upperCut significant times
#'
#' @export
#'
sigTable_filter_by_clusters <- function(sigTable, clusters, lowerCut = 2, upperCut=Inf) {
  terms <- exclude_go_terms_by_counts(sigTable, lowerCut=lowerCut, upperCut=upperCut)
  if (any(is.na(clusters[terms])))
    stop("Some terms are not present in clusters")

  cls <- clusters[terms]
  keep_go <- unname(guess_best_sets(cls, sigTable))
  list(keep_go=keep_go, grp=groupByCluster(cls))
}

#' Filter significance table picking 1 term for cluster_map
#'
#' @param sigTable the table with significant results
#' @param cluster_map the GO organized in clusters
#' @param lowerCut excludes the GO with fewer than lowerCut significant times
#' @param upperCut excludes the GO with more than upperCut significant times
#'
#' @export
#'
sigTable_filter_by_clusters_map <- function(sigTable, cluster_map, lowerCut = 2, upperCut=Inf) {
  terms <- exclude_go_terms_by_counts(sigTable, lowerCut=lowerCut, upperCut=upperCut)
  cls <- cluster_map[cluster_map[,1] %in% terms, , drop=F]
  if (any(is.na(cls)))
    stop("Some terms are not present in clusters")

  keep_go <- guess_best_sets_map(cls, sigTable)
  list(keep_go=keep_go, grp=groupByCluster_map(cls))
}

remove_words <- function(x) {
  patterns = c("positive regulation of ", "negative regulation of ",
               " signaling pathway", " signal transduction", " system development",
               "activation of ", "regulation of ",
               "cellular response to ", "response to ",
               "system", "development ", " process", " stimulus", " protein",
               "branching", "involved ")

  sapply(x, function(string) {
    out <- string
    for (ptt in patterns){
      out <- gsub(ptt, "", out)
    }
    out
  }
  )
}

#' @importFrom stats as.dist cutree hclust
#' @importFrom graphics abline plot
#' @importFrom utils adist
cluster.goTerms <- function(terms, h=NULL, k=NULL) {
  # library("DescTools")
  semanticDist <- adist(terms)

  labels <- remove_words(colnames(semanticDist))
  normalizer <- matrix(1, ncol=length(labels), nrow=length(labels),
                       dimnames = list(labels, labels))
  coordinates <- which(upper.tri(semanticDist), arr.ind = T)
  for (i in seq_len(NROW(coordinates))) {
    x <- coordinates[i,1]
    y <- coordinates[i,2]
    # nf <- abs(nchar(labels[x])-nchar(labels[y]))
    nf <- max(nchar(labels[x]),nchar(labels[y]))
    # normalizer[x, y] <- ifelse(nf!=0, nf, 1)
    normalizer[x, y] <- nf
    normalizer[y, x] <- normalizer[x, y]
  }

  hc <- hclust(as.dist(semanticDist/normalizer), method = "complete")
  plot(hc)

  if (!is.null(h)){
    abline(h=h, col="red")
    return(cutree(hc, h=h))
  }

  if (!is.null(k))
    return(cutree(hc, k=k))
}

levSim <- function(pure_terms){
  warning("
library(DescTools)
DescTools::StrDist(pure_terms, method=\"normlevenshtein\")
          ")
  # library("DescTools")
  # DescTools::StrDist(pure_terms, method="normlevenshtein")
}

groupByCluster <- function(cls) {tapply(names(cls), cls,function(x) x)}

#' @importFrom utils head
guess_best_sets <- function(clusters, df) {
  clsGrp <- groupByCluster(clusters)
  sapply(clsGrp, function(descr) {
    clsdf <- df[df$Description %in% descr, , drop=F]
    bestIdx <- head(order(clsdf$"p.adjust"), 1)
    clsdf$"Description"[bestIdx]
  })
}

groupByCluster_map <- function(cls) {tapply(cls[,1], cls[,2],function(x) x)}

#' @importFrom utils head
guess_best_sets_map <- function(clusters_map, df) {
  clsGrp <- groupByCluster_map(clusters_map)
  chosen <- sapply(clsGrp, function(descr) {
    clsdf <- df[as.character(df$Description) %in% descr, , drop=F]
    bestIdx <- head(order(clsdf$"p.adjust"), 1)
    as.character(clsdf$"Description"[bestIdx])
  })
  names(chosen) <- names(clsGrp)
  chosen
}

exclude_go_terms_by_counts <- function(df, lowerCut=1, upperCut=Inf) {
  names(which(table(df$Description) >= lowerCut & table(df$Description) <= upperCut))
}

# exclude_class_go_term_by_counts <- function(df, class, lowerCut=1, upperCut=Inf) {
#   names(which(table(df$Description) >= lowerCut & table(df$Description) <= upperCut))
# }
cavei/cellCB documentation built on Sept. 14, 2023, 8:15 a.m.