#' @include utilities.R
#' @include extract_core.R
#'
#' Summarize the terms of the cluster using pagerank algorithm
#' @title summarize_clusters
#' @description
#' Summarize the terms of the cluster using pagerank algorithm
#' @rdname summarize_clusters
#' @param graph geneset overlap graph.
#' @param cluster vector of cluster labels for each geneset.
#' @param pattern search pattern to remove from the terms. Unless specified, will default to built-in pattern.
#' @param sep separator used/found in gene set names to be changed to blank spaces. Default value is underscore ('_').
#' @param ncpus number of cores used for parallelizing reconstruction.
#' @param disconnect_graph return a graph connecting only nodes in a cluster.
#' @param ... passed to textrank::textrank_sentences.
#' @details
#' Given a list of text, it creates a sparse matrix consisting of tf-idf score for tokens from the text. See `https://github.com/saraswatmks/superml/blob/master/R/TfidfVectorizer.R`. A k shortest-nearest neighbor graph is then computed using the overlap of of the terms.
#' @return Returns a matrix of tf-idf score of tokens.
#' @examples
#' gmt_file <- system.file("extdata", "h.all.v7.4.symbols.gmt", package = "bowerbird")
#' bwr <- bower(gmt_file)
#' bwr <- snn_graph(bwr)
#' bwr <- find_clusters(bwr)
#' bwr <- summarize_clusters(bwr, ncpus = 1)
#' bwr
#' @import textrank udpipe dplyr pbmcapply
#' @export
#'
summarize_clusters.BOWER <- function(bower, cluster = NULL, pattern = NULL, sep = NULL, ncpus = NULL, disconnect_graph = FALSE, ...){
requireNamespace('igraph')
requireNamespace('parallel')
requireNamespace('pkgfilecache')
if (is.null(pattern)){
pattern = '^GO_|^KEGG_|^REACTOME_|^HALLMARK_|POSITIVE_|NEGATIVE_|REGULATION_OF|^GOBP_'
}
if (is.null(sep)){
sep = '_'
}
if (!is.null(cluster)){
cl <- cluster
igraph::V(bower@graph)$cluster <- cl
} else {
cl <- bower@clusters
igraph::V(bower@graph)$cluster <- cl
}
if (is.null(ncpus)){
ncpus = parallel::detectCores()
}
df <- data.frame(name = igraph::V(bower@graph)$name, cluster = igraph::V(bower@graph)$cluster)
df$name <- gsub(pattern, '', df$name)
df_split <- split(df, df$cluster)
df_split <- pbmclapply(df_split, function(x) {
x <- x %>% select(name) %>% unlist %>% as.character
x <- gsub(sep, ' ', x)
return(x)
}, mc.cores = ncpus)
tagger <- check_udpipemodel()
tagger <- udpipe_load_model(tagger$file_model)
res <- pbmclapply(df_split, function(x) {
annt <- udpipe_annotate(tagger, paste(x, collapse = '.\n'))
annt <- as.data.frame(annt)
annt$textrank_id <- unique_identifier(annt, c("doc_id", "paragraph_id", "sentence_id"))
sentences <- unique(annt[, c("textrank_id", "sentence")])
if (nrow(sentences) > 1){
terminology <- annt[, c("textrank_id", "lemma")]
tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
} else {
sentences <- rbind(sentences, sentences)
sentences$textrank_id <- c(1,2)
terminology1 <- annt[, c("textrank_id", "lemma")]
terminology2 <- annt[, c("textrank_id", "lemma")]
terminology2$textrank_id <- 2
terminology <- rbind(terminology1, terminology2)
tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
}
s <- summary(tr, n = 1, keep.sentence.order = TRUE)
s <- gsub('[.]', '', s)
return(s)
}, mc.cores = ncpus)
tmp <- do.call(rbind, res)
igraph::V(bower@graph)$terms <- tmp[cl]
# and also only create a label for the centroid node
if (disconnect_graph){
edges <- igraph::as_data_frame(bower@graph)
vertices <- igraph::as_data_frame(bower@graph, "vertices")
clx <- vertices$cluster
names(clx) <- vertices$name
c1 <- clx[edges$from]
c2 <- clx[edges$to]
keep <- c1 == c2
edges <- edges[which(keep),]
bower@graph <- igraph::graph_from_data_frame(edges, directed = FALSE, vertices = vertices)
}
data = .graph_to_data(bower@graph)
data$`_orig_index` = row.names(data)
data <- split(data, data$cluster)
datax <- lapply(data, function(x) {
centroid <- .closest_to_centroid(x, 'x', 'y')$`_orig_index`
return(centroid)})
idx <- as.numeric(unlist(datax))
tmp2 <- tmp[cl]
tmp2[-idx] <- ""
tmp2 <- make.unique(tmp2, sep = '-')
tmp_idx <- grep('^-', tmp2)
tmp2[tmp_idx] <- ""
igraph::V(bower@graph)$labels <- tmp2
bower@.graph_data <- .graph_to_data(bower@graph)
bower <- extract_core(bower)
return(bower)
}
#' @rdname summarize_clusters
#' @export
summarize_clusters.igraph <- function(graph, cluster = NULL, pattern = NULL, sep = NULL, ncpus = NULL, disconnect_graph = FALSE, ...){
requireNamespace('igraph')
requireNamespace('parallel')
requireNamespace('pkgfilecache')
if (is.null(pattern)){
pattern = '^GO_|^KEGG_|^REACTOME_|^HALLMARK_|POSITIVE_|NEGATIVE_|REGULATION_OF|^GOBP_'
}
if (is.null(sep)){
sep = '_'
}
if (!is.null(cluster)){
cl <- cluster
igraph::V(graph)$cluster <- cluster
} else {
cl <- igraph::V(graph)$cluster
igraph::V(graph)$cluster <- cl
}
if (is.null(ncpus)){
ncpus = parallel::detectCores()
}
df <- data.frame(name = igraph::V(graph)$name, cluster = igraph::V(graph)$cluster)
df$name <- gsub(pattern, '', df$name)
df_split <- split(df, df$cluster)
df_split <- pbmclapply(df_split, function(x) {
x <- x %>% select(name) %>% unlist %>% as.character
x <- gsub(sep, ' ', x)
return(x)
})
tagger <- check_udpipemodel()
tagger <- udpipe_load_model(tagger$file_model)
res <- pbmclapply(df_split, function(x) {
annt <- udpipe_annotate(tagger, paste(x, collapse = '.\n'))
annt <- as.data.frame(annt)
annt$textrank_id <- unique_identifier(annt, c("doc_id", "paragraph_id", "sentence_id"))
sentences <- unique(annt[, c("textrank_id", "sentence")])
if (nrow(sentences) > 1){
terminology <- annt[, c("textrank_id", "lemma")]
tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
} else {
sentences <- rbind(sentences, sentences)
sentences$textrank_id <- c(1,2)
terminology1 <- annt[, c("textrank_id", "lemma")]
terminology2 <- annt[, c("textrank_id", "lemma")]
terminology2$textrank_id <- 2
terminology <- rbind(terminology1, terminology2)
tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
}
s <- summary(tr, n = 1, keep.sentence.order = TRUE)
s <- gsub('[.]', '', s)
return(s)
}, mc.cores = ncpus)
tmp <- do.call(rbind, res)
result <- tmp[cl]
igraph::V(graph)$terms <- tmp[cl]
# and also only create a label for the centroid node
if (disconnect_graph){
edges <- igraph::as_data_frame(graph)
vertices <- igraph::as_data_frame(graph, "vertices")
clx <- vertices$cluster
names(clx) <- vertices$name
c1 <- clx[edges$from]
c2 <- clx[edges$to]
keep <- c1 == c2
edges <- edges[which(keep),]
graph <- igraph::graph_from_data_frame(edges, directed = FALSE, vertices = vertices)
}
data = .graph_to_data(graph)
data$`_orig_index` = row.names(data)
data <- split(data, data$terms)
datax <- lapply(data, function(x) {
centroid <- .closest_to_centroid(x, 'x', 'y')$`_orig_index`
return(centroid)})
idx <- as.numeric(unlist(datax))
tmp2 <- tmp[cl]
tmp2[-idx] <- ""
tmp2 <- make.unique(tmp2, sep = '-')
tmp_idx <- grep('^-', tmp2)
tmp2[tmp_idx] <- ""
labels <- tmp2
return(list(result, labels))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.