R/summarize_clusters.R

Defines functions summarize_clusters.igraph summarize_clusters.BOWER

Documented in summarize_clusters.BOWER summarize_clusters.igraph

#' @include utilities.R
#' @include extract_core.R
#'

#' Summarize the terms of the cluster using pagerank algorithm
#' @title summarize_clusters
#' @description
#' Summarize the terms of the cluster using pagerank algorithm
#' @rdname summarize_clusters
#' @param graph geneset overlap graph.
#' @param cluster vector of cluster labels for each geneset.
#' @param pattern search pattern to remove from the terms. Unless specified, will default to built-in pattern. 
#' @param sep separator used/found in gene set names to be changed to blank spaces. Default value is underscore ('_').
#' @param ncpus number of cores used for parallelizing reconstruction.
#' @param disconnect_graph return a graph connecting only nodes in a cluster.
#' @param ... passed to textrank::textrank_sentences.
#' @details
#' Given a list of text, it creates a sparse matrix consisting of tf-idf score for tokens from the text. See `https://github.com/saraswatmks/superml/blob/master/R/TfidfVectorizer.R`. A k shortest-nearest neighbor graph is then computed using the overlap of of the terms.
#' @return Returns a matrix of tf-idf score of tokens.
#' @examples
#' gmt_file <- system.file("extdata", "h.all.v7.4.symbols.gmt", package = "bowerbird")
#' bwr <- bower(gmt_file)
#' bwr <- snn_graph(bwr)
#' bwr <- find_clusters(bwr)
#' bwr <- summarize_clusters(bwr, ncpus = 1)
#' bwr
#' @import textrank udpipe dplyr pbmcapply
#' @export
#'

summarize_clusters.BOWER <- function(bower, cluster = NULL, pattern = NULL, sep = NULL, ncpus = NULL, disconnect_graph = FALSE, ...){	
  requireNamespace('igraph')
  requireNamespace('parallel')
  requireNamespace('pkgfilecache')
	if (is.null(pattern)){
		pattern = '^GO_|^KEGG_|^REACTOME_|^HALLMARK_|POSITIVE_|NEGATIVE_|REGULATION_OF|^GOBP_'
	} 
	if (is.null(sep)){
		sep = '_'
	}
	if (!is.null(cluster)){
		cl <- cluster
		igraph::V(bower@graph)$cluster <- cl
	} else {
		cl <- bower@clusters
		igraph::V(bower@graph)$cluster <- cl
	}

	if (is.null(ncpus)){
		ncpus = parallel::detectCores()
	}

	df <- data.frame(name = igraph::V(bower@graph)$name, cluster = igraph::V(bower@graph)$cluster)
	df$name <- gsub(pattern, '', df$name)
	df_split <- split(df, df$cluster)
	df_split <- pbmclapply(df_split, function(x) {
		x <- x %>% select(name) %>% unlist %>% as.character
		x <- gsub(sep, ' ', x)
		return(x)
	}, mc.cores = ncpus)

	tagger <- check_udpipemodel()
	tagger <- udpipe_load_model(tagger$file_model)
	res <- pbmclapply(df_split, function(x) {
		annt <- udpipe_annotate(tagger, paste(x, collapse = '.\n'))
		annt <- as.data.frame(annt)
		annt$textrank_id <- unique_identifier(annt, c("doc_id", "paragraph_id", "sentence_id"))
		sentences <- unique(annt[, c("textrank_id", "sentence")])
		if (nrow(sentences) > 1){
      terminology <- annt[, c("textrank_id", "lemma")]  
      tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
    } else {
      sentences <- rbind(sentences, sentences)
      sentences$textrank_id <- c(1,2)
      terminology1 <- annt[, c("textrank_id", "lemma")]
      terminology2 <- annt[, c("textrank_id", "lemma")]
      terminology2$textrank_id <- 2
      terminology <- rbind(terminology1, terminology2)
      tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
    }
		s <- summary(tr, n = 1, keep.sentence.order = TRUE)
		s <- gsub('[.]', '', s)
		return(s)
	}, mc.cores = ncpus)

	tmp <- do.call(rbind, res)
	igraph::V(bower@graph)$terms <- tmp[cl]

  # and also only create a label for the centroid node
  if (disconnect_graph){
  	edges <- igraph::as_data_frame(bower@graph)
  	vertices <- igraph::as_data_frame(bower@graph, "vertices")
  	clx <- vertices$cluster
  	names(clx) <- vertices$name
  	c1 <- clx[edges$from]
  	c2 <- clx[edges$to]
  	keep <- c1 == c2
  	edges <- edges[which(keep),]
  	bower@graph <- igraph::graph_from_data_frame(edges, directed = FALSE, vertices = vertices)	
  }
  data = .graph_to_data(bower@graph)
  data$`_orig_index` = row.names(data)
  data <- split(data, data$cluster)
  datax <- lapply(data, function(x) {
    centroid <- .closest_to_centroid(x, 'x', 'y')$`_orig_index`
    return(centroid)})
  idx <- as.numeric(unlist(datax))
  tmp2 <- tmp[cl]
  tmp2[-idx] <- ""
  tmp2 <- make.unique(tmp2, sep = '-')
  tmp_idx <- grep('^-', tmp2)
  tmp2[tmp_idx] <- ""

  igraph::V(bower@graph)$labels <- tmp2
  bower@.graph_data <- .graph_to_data(bower@graph)
  bower <- extract_core(bower)

	return(bower)
}

#' @rdname summarize_clusters
#' @export
summarize_clusters.igraph <- function(graph, cluster = NULL, pattern = NULL, sep = NULL, ncpus = NULL, disconnect_graph = FALSE, ...){	
  requireNamespace('igraph')
  requireNamespace('parallel')
  requireNamespace('pkgfilecache')
	if (is.null(pattern)){
		pattern = '^GO_|^KEGG_|^REACTOME_|^HALLMARK_|POSITIVE_|NEGATIVE_|REGULATION_OF|^GOBP_'
	} 
	if (is.null(sep)){
		sep = '_'
	} 
	if (!is.null(cluster)){
		cl <- cluster
		igraph::V(graph)$cluster <- cluster
	} else {
		cl <- igraph::V(graph)$cluster
		igraph::V(graph)$cluster <- cl
	}

	if (is.null(ncpus)){
		ncpus = parallel::detectCores()
	}

	df <- data.frame(name = igraph::V(graph)$name, cluster = igraph::V(graph)$cluster)
	df$name <- gsub(pattern, '', df$name)
	df_split <- split(df, df$cluster)
	df_split <- pbmclapply(df_split, function(x) {
		x <- x %>% select(name) %>% unlist %>% as.character
		x <- gsub(sep, ' ', x)
		return(x)
	})

	tagger <- check_udpipemodel()
	tagger <- udpipe_load_model(tagger$file_model)
	res <- pbmclapply(df_split, function(x) {
		annt <- udpipe_annotate(tagger, paste(x, collapse = '.\n'))
		annt <- as.data.frame(annt)
		annt$textrank_id <- unique_identifier(annt, c("doc_id", "paragraph_id", "sentence_id"))
		sentences <- unique(annt[, c("textrank_id", "sentence")])
		if (nrow(sentences) > 1){
      terminology <- annt[, c("textrank_id", "lemma")]  
      tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
    } else {
      sentences <- rbind(sentences, sentences)
      sentences$textrank_id <- c(1,2)
      terminology1 <- annt[, c("textrank_id", "lemma")]
      terminology2 <- annt[, c("textrank_id", "lemma")]
      terminology2$textrank_id <- 2
      terminology <- rbind(terminology1, terminology2)
      tr <- textrank_sentences(data = sentences, terminology = terminology, ...)
    }
		s <- summary(tr, n = 1, keep.sentence.order = TRUE)
		s <- gsub('[.]', '', s)
		return(s)
	}, mc.cores = ncpus)

	tmp <- do.call(rbind, res)	
  result <- tmp[cl]
  igraph::V(graph)$terms <- tmp[cl]
  # and also only create a label for the centroid node
  if (disconnect_graph){
  	edges <- igraph::as_data_frame(graph)
  	vertices <- igraph::as_data_frame(graph, "vertices")
  	clx <- vertices$cluster
  	names(clx) <- vertices$name
  	c1 <- clx[edges$from]
  	c2 <- clx[edges$to]
  	keep <- c1 == c2
  	edges <- edges[which(keep),]
  	graph <- igraph::graph_from_data_frame(edges, directed = FALSE, vertices = vertices)	
  }
  data = .graph_to_data(graph)
  data$`_orig_index` = row.names(data)
  data <- split(data, data$terms)
  datax <- lapply(data, function(x) {
    centroid <- .closest_to_centroid(x, 'x', 'y')$`_orig_index`
    return(centroid)})
  idx <- as.numeric(unlist(datax))
  tmp2 <- tmp[cl]
  tmp2[-idx] <- ""
  tmp2 <- make.unique(tmp2, sep = '-')
  tmp_idx <- grep('^-', tmp2)
  tmp2[tmp_idx] <- ""
  labels <- tmp2

	return(list(result, labels))
}
clatworthylab/bowerbird documentation built on Dec. 19, 2021, 5:15 p.m.