R/summarization.R

Defines functions phraser phrases summarize.mm summarize.mm_file summarize.python.builtin.tuple summarize.character summarize keywords.list keywords.character keywords get_bm25_weights.list get_bm25_weights.mm get_bm25_weights.mm_file get_bm25_weights

Documented in get_bm25_weights get_bm25_weights.list get_bm25_weights.mm get_bm25_weights.mm_file keywords keywords.character keywords.list phraser phrases summarize summarize.character summarize.mm summarize.mm_file summarize.python.builtin.tuple

#' BM 25
#' 
#' Function to compute rank scores for documents
#' in corpus and helper class BM25 used in calculations.
#' 
#' @param mm A corpus.
#' @param n_jobs The number of processes to use for computing bm25.
#' 
#' @name get_bm25_weights
#' 
#' @export
get_bm25_weights <- function(mm, n_jobs = -1L) UseMethod("get_bm25_weights")

#' @rdname get_bm25_weights
#' @method get_bm25_weights mm_file
#' @export
get_bm25_weights.mm_file <- function(mm, n_jobs = -1L){
  assert_that(!missing(mm), msg = "Missing `mm_file`")
  corpus <- gensim$corpora$MmCorpus(mm$file)
  model <- gensim$summarization$bm25$get_bm25_weights(corpus, n_jobs = n_jobs)

  # unlink temp
  if(mm$temp && mm$delete) unlink(mm$file)

  invisible(model)
}

#' @rdname get_bm25_weights
#' @method get_bm25_weights mm
#' @export
get_bm25_weights.mm <- function(mm, n_jobs = -1L){
  assert_that(!missing(mm), msg = "Missing `mm`")
  model <- gensim$summarization$bm25$get_bm25_weights(mm, n_jobs = n_jobs)

  invisible(model)
}

#' @rdname get_bm25_weights
#' @method get_bm25_weights list
#' @export
get_bm25_weights.list <- function(mm, n_jobs = -1L){
  assert_that(!missing(mm), msg = "Missing `mm`")
  model <- gensim$summarization$bm25$get_bm25_weights(mm, n_jobs = n_jobs) %>% 
    reticulate::py_to_r()
  invisible(model)
}

#' Keywords
#' 
#' Find keywords from text.
#' 
#' @param corpus A corpus.
#' @param ... Any other options, from the \href{https://radimrehurek.com/gensim/summarization/keywords.html}{official documentation}.
#' 
#' @name keywords
#' 
#' @export
keywords <- function(corpus, ...) UseMethod("keywords")

#' @rdname keywords
#' @method keywords character
#' @export
keywords.character <- function(corpus, ...){
  kw <- corpus %>% 
    purrr::map(gensim$summarization$keywords, ...) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    purrr::map(strsplit, "\n") %>% 
    purrr::map(function(x) x[[1]])
  invisible(kw)
}

#' @rdname keywords
#' @method keywords character
#' @export
keywords.factor <- keywords.character

#' @rdname keywords
#' @method keywords list
#' @export
keywords.list <- function(corpus, ...){
  kw <- corpus %>% 
    purrr::map(gensim$summarization$keywords, ...) %>% 
    purrr::map(reticulate::py_to_r) %>% 
    purrr::map(strsplit, "\n") %>% 
    purrr::map(function(x) x[[1]])
  invisible(kw)
}

#' summarize
#' 
#' Text summarization.
#' 
#' @param corpus A corpus.
#' @param ... Any other options, from the \href{https://radimrehurek.com/gensim/summarization/summariser.html}{official documentation}.
#' 
#' @name summarize
#' 
#' @export
summarize <- function(corpus, ...) UseMethod("summarize")

#' @rdname summarize
#' @method summarize character
#' @export
summarize.character <- function(corpus, ...){
  kw <- corpus %>% 
    purrr::map(gensim$summarization$summarize, ...) %>% 
    purrr::map(reticulate::py_to_r) 
  invisible(kw)
}

#' @rdname summarize
#' @method summarize character
#' @export
summarize.factor <- summarize.character

#' @rdname summarize
#' @method summarize python.builtin.tuple
#' @export
summarize.python.builtin.tuple <- function(corpus, ...){
  kw <- corpus  %>% 
    gensim$summarization$summarize_corpus() %>% 
    purrr::map(reticulate::py_to_r) 
  invisible(kw)
}


#' @rdname summarize
#' @method summarize mm_file
#' @export
summarize.mm_file <- function(corpus, ...){
  corpus <- gensim$corpora$MmCorpus(corpus$file)
  model <- gensim$summarization$summarize_corpus(corpus, ...) %>% 
    purrr::map(reticulate::py_to_r) 

  # unlink temp
  if(corpus$temp && corpus$delete) unlink(corpus$file)

  invisible(model)
}

#' @rdname summarize
#' @method summarize mm
#' @export
summarize.mm <- function(corpus, ...){
  model <- gensim$summarization$summarize_corpus(corpus, ...) %>% 
    purrr::map(reticulate::py_to_r) 
  invisible(model)
}

#' Phrases
#' 
#' Automatically detect common phrases – multi-word expressions / 
#' word n-grams – from a stream of sentences.
#' 
#' @param ... Any other options, from the \href{https://radimrehurek.com/gensim/summarization/phrases.html}{official documentation}.
#' 
#' @name phrases
#' @export
phrases <- function(...){
  gensim$models$phrases$Phrases(...)
}

#' @rdname phrases
#' @export
phraser <- function(...){
  gensim$models$phrases$Phraser(...)
}
news-r/gensimr documentation built on Jan. 9, 2021, 5:55 a.m.