# R/clr_search_keyphrases.R

#' Extract key phrases from an annotated corpus.
#'
#' Finds candidate key phrases in every document by searching the corpus with
#' the `clr_ref_keyphrase` pattern (via `clr_search_gramx()`), weights each
#' candidate with a tf-idf style score, and keeps the `n` highest-scoring
#' candidates per document.
#'
#' The score is
#' `(txtf / textLength) * log(docsInCorpus / (docf + 1))`, where `txtf` is the
#' number of matches of the phrase in the document, `textLength` the number of
#' rows (tokens) of the document, `docf` the number of documents containing
#' the phrase and `docsInCorpus` the number of documents in the corpus.
#'
#' @name clr_search_keyphrases
#' @param corp List of annotated texts to be searched. If it has an element
#'   named `meta`, the texts are taken from `corp$corpus`, and `corp$meta` is
#'   used to supply any `include` columns not otherwise available.
#' @param n Maximum number of key phrases to keep per document.
#' @param key_var Name of the column that identifies a phrase
#'   (default `'lemma'`).
#' @param flatten Logical. If `TRUE`, the key phrases of a document are
#'   collapsed into a single string separated by `" | "`.
#' @param jitter Logical. If `TRUE`, a small amount of noise (`base::jitter()`
#'   with a fixed seed) is added to the scores before ranking -- presumably to
#'   break ties between equally scored phrases.
#' @param remove_nums Logical. If `TRUE`, phrases containing a digit are
#'   dropped.
#' @param include Names of the columns to return alongside `keyphrases`.
#'   Columns not present in the result are looked up in `corp$meta`, which is
#'   joined on `doc_id`.
#' @param min_txtf Minimum number of times a phrase must occur in a document
#'   to be kept.
#' @return A data.table with the columns named in `include` plus
#'   `keyphrases`.
#' @import data.table
#' @export
#' @rdname clr_search_keyphrases
clr_search_keyphrases <- function (corp,n=5,
                                   key_var ='lemma',
                                   flatten=TRUE, jitter=TRUE,
                                   remove_nums = TRUE,
                                   include='doc_id',
                                   min_txtf=0) { # TODO: add agg_var.

  x <- corp
  if ("meta" %in% names(x)) x <- x$corpus

  keys <- corpuslingr::clr_search_gramx(x, search = clr_ref_keyphrase)

  # docf: number of distinct documents in which each phrase occurs.
  doc <- keys[, list(docf = length(unique(doc_id))), by = c(key_var)]
  # txtf: number of matches of each phrase within each document.
  txt <- keys[, list(txtf = length(tag)), by = c("doc_id", key_var)]

  # textLength: number of rows of each document in the full corpus.
  freqs <- rbindlist(x)
  freqs <- freqs[, list(textLength = length(get(key_var))), by = doc_id]

  k1 <- doc[txt, on = key_var]

  setkey(k1, doc_id)
  setkey(freqs, doc_id)
  k1 <- freqs[k1]

  k1[, docsInCorpus := nrow(freqs)]

  if (isTRUE(remove_nums)) {
    k1 <- k1[!grepl("[0-9]", k1[[key_var]])]
  }

  k1[, tf_idf := (txtf / textLength) * log(docsInCorpus / (docf + 1))]

  if (isTRUE(jitter)) {
    # The fixed seed keeps the jittered ranking reproducible. Save the
    # caller's RNG state first and restore it on exit so that calling this
    # function does not silently reseed the user's session.
    global_env <- globalenv()
    old_seed <- get0(".Random.seed", envir = global_env, inherits = FALSE)
    on.exit({
      if (is.null(old_seed)) {
        if (exists(".Random.seed", envir = global_env, inherits = FALSE)) {
          rm(list = ".Random.seed", envir = global_env)
        }
      } else {
        assign(".Random.seed", old_seed, envir = global_env)
      }
    }, add = TRUE)

    set.seed(99)
    # `base::` makes explicit that this is the function, not the logical
    # `jitter` argument of the same name.
    k1[, tf_idf := base::jitter(tf_idf)]
  }

  k1 <- k1[txtf >= min_txtf]

  # head() instead of `[1:n]`: a document with fewer than n candidates must
  # not be padded with NA rows (they would surface as "NA" key phrases).
  k1 <- k1[, .SD[utils::head(order(-tf_idf), n)], by = doc_id]

  # Rename by name rather than by column position.
  setnames(k1, key_var, "keyphrases")

  # Documents are ordered by their numeric id; doc_ids that cannot be
  # converted to a number become NA here (with a coercion warning).
  k1 <- k1[order(as.numeric(doc_id))]

  if (isTRUE(flatten)) {
    k1 <- k1[, list(keyphrases = paste(keyphrases, collapse = " | ")),
             by = list(doc_id)]
  }

  # Requested columns that are not in the result yet come from the metadata.
  if (length(setdiff(include, colnames(k1))) > 0) {
    if (is.null(corp$meta)) {
      stop("Column(s) not available and `corp` has no `meta` element: ",
           paste(setdiff(include, colnames(k1)), collapse = ", "),
           call. = FALSE)
    }
    # as.data.table() copies, so the caller's metadata is not converted by
    # reference as a side effect.
    meta <- as.data.table(corp$meta)
    k1 <- k1[meta, on = "doc_id", nomatch = 0]
  }

  still_missing <- setdiff(include, colnames(k1))
  if (length(still_missing) > 0) {
    stop("Column(s) in `include` not found: ",
         paste(still_missing, collapse = ", "), call. = FALSE)
  }

  k1[, c(include, "keyphrases"), with = FALSE]
}
# jaytimm/corpuslingr documentation built on May 29, 2019, 1:01 a.m.