# R/termfrequency.R
#
# Defines functions: new_termFrequency, TermFrequency
#
# Documented in: TermFrequency

#' Term Frequency 
#' @description a term frequency hash for the tokens and documents of the corpus
#' @param corpus an object inheriting from class Corpus
#' @importFrom purrr flatten
#' 
#' @return an object of class TermFrequency made up of
#' Dict - the hashed tokens with term frequency by document name
#' Labels - the document names from the corpus
#' LabelCounts - the total counts of tokens for each Label
#' @export
#'
#' @examples
#' corpusEx<-Corpus(list('first'=c('a single entry document'),'second'=c('a two entry document','with two separate entries')))
#' termFreqEx <- TermFrequency(corpusEx)
TermFrequency <- function( corpus ){
  if( !is.Corpus( corpus ) ){
    stop('Term Frequency requires corpus to operate on', call. = FALSE)
  }
  return( new_termFrequency( corpus ) )
} 

new_termFrequency <- function( corpus ){
  termfreqDict <- collections::dict()
  labels <- corpus$documentNames
  for( documentName in corpus$documentNames ){
    documentTokens <- flatten(corpus$tokens[[documentName]])
    for( token in documentTokens ) {
      key <- c( token, documentName )
      termfreqDict$set(key, (termfreqDict$get(key, 0) + 1))
    }
  }
  totals <- corpus$tokenCounts
  return( structure( list( 'Dict' = termfreqDict, 'Labels' = labels, 'LabelCounts' = totals ), class = 'TermFrequency' ) )
}
# duncankmckinnon/ezRnlp documentation built on Aug. 6, 2020, 1:28 a.m.