# R/ngramTokens.R
# In doc2concrete: Measuring Concreteness in Natural Language

# Documented in ngramTokens

#' Ngram Tokenizer
#' @description Tally bag-of-words ngram features
#' @param texts character vector of texts.
#' @param wstem character Which words should be stemmed? Defaults to "all".
#' @param ngrams numeric Vector of ngram lengths to be included. Must contain at least one value. Default is 1 (i.e. unigrams only).
#' @param language Language for stemming. Default is "english"
#' @param punct logical Should punctuation be kept as tokens? Default is TRUE
#' @param stop.words logical Should stop words be kept? Default is TRUE
#' @param number.words logical Should numbers be kept as words? Default is TRUE
#' @param per.100 logical Should counts be expressed as frequency per 100 words? Default is FALSE.
#' The word count of each text is the number of alphabetic runs in the original (uncleaned) text.
#' @param overlap numeric Threshold (as cosine distance) for including ngrams that constitute other included phrases. Default is 1 (i.e. all ngrams included).
#' @param sparse maximum feature sparsity for inclusion (1 = include all features). Default is 0.995. Only applied when \code{vocabmatch} is NULL.
#' @param verbose logical Should the package report token counts after each ngram level? Useful for long-running code. Default is FALSE.
#' @param vocabmatch matrix Should the new token count matrix be coerced to include the same tokens as a previous count matrix? Default is NULL (i.e. no token match).
#' @param num.mc.cores numeric number of cores for parallel processing - see parallel::detectCores(). Default is 1.
#' @details This function produces ngram featurizations of text based on the quanteda package. This provides a complement to the doc2concrete function by demonstrating
#' how to build a feature set for training a new detection algorithm in other contexts.
#'
#'
#' @return a matrix of feature counts (or of rates per 100 words when \code{per.100} is TRUE)
#' @examples
#'
#' dim(ngramTokens(feedback_dat$feedback, ngrams=1))
#'
#' dim(ngramTokens(feedback_dat$feedback, ngrams=1:3))
#' @export
ngramTokens<-function(texts,
                      wstem="all",
                      ngrams=1,
                      language="english",
                      punct=TRUE,
                      stop.words=TRUE,
                      number.words=TRUE,
                      per.100=FALSE,
                      overlap=1,
                      sparse=0.995,
                      verbose=FALSE,
                      vocabmatch=NULL,
                      num.mc.cores=1){

  # Fail early with a clear message; otherwise the loop below never runs and
  # the function dies later with "object 'dtm' not found".
  if (length(ngrams) < 1L) {
    stop("'ngrams' must contain at least one ngram length.", call. = FALSE)
  }

  # Clean every text (optionally in parallel) before stemming/tokenizing.
  cleanertext <- unlist(parallel::mclapply(texts, cleantext, language = language,
                                           stop.words = stop.words, punct = punct,
                                           number.words = number.words,
                                           mc.cores = num.mc.cores))

  # Stem once; every ngram level is derived from the same token set.
  stemtokens <- quanteda::tokens(lapply(cleanertext, stemmer,
                                        wstem = wstem, language = language))

  # NULL marks "no ngram level accumulated yet", so the first level seeds the
  # result regardless of which ngram lengths were requested or repeated.
  dtm <- NULL
  for (ng in ngrams) {
    if (ng == 1) {
      ngram_dfm <- quanteda::dfm(stemtokens)
    } else {
      ngram_dfm <- quanteda::dfm(quanteda::tokens_ngrams(stemtokens, ng))
    }

    # Sparsity trimming is skipped when a reference vocabulary is supplied,
    # because the columns are aligned to that vocabulary afterwards.
    if (sparse < 1 && is.null(vocabmatch)) {
      ngram_dfm <- quanteda::dfm_trim(ngram_dfm, sparsity = sparse)
    }

    if (is.null(dtm)) {
      dtm <- ngram_dfm
    } else if (overlap < 1 && !is.null(dim(ngram_dfm))) {
      # Merge the higher-order ngrams into the running matrix, filtering by
      # the overlap cutoff.
      dtm <- overlaps(high = ngram_dfm, low = dtm, cutoff = overlap, verbose = verbose)
    } else {
      dtm <- cbind(dtm, ngram_dfm)
    }

    if (verbose) print(paste(c(ng, "-grams ", dim(dtm)), collapse = " "))
  }

  dtm <- doublestacker(dtm)
  if (!is.null(vocabmatch)) dtm <- vocabmatcher(vocabmatch, dtm)

  if (per.100) {
    wdcts <- stringr::str_count(texts, "[[:alpha:]]+")
    # sweep() divides each row by that text's word count and always keeps the
    # matrix shape. apply(dtm, 2, ...) collapses to a vector for a single text,
    # which as.matrix() then returns transposed.
    # Texts with no alphabetic words have a zero denominator (NaN/Inf).
    dtm <- sweep(100 * as.matrix(dtm), 1, wdcts, "/")
  }
  return(as.matrix(dtm))
}