# R/ngramComponents.R
#
# Internal functions defined and documented here: cleantext, textformat,
# ctxpand, stemmer, stemexcept, overlaps, doublestacker, vocabmatcher,
# usWords, cleanpunct.

############################################################################
# Underlying functions for Tokenizer
############################################################################
#' Text Cleaner
#' @description Internal helper that lowercases, formats, and cleans a text vector.
#' @param text character Vector of strings to clean.
#' @param language character Language to use for cleaning. Default is "english".
#' @param punct logical Should punctuation be kept as tokens? Default is FALSE.
#' @param stop.words logical Should stop words be kept? Default is TRUE. A character vector of custom stop words to remove may also be supplied.
#' @param number.words logical Should numbers be converted to words? Default is TRUE.
#' @return character Vector of cleaned strings.
#' @keywords internal
cleantext<-function(text, language="english", punct=FALSE,
                    stop.words=TRUE, number.words=TRUE){
  #PUTS ALL LETTERS IN LOWER CASE
  text<-tolower(text)
  text<-textformat(text, punct)
  #EXPANDS CONTRACTIONS
  if(language=="english"){
    text<-ctxpand(text)
  }
  #DELETES PUNCTUATION & HTML JUNK
  text<-gsub("[[:punct:]]", " ", text,perl=TRUE)
  #DELETES STOP WORDS
  if(length(stop.words)>1){
    text<-tm::removeWords(text, stop.words)
  }else if(!stop.words){
    text<-tm::removeWords(text, tm::stopwords(language))
  }
  if(number.words){
    #CONVERTS NUMERIC TOKENS INTO WORDS, ELEMENT-WISE ACROSS THE VECTOR
    text<-vapply(text, function(tx){
      tx<-strsplit(tx," ")[[1]]
      nx<-suppressWarnings(as.numeric(tx))
      tx[!is.na(nx)]<-as.character(english::as.english(nx[!is.na(nx)]))
      paste(tx,collapse=" ")
    }, character(1), USE.NAMES=FALSE)
  } else {  #DELETES NUMBERS
    text<-tm::removeNumbers(text)
  }

  text<-tm::stripWhitespace(text)
  return(as.character(text))
}
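
# Usage sketch (illustrative, kept as a comment so nothing runs at load time;
# requires the tm and english packages, as above):
# cleantext("Can't wait!! It's 4 real", punct=TRUE)
# #> roughly "cannot wait xmark it is four real"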

############################################################################
#' Text Formatter
#' @description Internal helper that strips URLs, normalizes quotes and laughter, and collapses repeated punctuation.
#' @param text character Vector of strings to clean.
#' @param punct logical Should punctuation be kept as tokens? Default is FALSE.
#' @return character Vector of cleaned strings.
#' @keywords internal
textformat<-function(text, punct=FALSE){
  text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", text,perl=TRUE)
  text <- gsub("www.(.*)[.][a-z]+", "", text,perl=TRUE)
  text <- gsub("\u201D", "\"", text,perl=TRUE)
  text <- gsub("\u201C", "\"", text,perl=TRUE)
  text <- gsub("\u2019", "\'", text,perl=TRUE)

  text<-gsub("ha ha"," haha ",text,fixed=TRUE)
  text<-gsub("lol "," haha ",text,fixed=TRUE)
  text<-gsub("lol."," haha.",text,fixed=TRUE)
  text<-gsub("lol!"," haha!",text,fixed=TRUE)
  text<-gsub("Lol "," haha ",text,fixed=TRUE)
  text<-gsub("Lol."," haha.",text,fixed=TRUE)
  text<-gsub("Lol!"," haha!",text,fixed=TRUE)
  text<-gsub("LOL"," haha ",text,fixed=TRUE)
  text<-gsub("LOl"," haha ",text,fixed=TRUE)
  text<-gsub("LOl"," haha ",text,fixed=TRUE)
  text<-gsub("LoL"," haha ",text,fixed=TRUE)
  text<-gsub("ROFL"," haha ",text,fixed=TRUE)
  text<-gsub("rofl"," haha ",text,fixed=TRUE)
  for (x in 1:8){
    text<-gsub(".?","?",text,fixed=TRUE)
    text<-gsub("?.","?",text,fixed=TRUE)
    text<-gsub("!?","?",text,fixed=TRUE)
    text<-gsub("?!","?",text,fixed=TRUE)
    text<-gsub("??","?",text,fixed=TRUE)
    text<-gsub("!!","!",text,fixed=TRUE)
  }
  if(punct){
    text<-gsub("!"," xmark.",text,fixed=TRUE)
    text<-gsub("?"," qmark.",text,fixed=TRUE)
  }
  text<-gsub("||",". ",text,fixed=TRUE)
  text<-gsub("|",". ",text,fixed=TRUE)
  text<-gsub("[[:cntrl:]]", " ", text,perl=TRUE)
  return(text)
}
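
# Usage sketch (illustrative, kept as a comment): repeated punctuation is
# collapsed and, with punct=TRUE, terminal marks become pseudo-word tokens;
# leftover whitespace is stripped later by cleantext().
# textformat("so funny lol!! right??", punct=TRUE)
# #> roughly "so funny haha xmark. right qmark."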

############################################################################
#' Contraction Expander
#' @description Internal helper that expands common English contractions and protects some abbreviations.
#' @param text character vector of sentences to un-contract.
#' @return character Vector of sentences without contractions.
#' @keywords internal
ctxpand<-function(text){
  text <- gsub("let's", "let us", text, fixed=TRUE)
  text <- gsub("i'm", "i am", text, fixed=TRUE)
  text <- gsub("won't", "will not", text, fixed=TRUE)
  text <- gsub("can't", "cannot", text, fixed=TRUE)
  text <- gsub("Let's", "Let us", text, fixed=TRUE)
  text <- gsub("I'm", "I am", text, fixed=TRUE)
  text <- gsub("Won't", "Will not", text, fixed=TRUE)
  text <- gsub("Can't", "Cannot", text, fixed=TRUE)
  text <- gsub("shan't", "shall not", text, fixed=TRUE)
  text <- gsub("'d", " would", text, fixed=TRUE)
  text <- gsub("'ve", " have", text, fixed=TRUE)
  text <- gsub("'s", " is", text, fixed=TRUE)
  text <- gsub("'ll", " will", text, fixed=TRUE)
  text <- gsub("'re", " are", text, fixed=TRUE)
  text <- gsub("n't", " not", text, fixed=TRUE)
  text <- gsub("u.s.", "US", text, fixed=TRUE)
  text <- gsub("U.S.", "US", text, fixed=TRUE)
  text <- gsub("e.g.", "eg", text, fixed=TRUE)
  text <- gsub("i.e.", "ie", text, fixed=TRUE)
  return(text)
}
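
# Usage sketch (illustrative, kept as a comment). Specific forms such as
# "won't" are expanded before the generic suffix rules can reach them:
# ctxpand("I'm sure they'll say it won't work")
# #> "I am sure they will say it will not work"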

############################################################################
#' Stemmer
#' @description Internal helper that tokenizes and stems a string.
#' @param text character vector of strings to clean.
#' @param wstem character Which words should be stemmed? Defaults to "all".
#' @param language Language for stemming. Default is "english".
#' @return character Vector of stemmed tokens.
#' @keywords internal
stemmer<-function(text, wstem="all", language="english"){
  if(is.na(text) || nchar(text)<=2){
    return(text)
  }else{
    xes<-(strsplit(text, split=" ")[[1]])
    xes<-xes[which(nchar(xes)>0)]
    if(length(wstem)>1) xes<-sapply(xes, function(x) stemexcept(x, wstem, language), USE.NAMES=F)
    if(wstem[1]=="all") xes<-sapply(xes, SnowballC::wordStem, language=language, USE.NAMES=F)
    return(xes)
  }
}
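
# Usage sketch (illustrative, kept as a comment; requires SnowballC). Note
# that the return value is a vector of stemmed tokens, not a pasted sentence:
# stemmer("running quickly jumped")
# #> roughly c("run", "quick", "jump")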

############################################################################
#' Conditional Stemmer
#' @description Internal helper that stems words, skipping a list of exceptions.
#' @param sentence character A sentence whose words will be stemmed.
#' @param excepts character Vector of words that should not be stemmed.
#' @param language Language for stemming. Default is "english".
#' @return Sentence of stemmed words.
#' @keywords internal
stemexcept<-function(sentence, excepts, language="english"){
  words<-strsplit(sentence, split=" ")[[1]]
  SS<-which(!(words %in% excepts))
  words[SS]<-SnowballC::wordStem(words[SS], language)
  return(paste(words, collapse=" "))
}
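
# Usage sketch (illustrative, kept as a comment): words in `excepts` pass
# through unstemmed.
# stemexcept("running quickly jumped", excepts="quickly")
# #> roughly "run quickly jump"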
############################################################################
#' Overlap cleaner
#' @description Internal helper that prunes lower-order n-grams whose counts overlap with higher-order n-grams.
#' @param high matrix Token counts that will all be kept.
#' @param low matrix Token counts that will be evaluated (and pruned) for overlapping.
#' @param cutoff numeric Threshold (as cosine similarity) above which overlapping tokens are dropped. Default is 1 (i.e. all tokens included).
#' @param verbose logical Should overlapping token pairs be reported? Default is FALSE.
#' @return Combined token count matrix.
#' @keywords internal
overlaps<-function(high, low, cutoff=1,verbose=FALSE){
  if(cutoff==1){
    combined<-cbind(high,low)
  } else {
    hM=as.matrix(high)
    keep=rep(TRUE,ncol(low))
    xnames=unlist(colnames(low))
    ynames=unlist(colnames(high))
    for(x in 1:ncol(low)){
      lV=as.vector(low[,x])
      yflag="go"
      y=1
      while(yflag!="break"){
        if(xnames[x]%in%strsplit(ynames[y],"_")[[1]]){
          hV=as.vector(hM[,y])
          cossim=sum(lV*hV)/(sqrt(sum(lV^2))*sqrt(sum(hV^2)))
          if(cossim>cutoff){
            if(verbose) message(paste(xnames[x],ynames[y]))
            keep[x]=FALSE
            yflag="break"
          }
        }
        if(y==ncol(high)){
          yflag="break"
        }
        y=y+1
      }
    }
    combined<-cbind(low[,keep,drop=FALSE],high)
  }
  return(combined)
}
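
# Usage sketch (illustrative, kept as a comment): a unigram column whose
# counts are nearly collinear with a containing n-gram column is pruned.
# low  <- matrix(c(1,0,2), ncol=1, dimnames=list(NULL, "good"))
# high <- matrix(c(1,0,2), ncol=1, dimnames=list(NULL, "not_good"))
# overlaps(high, low, cutoff=0.9)
# #> only the "not_good" column remains (cosine similarity 1 > 0.9)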

############################################################################
#' Doublestacker
#' @description Internal helper that condenses token counts with duplicated column names.
#' @param wdcts matrix Token counts that will have doubled column names condensed.
#' @return Token count matrix with no doubled column names.
#' @keywords internal
doublestacker<-function (wdcts){
  words<- colnames(wdcts)
  if(sum(duplicated(words))>0){
    for (Q in words[duplicated(words)]) {
      wdcts[, (words== Q) & (!duplicated(words))] <- as.numeric(rowSums(wdcts[,(words== Q)]))
      wdcts[, ((words== Q) & (duplicated(words)))] <- NA
    }
    wdcts<-wdcts[, !is.na(colMeans(wdcts))]
  }
  return(wdcts)
}
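
# Usage sketch (illustrative, kept as a comment): duplicated columns are
# summed into the first occurrence and the extras are dropped.
# m <- matrix(c(1,2, 3,4, 5,6), nrow=2, dimnames=list(NULL, c("a","b","a")))
# doublestacker(m)
# #> two columns: a = c(6,8), b = c(3,4)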
############################################################################

############################################################################
#' Feature Count Matcher
#' @description Internal helper that aligns new token counts to a model's vocabulary.
#' @param hole matrix Token counts in model data.
#' @param peg matrix Token counts in new data.
#' @return Token counts matrix from new data, with column names that match the model data.
#' @keywords internal
vocabmatcher<-function(hole, peg){
  peg<-doublestacker(peg)
  newpeg<-array(0, c(nrow(peg), ncol(hole)))
  for (i in 1:ncol(newpeg)){
    if(colnames(hole)[i] %in% colnames(peg)){
      newpeg[,i]<-as.vector(peg[,which(colnames(peg)==colnames(hole)[i])])
    }
  }
  dimnames(newpeg)<-list(rownames(peg), colnames(hole))
  newpeg<-quanteda::as.dfm(newpeg)
  return(newpeg)
}
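
# Usage sketch (illustrative, kept as a comment; requires quanteda). New-data
# columns are re-aligned to the model vocabulary: tokens the model never saw
# ("gamma") are dropped, and model tokens absent from the new data get zeros.
# model <- matrix(0, 1, 2, dimnames=list("d1", c("alpha","beta")))
# new   <- matrix(c(3,7), 1, 2, dimnames=list("n1", c("beta","gamma")))
# vocabmatcher(model, new)
# #> 1 x 2 dfm with alpha = 0 and beta = 3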
############################################################################

#' UK to US conversion
#' @description Internal helper that converts UK spellings to US spellings.
#' @param text character Vector of strings to convert to US spelling.
#' @return character Vector of Americanized strings.
#' @keywords internal
usWords<-function(text){
  toks <- quanteda::tokens(text)
  tokUS <- quanteda::tokens_lookup(toks, doc2concrete::uk2us,
                                   exclusive=FALSE, capkeys=FALSE)
  sentUS <- unlist(lapply(tokUS, paste, collapse=" "), use.names=FALSE)
  return(sentUS)
}
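
# Usage sketch (illustrative, kept as a comment; the exact output depends on
# the entries in the doc2concrete::uk2us dictionary):
# usWords("the colour of my favourite flavour")
# #> roughly "the color of my favorite flavor"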

#' Cleaning weird encodings
#' @description Handles curly quotes, umlauts, etc.
#' @param text character Vector of strings to clean.
#' @return character Vector of clean strings.
#' @keywords internal
cleanpunct<-function(text){
  text<-gsub("[\u201C\u201D\u201E\u201F\u2033\u2036]", '"', text)
  text<-gsub("[\u2018\u2019\u201A\u201B\u2032\u2035]", "'", text)
  text<-stringi::stri_trans_general(text, "latin-ascii")
  return(text)
}
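
# Usage sketch (illustrative, kept as a comment; requires stringi): curly
# quotes are normalized, then remaining non-ASCII characters transliterated.
# cleanpunct("caf\u00E9 said \u201Chello\u201D")
# #> cafe said "hello"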
