# R/ngramComponents.R
#
# Internal functions defined and documented here: cleantext, textformat,
# ctxpand, stemmer, stemexcept, overlaps, doublestacker, vocabmatcher,
# usWords, cleanpunct.

############################################################################
# Underlying functions for Tokenizer
############################################################################
#' Text Cleaner
#' @description Internal helper that lowercases, formats, and cleans a text vector.
#' @param text character Vector of strings to clean.
#' @param language character Language to use for cleaning. Default is "english".
#' @param punct logical Should punctuation be kept as tokens? Default is FALSE.
#' @param stop.words logical Should stop words be kept? Default is TRUE. A character vector of custom stop words to remove may also be supplied.
#' @param number.words logical Should numbers be converted to words? Default is TRUE.
#' @return character Vector of cleaned strings.
#' @keywords internal
cleantext<-function(text, language="english", punct=FALSE,
                    stop.words=TRUE, number.words=TRUE){
  #PUTS ALL LETTERS IN LOWER CASE
  text<-tolower(text)
  text<-textformat(text, punct)
  #EXPANDS CONTRACTIONS
  if(language=="english"){
    text<-ctxpand(text)
  }
  #DELETES PUNCTUATION & HTML JUNK
  text<-gsub("[[:punct:]]", " ", text,perl=TRUE)
  #DELETES STOP WORDS
  if(length(stop.words)>1){
    text<-tm::removeWords(text, stop.words)
  }else if(!stop.words){
    text<-tm::removeWords(text, tm::stopwords(language))
  }
  if(number.words){
    #CONVERTS NUMERIC TOKENS INTO WORDS, ELEMENT-WISE ACROSS THE VECTOR
    text<-vapply(text, function(tx){
      tx<-strsplit(tx," ")[[1]]
      nx<-suppressWarnings(as.numeric(tx))
      tx[!is.na(nx)]<-as.character(english::as.english(nx[!is.na(nx)]))
      paste(tx,collapse=" ")
    }, character(1), USE.NAMES=FALSE)
  } else {  #DELETES NUMBERS
    text<-tm::removeNumbers(text)
  }

  text<-tm::stripWhitespace(text)
  return(as.character(text))
}
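
# Usage sketch (illustrative, kept as a comment so nothing runs at load time;
# requires the tm and english packages, as above):
# cleantext("Can't wait!! It's 4 real", punct=TRUE)
# #> roughly "cannot wait xmark it is four real"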

############################################################################
#' Text Formatter
#' @description Internal helper that strips URLs, normalizes quotes and laughter, and collapses repeated punctuation.
#' @param text character Vector of strings to clean.
#' @param punct logical Should punctuation be kept as tokens? Default is FALSE.
#' @return character Vector of cleaned strings.
#' @keywords internal
textformat<-function(text, punct=FALSE){
  text <- gsub(" ?(f|ht)tp(s?)://(.*)[.][a-z]+", "", text,perl=TRUE)
  text <- gsub("www.(.*)[.][a-z]+", "", text,perl=TRUE)
  text <- gsub("\u201D", "\"", text,perl=TRUE)
  text <- gsub("\u201C", "\"", text,perl=TRUE)
  text <- gsub("\u2019", "\'", text,perl=TRUE)

  text<-gsub("ha ha"," haha ",text,fixed=TRUE)
  text<-gsub("lol "," haha ",text,fixed=TRUE)
  text<-gsub("lol."," haha.",text,fixed=TRUE)
  text<-gsub("lol!"," haha!",text,fixed=TRUE)
  text<-gsub("Lol "," haha ",text,fixed=TRUE)
  text<-gsub("Lol."," haha.",text,fixed=TRUE)
  text<-gsub("Lol!"," haha!",text,fixed=TRUE)
  text<-gsub("LOL"," haha ",text,fixed=TRUE)
  text<-gsub("LOl"," haha ",text,fixed=TRUE)
  text<-gsub("LOl"," haha ",text,fixed=TRUE)
  text<-gsub("LoL"," haha ",text,fixed=TRUE)
  text<-gsub("ROFL"," haha ",text,fixed=TRUE)
  text<-gsub("rofl"," haha ",text,fixed=TRUE)
  for (x in 1:8){
    text<-gsub(".?","?",text,fixed=TRUE)
    text<-gsub("?.","?",text,fixed=TRUE)
    text<-gsub("!?","?",text,fixed=TRUE)
    text<-gsub("?!","?",text,fixed=TRUE)
    text<-gsub("??","?",text,fixed=TRUE)
    text<-gsub("!!","!",text,fixed=TRUE)
  }
  if(punct){
    text<-gsub("!"," xmark.",text,fixed=TRUE)
    text<-gsub("?"," qmark.",text,fixed=TRUE)
  }
  text<-gsub("||",". ",text,fixed=TRUE)
  text<-gsub("|",". ",text,fixed=TRUE)
  text<-gsub("[[:cntrl:]]", " ", text,perl=TRUE)
  return(text)
}
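
# Usage sketch (illustrative, kept as a comment): repeated punctuation is
# collapsed and, with punct=TRUE, terminal marks become pseudo-word tokens;
# leftover whitespace is stripped later by cleantext().
# textformat("so funny lol!! right??", punct=TRUE)
# #> roughly "so funny haha xmark. right qmark."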

############################################################################
#' Contraction Expander
#' @description Internal helper that expands common English contractions and protects some abbreviations.
#' @param text character vector of sentences to un-contract.
#' @return character Vector of sentences without contractions.
#' @keywords internal
ctxpand<-function(text){
  text <- gsub("let's", "let us", text, fixed=TRUE)
  text <- gsub("i'm", "i am", text, fixed=TRUE)
  text <- gsub("won't", "will not", text, fixed=TRUE)
  text <- gsub("can't", "cannot", text, fixed=TRUE)
  text <- gsub("Let's", "Let us", text, fixed=TRUE)
  text <- gsub("I'm", "I am", text, fixed=TRUE)
  text <- gsub("Won't", "Will not", text, fixed=TRUE)
  text <- gsub("Can't", "Cannot", text, fixed=TRUE)
  text <- gsub("shan't", "shall not", text, fixed=TRUE)
  text <- gsub("'d", " would", text, fixed=TRUE)
  text <- gsub("'ve", " have", text, fixed=TRUE)
  text <- gsub("'s", " is", text, fixed=TRUE)
  text <- gsub("'ll", " will", text, fixed=TRUE)
  text <- gsub("'re", " are", text, fixed=TRUE)
  text <- gsub("n't", " not", text, fixed=TRUE)
  text <- gsub("u.s.", "US", text, fixed=TRUE)
  text <- gsub("U.S.", "US", text, fixed=TRUE)
  text <- gsub("e.g.", "eg", text, fixed=TRUE)
  text <- gsub("i.e.", "ie", text, fixed=TRUE)
  return(text)
}
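
# Usage sketch (illustrative, kept as a comment). Specific forms such as
# "won't" are expanded before the generic suffix rules can reach them:
# ctxpand("I'm sure they'll say it won't work")
# #> "I am sure they will say it will not work"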

############################################################################
#' Stemmer
#' @description Internal helper that tokenizes and stems a string.
#' @param text character vector of strings to clean.
#' @param wstem character Which words should be stemmed? Defaults to "all".
#' @param language Language for stemming. Default is "english".
#' @return character Vector of stemmed tokens.
#' @keywords internal
stemmer<-function(text, wstem="all", language="english"){
  if(is.na(text) || nchar(text)<=2){
    return(text)
  }else{
    xes<-(strsplit(text, split=" ")[[1]])
    xes<-xes[which(nchar(xes)>0)]
    if(length(wstem)>1) xes<-sapply(xes, function(x) stemexcept(x, wstem, language), USE.NAMES=F)
    if(wstem[1]=="all") xes<-sapply(xes, SnowballC::wordStem, language=language, USE.NAMES=F)
    return(xes)
  }
}
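
# Usage sketch (illustrative, kept as a comment; requires SnowballC). Note
# that the return value is a vector of stemmed tokens, not a pasted sentence:
# stemmer("running quickly jumped")
# #> roughly c("run", "quick", "jump")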

############################################################################
#' Conditional Stemmer
#' @description Internal helper that stems words, skipping a list of exceptions.
#' @param sentence character A sentence whose words will be stemmed.
#' @param excepts character Vector of words that should not be stemmed.
#' @param language Language for stemming. Default is "english".
#' @return Sentence of stemmed words.
#' @keywords internal
stemexcept<-function(sentence, excepts, language="english"){
  words<-strsplit(sentence, split=" ")[[1]]
  SS<-which(!(words %in% excepts))
  words[SS]<-SnowballC::wordStem(words[SS], language)
  return(paste(words, collapse=" "))
}
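
# Usage sketch (illustrative, kept as a comment): words in `excepts` pass
# through unstemmed.
# stemexcept("running quickly jumped", excepts="quickly")
# #> roughly "run quickly jump"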
############################################################################
#' Overlap cleaner
#' @description Internal helper that prunes lower-order n-grams whose counts overlap with higher-order n-grams.
#' @param high matrix Token counts that will all be kept.
#' @param low matrix Token counts that will be evaluated (and pruned) for overlapping.
#' @param cutoff numeric Threshold (as cosine similarity) above which overlapping tokens are dropped. Default is 1 (i.e. all tokens included).
#' @param verbose logical Should overlapping token pairs be reported? Default is FALSE.
#' @return Combined token count matrix.
#' @keywords internal
overlaps<-function(high, low, cutoff=1,verbose=FALSE){
  if(cutoff==1){
    combined<-cbind(high,low)
  } else {
    hM=as.matrix(high)
    keep=rep(TRUE,ncol(low))
    xnames=unlist(colnames(low))
    ynames=unlist(colnames(high))
    for(x in 1:ncol(low)){
      lV=as.vector(low[,x])
      yflag="go"
      y=1
      while(yflag!="break"){
        if(xnames[x]%in%strsplit(ynames[y],"_")[[1]]){
          hV=as.vector(hM[,y])
          cossim=sum(lV*hV)/(sqrt(sum(lV^2))*sqrt(sum(hV^2)))
          if(cossim>cutoff){
            if(verbose) message(paste(xnames[x],ynames[y]))
            keep[x]=FALSE
            yflag="break"
          }
        }
        if(y==ncol(high)){
          yflag="break"
        }
        y=y+1
      }
    }
    combined<-cbind(low[,keep,drop=FALSE],high)
  }
  return(combined)
}
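
# Usage sketch (illustrative, kept as a comment): a unigram column whose
# counts are nearly collinear with a containing n-gram column is pruned.
# low  <- matrix(c(1,0,2), ncol=1, dimnames=list(NULL, "good"))
# high <- matrix(c(1,0,2), ncol=1, dimnames=list(NULL, "not_good"))
# overlaps(high, low, cutoff=0.9)
# #> only the "not_good" column remains (cosine similarity 1 > 0.9)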

############################################################################
#' Doublestacker
#' @description Internal helper that condenses token counts with duplicated column names.
#' @param wdcts matrix Token counts that will have doubled column names condensed.
#' @return Token count matrix with no doubled column names.
#' @keywords internal
doublestacker<-function (wdcts){
  words<- colnames(wdcts)
  if(sum(duplicated(words))>0){
    for (Q in words[duplicated(words)]) {
      wdcts[, (words== Q) & (!duplicated(words))] <- as.numeric(rowSums(wdcts[,(words== Q)]))
      wdcts[, ((words== Q) & (duplicated(words)))] <- NA
    }
    wdcts<-wdcts[, !is.na(colMeans(wdcts))]
  }
  return(wdcts)
}
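
# Usage sketch (illustrative, kept as a comment): duplicated columns are
# summed into the first occurrence and the extras are dropped.
# m <- matrix(c(1,2, 3,4, 5,6), nrow=2, dimnames=list(NULL, c("a","b","a")))
# doublestacker(m)
# #> two columns: a = c(6,8), b = c(3,4)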
############################################################################

############################################################################
#' Feature Count Matcher
#' @description Internal helper that aligns new token counts to a model's vocabulary.
#' @param hole matrix Token counts in model data.
#' @param peg matrix Token counts in new data.
#' @return Token counts matrix from new data, with column names that match the model data.
#' @keywords internal
vocabmatcher<-function(hole, peg){
  peg<-doublestacker(peg)
  newpeg<-array(0, c(nrow(peg), ncol(hole)))
  for (i in 1:ncol(newpeg)){
    if(colnames(hole)[i] %in% colnames(peg)){
      newpeg[,i]<-as.vector(peg[,which(colnames(peg)==colnames(hole)[i])])
    }
  }
  dimnames(newpeg)<-list(rownames(peg), colnames(hole))
  newpeg<-quanteda::as.dfm(newpeg)
  return(newpeg)
}
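
# Usage sketch (illustrative, kept as a comment; requires quanteda). New-data
# columns are re-aligned to the model vocabulary: tokens the model never saw
# ("gamma") are dropped, and model tokens absent from the new data get zeros.
# model <- matrix(0, 1, 2, dimnames=list("d1", c("alpha","beta")))
# new   <- matrix(c(3,7), 1, 2, dimnames=list("n1", c("beta","gamma")))
# vocabmatcher(model, new)
# #> 1 x 2 dfm with alpha = 0 and beta = 3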
############################################################################

#' UK to US conversion
#' @description Internal helper that converts UK spellings to US spellings.
#' @param text character Vector of strings to convert to US spelling.
#' @return character Vector of Americanized strings.
#' @keywords internal
usWords<-function(text){
  toks <- quanteda::tokens(text)
  tokUS <- quanteda::tokens_lookup(toks, doc2concrete::uk2us,
                                   exclusive=FALSE, capkeys=FALSE)
  sentUS <- unlist(lapply(tokUS, paste, collapse=" "), use.names=FALSE)
  return(sentUS)
}
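
# Usage sketch (illustrative, kept as a comment; the exact output depends on
# the entries in the doc2concrete::uk2us dictionary):
# usWords("the colour of my favourite flavour")
# #> roughly "the color of my favorite flavor"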

#' Cleaning weird encodings
#' @description Handles curly quotes, umlauts, etc.
#' @param text character Vector of strings to clean.
#' @return character Vector of clean strings.
#' @keywords internal
cleanpunct<-function(text){
  text<-gsub("[\u201C\u201D\u201E\u201F\u2033\u2036]", '"', text)
  text<-gsub("[\u2018\u2019\u201A\u201B\u2032\u2035]", "'", text)
  text<-stringi::stri_trans_general(text, "latin-ascii")
  return(text)
}
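
# Usage sketch (illustrative, kept as a comment; requires stringi): curly
# quotes are normalized, then remaining non-ASCII characters transliterated.
# cleanpunct("caf\u00E9 said \u201Chello\u201D")
# #> cafe said "hello"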
