R/stem_complete.R

Defines functions stem_complete

#' @export stem_complete
stem_complete <- function(txt,lang = "en"){
# save(file="../user_data/dom",list=ls())
  # load("c:/Users/Dominique/Desktop/Stat_Regie/data/user_data/dom")
  txt0<-txt
  lang2<-plyr::mapvalues(lang,c("en", "fr","ge","sp","it","pt","ru","du")
                  ,c("english","french","german","spanish","italian","portuguese","russian","dutch"))
  library(stringr)
  library(dplyr)
  library(stringdist)
  library(data.table)
 # save(file="dom",list=ls())
  # load("C:/Users/Dominique/Desktop/Stat_Regie/data/application_data/dom")
  txt<-factor(str_split(paste(txt,collapse   = " MOT_SEPARATEUR_DE_VERBATIM ")," ")[[1]])
  txt<-txt[str_trim(txt)!=""]
  length(txt)
  t<-table(txt)
  l <- levels(txt)
  l2<-sapply(l,SnowballC::wordStem,language=lang2)
  out <- data.frame(l=l,l2=l2,t=as.vector(t),stringsAsFactors = FALSE)
  x<-out %>%group_by(l2)%>%summarise(t=max(t))
  x<-inner_join(out,x,by=c("l2","t"))
  x<-dplyr::rename(x,l3=l)
  x<-x%>%dplyr::select(-t)%>%as.data.table
  x[,d:=stringdist(l2,l3,method="osa") + stringdist(l2,l3,method="cosine")/5  + stringdist(l2,l3,method="soundex")/20 ]
  y<-x%>%group_by(l2)%>%summarise(d=min(d))
  x<-inner_join(x,y,by=c("l2","d"))%>%as.data.table
  is(x)
  x<-x%>%dplyr::select(l2,l3)
  setkeyv(x,"l2")
  x<-x%>%unique
  x<-inner_join(out,x,by=c("l2"))%>%as.data.table
  l3<-plyr::mapvalues(l,x$l,x$l3,warn_missing = FALSE)
  l3<-ifelse(is.na(l3),l,l3)
  levels(txt) <- l3
  txt<-as.character(txt)
  txt <- paste(txt,collapse=" ")
  txt <- str_trim(str_split(txt,"MOT_SEPARATEUR_DE_VERBATIM")[[1]])
  return(txt)


}
dominiqueemmanuel/verbatim.utils documentation built on Jan. 20, 2020, 3:16 a.m.