R/dropRedundantTags.R

Defines functions dropRedundantTags

Documented in dropRedundantTags

#' Tag Dropper
#' @description Drop redundant part-of-speech tags. Each element of
#'   \code{pos_lists} holds tagged tokens of the form \code{token_TAG}. A tag
#'   is kept only when the tagged token is frequent (see \code{sparse}) and at
#'   least one other frequent tagged token shares the same bare token; every
#'   other tagged token is replaced by its bare token (the trailing
#'   \code{_TAG}, an underscore followed by capital letters, is removed).
#' @param pos_lists a list of character vectors of tagged tokens, one vector
#'   per document
#' @param sparse numeric; a tagged token counts as frequent when its total
#'   number of occurrences across all of \code{pos_lists} is greater than
#'   \code{length(pos_lists) * (1 - sparse)}
#' @return a list of character vectors, parallel to \code{pos_lists}, with the
#'   redundant tags removed
#' @importFrom magrittr %>%
#' @import dplyr
dropRedundantTags <- function(pos_lists, sparse = 0.99) {
  # as.character() turns the NULL that unlist() gives for empty input into
  # character(0), so everything below also works when there are no tokens.
  all_raw <- as.character(unlist(pos_lists, use.names = FALSE))

  # One entry per distinct tagged token, with its corpus-wide count and the
  # bare token left after removing a trailing "_TAG".
  uniq_raw <- unique(all_raw)
  counts <- tabulate(match(all_raw, uniq_raw), nbins = length(uniq_raw))
  uniq_token <- sub("_[A-Z]+$", "", uniq_raw)

  # A tag is informative only if the token shows up with more than one
  # frequent tagged form; duplicated() is evaluated among frequent forms only.
  frequent <- counts > length(pos_lists) * (1 - sparse)
  frequent_tokens <- uniq_token[frequent]
  ambiguous <- frequent_tokens[duplicated(frequent_tokens)]
  keep <- frequent & uniq_token %in% ambiguous

  switch_from <- uniq_raw[!keep]
  switch_to <- uniq_token[!keep]

  # Replace every value found in switch_from by its bare token; values that
  # are not found are left untouched.
  strip_tags <- function(values) {
    idx <- match(values, switch_from)
    found <- !is.na(idx)
    values[found] <- switch_to[idx[found]]
    values
  }

  lapply(pos_lists, function(x) {
    if (is.factor(x)) {
      # Remap the levels rather than the values so the result stays a factor.
      levels(x) <- strip_tags(levels(x))
      return(x)
    }
    # unlist() is the identity for atomic vectors; it is kept so a document
    # supplied as a list of strings is still flattened as before.
    unlist(strip_tags(x))
  })
}
myeomans/DTMtools documentation built on March 2, 2020, 8:57 p.m.