R/FindSimilarString.R

Defines functions FindSimilarString

#' Find groups of similar (possibly mistyped) strings in a vector
#'
#' Computes the pairwise generalized edit distance (`utils::adist()`) between
#' the unique non-missing values of `x`. For every unique value a candidate
#' group is built from all values whose distance to it is at most
#' `round(nchar(value) * FR)`, where `nchar(value)` is the length of the
#' candidate member. Groups with more than one member are reported as
#' potential problems and drawn as a horizontal bar chart of their
#' frequencies in `x`.
#'
#' @param x Vector of values to inspect; coerced with `as.character()`.
#' @param FR Fraction of a string's length used as its edit-distance
#'   tolerance (rounded to an integer number of edits).
#' @param ignore.case Passed to `utils::adist()`; if `TRUE`, case is ignored
#'   when computing distances.
#' @return Invisibly, `NULL` when no group of similar strings is found (a
#'   message is emitted). Otherwise, invisibly, a list with `PBM` (a list of
#'   character vectors, one per group) and `PBM.Plot` (a ggplot object).
#'   A summary is printed with `cat()` in the latter case.
FindSimilarString <- function(x, FR = 0.2, ignore.case = TRUE) {
  x <- as.character(x)
  # Unique non-missing values, computed once and kept as a plain character
  # vector (no `na.omit` attributes).
  values <- unique(x[!is.na(x)])

  if (any(nchar(values) <= 3)) {
    message('Warning message: If a length of string is too low, it may work incorrectly')
  }
  message('Calculating..')

  groups <- list()
  # A group needs at least two distinct values, so the distance matrix is only
  # computed when there is something to compare. This also makes empty and
  # all-NA input safe.
  if (length(values) > 1L) {
    dist_mat <- utils::adist(values, ignore.case = ignore.case)
    # Per-value tolerance. Kept in its own vector rather than as an extra
    # data-frame column, which would clash with a value literally named
    # "nchar".
    tolerance <- round(nchar(values) * FR)
    groups <- lapply(
      seq_along(values),
      function(i) values[dist_mat[, i] <= tolerance]
    )
    is_group <- vapply(groups, function(g) length(g) > 1L, logical(1))
    groups <- unique(groups[is_group])
  }

  if (length(groups) == 0L) {
    message('No problems found on this variable.')
    return(invisible(NULL))
  }

  # ggplot2 is only needed once there is something to plot. It is neither
  # installed nor attached here; a missing package is reported as an error.
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop(
      "Package 'ggplot2' is required to draw the plot; ",
      "install it with install.packages('ggplot2').",
      call. = FALSE
    )
  }

  plotdat <- data.frame(
    table(x[x %in% unlist(groups)]),
    stringsAsFactors = FALSE
  )
  plotdat$fill <- NA_integer_
  # A value belonging to several groups is coloured by the last one.
  for (i in seq_along(groups)) {
    plotdat$fill[plotdat$Var1 %in% groups[[i]]] <- i
  }

  g <- ggplot2::ggplot(plotdat) +
    ggplot2::geom_bar(
      ggplot2::aes(x = reorder(Var1, fill), fill = factor(fill), y = Freq),
      stat = 'identity'
    ) +
    ggplot2::labs(x = 'Values', y = 'Freq') +
    ggplot2::guides(fill = "none") +
    ggplot2::coord_flip()

  cat('Variable Summary')
  cat('\nCardinality :', length(values))
  cat('\nThe Number of PBM :', length(groups), '(Cases which may have syntax probelms)')

  invisible(list(PBM = groups, PBM.Plot = g))
}
gerolt/test2 documentation built on May 25, 2019, 5:25 p.m.