# R/CleanText.R

#' Clean review text and build a document-term matrix
#'
#' Lower-cases the text, removes numbers, punctuation and stop words, stems
#' the remaining words, collapses extra whitespace, and converts the cleaned
#' corpus into a document-term matrix using bag of words, TF-IDF, or bigrams.
#'
#' @param source_dataset A data frame whose first column holds the review
#'   text and whose second column holds the binary label (0/1). If omitted,
#'   the `Restaurant_Reviews.tsv` file shipped with SentiAnalyzer is loaded
#'   and a warning is issued.
#' @param dtm_method 1 for bag of words, 2 for TF-IDF, 3 for bigram. If
#'   omitted, 1 is used and a warning is issued.
#' @param reductionrate Threshold passed to `removeSparseTerms()` that
#'   controls how much of the term matrix is kept; usually 0.999 and not less
#'   than 0.99. If omitted or not numeric, 0.99 is used and a warning is
#'   issued.
#' @return A data frame with one column per retained term plus a factor
#'   column `target` (levels 0 and 1) holding the label. Labels other than
#'   0 or 1 become `NA`.
#' @author Zahra Khoshmanesh
#' @import tm
#' @import checkmate
#' @importFrom NLP ngrams words
#' @importFrom methods hasArg
#' @importFrom utils read.delim
#' @export
#' @examples
#' \dontrun{
#' library("SentiAnalyzer")
#' direction <- system.file(package = "SentiAnalyzer", "extdata/Restaurant_Reviews.tsv")
#' original_dataset <- read.delim(direction, quote = "", stringsAsFactors = FALSE)
#' CleanText(original_dataset, dtm_method = 1, reductionrate = 0.99)
#' CleanText(original_dataset, dtm_method = 2, reductionrate = 0.99)
#' CleanText(original_dataset, dtm_method = 3, reductionrate = 0.999)}

CleanText <- function(source_dataset, dtm_method, reductionrate) {

  # Each argument gets its own default check; they are independent, so a
  # missing dataset must not prevent the other defaults from being applied.
  if (!hasArg(source_dataset)) {
    default_path <- system.file(package = "SentiAnalyzer", "extdata/Restaurant_Reviews.tsv")
    if (!nzchar(default_path)) {
      stop("source_dataset is missing and the default SentiAnalyzer dataset could not be found",
           call. = FALSE)
    }
    # Load the data itself: the rest of the function indexes columns, so a
    # bare path string would not work here.
    source_dataset <- utils::read.delim(default_path, quote = "", stringsAsFactors = FALSE)
    warning('file path does not provided by user, set to default file path')
  }
  if (!is.data.frame(source_dataset) || ncol(source_dataset) < 2) {
    stop("source_dataset must be a data frame with a text column and a label column",
         call. = FALSE)
  }

  if (!hasArg(dtm_method)) {
    dtm_method <- 1
    warning('dtm_method does not exist, set to default method, bag of word with simple count')
  }
  # Normalise to a string key so numeric 1 and character "1" dispatch the same
  # way, and fail fast instead of letting switch() silently return NULL.
  method_key <- as.character(dtm_method)
  if (length(method_key) != 1L || is.na(method_key) ||
      !method_key %in% c("1", "2", "3")) {
    stop("dtm_method must be 1 (bag of words), 2 (TF-IDF) or 3 (bigram)", call. = FALSE)
  }

  # Test for a missing argument first: touching a missing argument is an error.
  if (!hasArg(reductionrate)) {
    reductionrate <- 0.99
    warning("reductionrate is not provided, set to default 0.99 value")
  } else if (!is.numeric(reductionrate)) {
    reductionrate <- 0.99
    warning('reductionrate is not numeric,set to default 0.99 value')
  }
  # check whether the reductionrate is a number between 0.99 and 1.
  checkmate::assertNumber(reductionrate, lower = 0.99, upper = 1)

  # Build and clean the corpus step by step (no pipe operator required).
  corpus <- VCorpus(VectorSource(source_dataset[[1]]))
  corpus <- tm_map(corpus, content_transformer(tolower)) # lower-case all reviews
  corpus <- tm_map(corpus, removeNumbers)                # remove numbers
  corpus <- tm_map(corpus, removePunctuation)            # remove punctuation
  corpus <- tm_map(corpus, removeWords, stopwords())     # remove stop words
  corpus <- tm_map(corpus, stemDocument)                 # stemming
  corpus <- tm_map(corpus, stripWhitespace)              # collapse gaps left by the removals

  # Tokenizer that pastes each pair of adjacent words into one bigram term.
  BigramTokenizer <- function(x) {
    unlist(lapply(NLP::ngrams(NLP::words(x), 2), paste, collapse = " "),
           use.names = FALSE)
  }

  # Create the document-term matrix with the requested weighting/tokenization.
  dtm <- switch(method_key,
    "1" = DocumentTermMatrix(corpus),
    "2" = DocumentTermMatrix(
      corpus,
      control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE))
    ),
    "3" = t(TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer)))
  )

  # Reduce the dimension of the sparse matrix.
  dtm <- removeSparseTerms(dtm, reductionrate)

  # Convert the matrix of independent variables to a data frame.
  clean_dataset <- as.data.frame(as.matrix(dtm))
  # Encode the target feature as a factor; the label is the second column.
  # NOTE(review): a retained term literally named "target" would be
  # overwritten by this assignment - confirm whether that can matter.
  clean_dataset$target <- factor(source_dataset[[2]], levels = c(0, 1))

  # Check outputs; assert* raises on failure, whereas check*/test* only return
  # a value that would be discarded here.
  checkmate::assertFactor(clean_dataset$target)
  checkmate::assertDataFrame(clean_dataset)

  clean_dataset
}
# zahrakhoshmanesh/FinalProjectSTAT585 documentation built on June 4, 2019, 1:57 p.m.