# R/createAmazonSentimentStars.R

#' Create Amazon Sentiment star ratings dataset
#'
#' The goal in this dataset is to predict the number of stars given in a
#' product review taken from Amazon.com in four different categories: books,
#' dvds, electronics and kitchen.
#'
#' For 21669 reviews, 1009925 unigram and bigram features are given in a
#' sparse matrix X, along with a vector y containing the star rating and a
#' vector indicating the domain. Note the data is NOT a
#' \code{\link{data.table}}, but a sparse matrix generated by
#' \code{\link[Matrix]{sparseMatrix}}.
#'
#' When \code{read} is \code{TRUE} and \code{file} exists, the cached object
#' is loaded with \code{\link{readRDS}} and returned as is. Otherwise the
#' archive is downloaded and parsed, and the result is stored in \code{file}
#' when \code{write} is \code{TRUE}.
#'
#' \emph{Task:} Classification: Use X to predict y, possibly in a domain
#' adaptation setting.
#'
#' @inheritParams createDiabetes
#' @return List containing:
#' \describe{
#'   \item{X}{\code{dgCMatrix}; sparse matrix with count of unigram and
#'     bigram features}
#'   \item{y}{numeric; star rating}
#'   \item{domains}{factor; domain/category for each review}
#' }
#' @seealso \code{\link{createAmazonSentiment}}, \url{http://www.cs.jhu.edu/~mdredze/datasets/sentiment/}
#' @export
createAmazonSentimentStars <- function(file = getfilepath("amazonsentimentstars.rds"),
                                       write = TRUE,
                                       read = TRUE) {
  # Fast path: reuse the cached object when reading is allowed and it exists.
  if (read && file.exists(file)) {
    return(readRDS(file))
  }

  # Work in a private temporary location and remove it on exit, so that the
  # archive and the extracted files do not pile up in the session temp dir.
  archive <- tempfile(fileext = ".tar.gz")
  extract_dir <- tempfile("processed_stars_")
  dir.create(extract_dir)
  on.exit(unlink(c(archive, extract_dir), recursive = TRUE), add = TRUE)

  # mode = "wb": the archive is binary and must not be written in text mode.
  status <- download.file(
    "http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_stars.tar.gz",
    archive,
    mode = "wb"
  )
  if (status != 0) {
    stop("Download of processed_stars.tar.gz failed with status ", status,
         call. = FALSE)
  }
  untar(archive, compressed = TRUE, exdir = extract_dir)

  domain_names <- c("books", "dvd", "electronics", "kitchen")
  review_files <- file.path(extract_dir, "processed_stars", domain_names,
                            "all_balanced.review")
  parsed <- lapply(review_files, read_domainsentimentfile)

  # Stack the per-domain triplets, shifting the instance (row) indices of each
  # domain past the largest row index used so far.
  row_offset <- 0L
  tables <- vector("list", length(parsed))
  for (k in seq_along(parsed)) {
    tables[[k]] <- data.table(
      words = parsed[[k]]$words,
      counts = parsed[[k]]$counts,
      instances = parsed[[k]]$instances + row_offset
    )
    row_offset <- max(row_offset, tables[[k]]$instances)
  }
  labels <- as.numeric(do.call(c, lapply(parsed, function(p) p$labels)))

  # Number of reviews per domain, in the same order as `domain_names`.
  domains <- factor(rep(domain_names, times = c(5501, 5118, 5901, 5149)))

  features <- rbindlist(tables)

  # Factor the actual `words` column (not the string literal "words") so that
  # every distinct unigram/bigram gets its own column index and column name.
  word_factor <- as.factor(features$words)
  X <- sparseMatrix(
    i = as.integer(features$instances),
    j = as.integer(word_factor),
    x = features$counts,
    dimnames = list(NULL, levels(word_factor))
  )

  data <- list(X = X, y = labels, domains = domains)
  if (write) {
    saveRDS(data, file = file)
  }

  data
}
# jkrijthe/createdatasets documentation built on May 19, 2019, 12:44 p.m.