#' Create Amazon Sentiment star ratings dataset
#'
#' The goal in this dataset is to predict the number of stars given in a
#' product review taken from Amazon.com in four different categories: books,
#' dvds, electronics and kitchen.
#'
#' For 21669 reviews, 1009925 unigram and bigram features are given in a sparse
#' matrix X, along with a vector y containing the star rating and a vector
#' indicating the domain. Note that the data is NOT a
#' \code{\link{data.table}}, but a sparse matrix generated by
#' \code{\link[Matrix]{sparseMatrix}}.
#'
#' If \code{read} is \code{TRUE} and \code{file} exists, the stored object is
#' read with \code{readRDS} and returned as is. Otherwise the archive is
#' downloaded, parsed and (when \code{write} is \code{TRUE}) saved to
#' \code{file} with \code{saveRDS}.
#'
#' \emph{Task:} Classification: Use X to predict y, possibly in a domain
#' adaptation setting.
#'
#' @inheritParams createDiabetes
#' @return List containing:
#' \describe{
#' \item{X}{ \code{dgCMatrix}; sparse matrix with count of unigram and bigram
#' features; columns are named after the features}
#' \item{y}{ numeric; star rating}
#' \item{domains}{ factor; domain/category for each review}
#' }
#' @seealso \code{\link{createAmazonSentiment}}, \url{http://www.cs.jhu.edu/~mdredze/datasets/sentiment/}
#' @export
createAmazonSentimentStars <- function(file = getfilepath("amazonsentimentstars.rds"),
                                       write = TRUE, read = TRUE) {
  # Serve the cached copy when reading is allowed and the file is present.
  # `||`/`&&` are the scalar operators intended for `if` conditions.
  if (read && file.exists(file)) {
    return(readRDS(file))
  }

  tmpfile <- tempfile()
  # The archive is only needed until it has been unpacked.
  on.exit(unlink(tmpfile), add = TRUE)
  download.file(
    "http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_stars.tar.gz",
    tmpfile,
    mode = "wb"
  )
  tmpdir <- tempdir()
  untar(tmpfile, compressed = TRUE, exdir = tmpdir)

  # Directory names inside the archive; they double as the domain labels.
  domain_names <- c("books", "dvd", "electronics", "kitchen")
  files <- file.path("processed_stars", domain_names, "all_balanced.review")
  out <- lapply(files, function(filename) {
    read_domainsentimentfile(file.path(tmpdir, filename))
  })

  # Instance indices restart in every file, so shift each file's indices past
  # the largest index used so far to obtain unique row numbers.
  dfs <- vector("list", length(out))
  offset <- 0L
  for (i in seq_along(out)) {
    dfs[[i]] <- data.table(
      words = out[[i]]$words,
      counts = out[[i]]$counts,
      instances = out[[i]]$instances + offset
    )
    offset <- max(dfs[[i]]$instances)
  }

  label_list <- lapply(out, function(parsed) parsed$labels)
  labels <- as.numeric(do.call(c, label_list))

  # One domain entry per label, derived from the data that was actually read
  # so that `domains` always has the same length as `y`.
  domains <- factor(
    rep(domain_names, times = vapply(label_list, length, integer(1))),
    levels = domain_names
  )

  dfall <- rbindlist(dfs)
  # Factor the word column itself (not the literal string "words") so that
  # every distinct feature gets its own column index and column name.
  word_factor <- as.factor(dfall$words)
  X <- sparseMatrix(
    i = as.integer(dfall$instances),
    j = as.integer(word_factor),
    x = dfall$counts,
    dimnames = list(NULL, levels(word_factor))
  )

  data <- list(X = X, y = labels, domains = domains)
  if (write) {
    saveRDS(data, file = file)
  }
  data
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.