#' Create Amazon Sentiment dataset
#'
#' The goal in this dataset is to predict binary (positive/negative) sentiment from product reviews taken from Amazon.com in four different categories: books, dvds, electronics and kitchen.
#'
#' For 27677 reviews, 1110349 unigram and bigram features are given in a sparse matrix X, along with a label vector y and a vector indicating the domain. Note the data is NOT a \code{\link{data.table}}, but sparse matrix generated by \code{\link[Matrix]{sparseMatrix}}.
#'
#' \emph{Task:} Classification: Use X to predict y, possibly in a domain adaptation setting.
#'
#' @inheritParams createDiabetes
#' @return List containing:
#' \itemize{
##' \item{"X"}{ \code{dgCMatrix}; sparse matrix with count of unigram and bigram features}
##' \item{"y"}{ factor; labels}
##' \item{"domains"}{ factor; domain/category for each review}
##' }
#' @seealso \code{\link{createAmazonSentimentStars}}, \url{http://www.cs.jhu.edu/~mdredze/datasets/sentiment/}
#' @export
createAmazonSentiment<-function(file=getfilepath("amazonsentiment.rds"),write=TRUE,read=TRUE) {
if (!read | !file.exists(file)) {
tmpfile<-tempfile()
download.file("http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz",tmpfile)
tmpdir<-tempdir()
untar(tmpfile,compressed=TRUE,exdir = tmpdir)
files<-c("processed_acl/books/negative.review",
"processed_acl/books/positive.review",
"processed_acl/dvd/negative.review",
"processed_acl/dvd/positive.review",
"processed_acl/electronics/negative.review",
"processed_acl/electronics/positive.review",
"processed_acl/kitchen/negative.review",
"processed_acl/kitchen/positive.review",
"processed_acl/books/unlabeled.review",
"processed_acl/dvd/unlabeled.review",
"processed_acl/electronics/unlabeled.review",
"processed_acl/kitchen/unlabeled.review")
out<-lapply(files, function(filename) { read_domainsentimentfile(file.path(tmpdir,filename)) })
dfs<-list(data.table(words=out[[1]]$words,counts=out[[1]]$counts,instances=out[[1]]$instances))
labels<-out[[1]]$labels
for (i in 2:length(out)) {
dfs[[i]] <- data.table(words=out[[i]]$words,counts=out[[i]]$counts,instances=out[[i]]$instances+max(sapply(dfs,function(x) {max(x$instances)})))
labels<-c(labels,out[[i]]$labels)
}
labels<-factor(labels)
domains <- factor(c(rep("books",2000),
rep("dvd",2000),
rep("electronics",2000),
rep("kitchen",2000),
rep("books",4465),
rep("dvd",3586),
rep("electronics",5681),
rep("kitchen",5945)))
dfall<-rbindlist(dfs)
dfall[,"words":=as.factor("words")]
X<-sparseMatrix(i=as.integer(dfall$instances),j=as.integer(dfall$words),x=dfall$counts,dimnames=list(NULL,levels(dfall$words)),giveCsparse=TRUE)
data<-list(X=X,y=labels,domains=domains)
if (write) {
saveRDS(data, file=file)
}
} else {
data<-readRDS(file)
}
return(data)
}
read_domainsentimentfile<-function(filename) {
# Get number of instances
con <- file(filename, open="r")
N <- length(readLines(con))
close(con)
#Preallocate memory and initialize loop
con <- file(filename, open="r")
words <- rep(NA,400*N)
counts <- rep(NA,400*N)
instances <- rep(NA,400*N)
labels <- rep(NA,N)
i <- 0
start_idx <- 1
while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
i<-i+1
terms <- (strsplit(oneLine, " "))[[1]]
ipairs <- sapply(terms,function(my){strsplit(my, ":")[[1]]})
iwords <- ipairs[1,-length(terms)]
icounts <- as.integer(ipairs[2,-length(terms)])
labels[i] <- ipairs[2,length(terms)]
end_idx <- start_idx+length(iwords)-1
words[start_idx:end_idx] <- iwords
counts[start_idx:end_idx] <- icounts
instances[start_idx:end_idx] <- rep(i,length(iwords))
start_idx <- end_idx+1
}
close(con)
return(list(instances=instances[1:end_idx-1], counts=counts[1:end_idx-1], words=words[1:end_idx-1], labels=labels))
}
# For writing to Matlab
# Note: first do giveCsparse=TRUE
# writeout<-data.table(i=sM@i+1,j=sM@j+1,x=sM@x)
# write.table(writeout,file = "out.txt",sep = "\t",row.names = FALSE,col.names = FALSE)
# write.table(colnames(sM),"names.txt",row.names = FALSE,col.names = FALSE)
# write.table(labels,"labels.txt",row.names = FALSE,col.names = FALSE)
# write.table(domains,"domains.txt",row.names = FALSE,col.names = FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.