R/createAmazonSentiment.R

#' Create Amazon Sentiment dataset
#' 
#' The goal in this dataset is to predict binary (positive/negative) sentiment from product reviews taken from Amazon.com in four different categories: books, dvds, electronics and kitchen. 
#' 
#' For 27677 reviews, 1110349 unigram and bigram features are given in a sparse matrix X, along with a label vector y and a vector indicating the domain. Note the data is NOT a \code{\link{data.table}}, but sparse matrix generated by \code{\link[Matrix]{sparseMatrix}}.
#' 
#' \emph{Task:} Classification: Use X to predict y, possibly in a domain adaptation setting.
#' 
#' @inheritParams createDiabetes
#' @return List containing:
#' \itemize{
##'  \item{"X"}{ \code{dgCMatrix}; sparse matrix with count of unigram and bigram features}
##'  \item{"y"}{ factor; labels}
##'  \item{"domains"}{ factor; domain/category for each review}
##' }
#' @seealso \code{\link{createAmazonSentimentStars}}, \url{http://www.cs.jhu.edu/~mdredze/datasets/sentiment/}
#' @export
createAmazonSentiment<-function(file=getfilepath("amazonsentiment.rds"),write=TRUE,read=TRUE) {

  if (!read | !file.exists(file)) {
    tmpfile<-tempfile()
    download.file("http://www.cs.jhu.edu/~mdredze/datasets/sentiment/processed_acl.tar.gz",tmpfile)
    tmpdir<-tempdir()
    untar(tmpfile,compressed=TRUE,exdir = tmpdir)

    files<-c("processed_acl/books/negative.review",
             "processed_acl/books/positive.review",
             "processed_acl/dvd/negative.review",
             "processed_acl/dvd/positive.review",
             "processed_acl/electronics/negative.review",
             "processed_acl/electronics/positive.review",
             "processed_acl/kitchen/negative.review",
             "processed_acl/kitchen/positive.review",
             "processed_acl/books/unlabeled.review",
             "processed_acl/dvd/unlabeled.review",
             "processed_acl/electronics/unlabeled.review",
             "processed_acl/kitchen/unlabeled.review")

    out<-lapply(files, function(filename) { read_domainsentimentfile(file.path(tmpdir,filename)) })
    
    dfs<-list(data.table(words=out[[1]]$words,counts=out[[1]]$counts,instances=out[[1]]$instances))
    labels<-out[[1]]$labels
    for (i in 2:length(out)) {
      dfs[[i]] <- data.table(words=out[[i]]$words,counts=out[[i]]$counts,instances=out[[i]]$instances+max(sapply(dfs,function(x) {max(x$instances)})))
      labels<-c(labels,out[[i]]$labels)
    }
    labels<-factor(labels)

    domains <- factor(c(rep("books",2000),
                 rep("dvd",2000),
                 rep("electronics",2000),
                 rep("kitchen",2000),
                 rep("books",4465),
                 rep("dvd",3586),
                 rep("electronics",5681),
                 rep("kitchen",5945)))

    dfall<-rbindlist(dfs)

    dfall[,"words":=as.factor("words")]
    X<-sparseMatrix(i=as.integer(dfall$instances),j=as.integer(dfall$words),x=dfall$counts,dimnames=list(NULL,levels(dfall$words)),giveCsparse=TRUE)
    
    data<-list(X=X,y=labels,domains=domains)
    if (write) {
      saveRDS(data, file=file)
    }
      } else {
        data<-readRDS(file)
      }

  return(data)
}


read_domainsentimentfile<-function(filename) {
  # Get number of instances
  con <- file(filename, open="r")
  N <- length(readLines(con))
  close(con)
  
  #Preallocate memory and initialize loop
  con <- file(filename, open="r")
  words <- rep(NA,400*N)
  counts <- rep(NA,400*N)
  instances <- rep(NA,400*N)
  labels <- rep(NA,N)
  i <- 0
  start_idx <- 1
  
  
  while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
    i<-i+1
    terms <- (strsplit(oneLine, " "))[[1]]
    ipairs <- sapply(terms,function(my){strsplit(my, ":")[[1]]})
    iwords <- ipairs[1,-length(terms)]
    icounts <- as.integer(ipairs[2,-length(terms)])
    labels[i] <- ipairs[2,length(terms)]
    
    end_idx <- start_idx+length(iwords)-1
    words[start_idx:end_idx] <- iwords
    counts[start_idx:end_idx] <- icounts
    instances[start_idx:end_idx] <- rep(i,length(iwords))
    start_idx <- end_idx+1
  } 
  close(con)
  return(list(instances=instances[1:end_idx-1], counts=counts[1:end_idx-1], words=words[1:end_idx-1], labels=labels))
}


# For writing to Matlab
# Note: first do giveCsparse=TRUE
# writeout<-data.table(i=sM@i+1,j=sM@j+1,x=sM@x)
# write.table(writeout,file = "out.txt",sep = "\t",row.names = FALSE,col.names = FALSE)
# write.table(colnames(sM),"names.txt",row.names = FALSE,col.names = FALSE)
# write.table(labels,"labels.txt",row.names = FALSE,col.names = FALSE)
# write.table(domains,"domains.txt",row.names = FALSE,col.names = FALSE)
jkrijthe/createdatasets documentation built on May 19, 2019, 12:44 p.m.