# R/createMNIST.R

#' Create MNIST digit recognition dataset
#'
#' Downloads the four MNIST files (training and test images and labels),
#' combines them into a single data frame and optionally caches the result
#' on disk as an RDS file. Code heavily inspired by
#' https://gist.github.com/brendano/39760
#'
#' The training rows come first, followed by the test rows
#' (first 60000 is train set, last 10000 is test set).
#'
#' The class label is stored in the column \code{Class}, i.e.
#' \code{formula(Class~.)}
#'
#' @return A data.frame with one row per image: one integer column per pixel
#'   plus a factor column \code{Class} holding the label. When a cached file
#'   is read instead, the object stored in that file is returned as is.
#' @examples
#' #as.matrix(data[1040,-785])  %>% matrix(28,28)  %>%  Matrix    %>% t%>% image(asp=1)
#' @inheritParams createDiabetes
#' @export
createMNIST <- function(file = getfilepath("MNIST.rds"),
                        write = TRUE,
                        read = TRUE) {
  # Use the cached copy unless the caller forces a rebuild or no cache exists.
  if (read && file.exists(file)) {
    return(readRDS(file))
  }

  base_url <- "http://yann.lecun.com/exdb/mnist/"

  # Open one gzipped remote file, parse it with `reader`, and always close the
  # connection again, even when the download or the parsing fails.
  fetch <- function(name, reader) {
    con <- gzcon(url(paste0(base_url, name), "rb"))
    on.exit(close(con), add = TRUE)
    reader(con)
  }

  # Training data first, test data second, for both pixels and labels, so the
  # rows of the two pieces line up.
  data <- rbind(
    fetch("train-images-idx3-ubyte.gz", read_MNIST_bin_images),
    fetch("t10k-images-idx3-ubyte.gz", read_MNIST_bin_images)
  )
  y <- c(
    fetch("train-labels-idx1-ubyte.gz", read_MNIST_bin_labels),
    fetch("t10k-labels-idx1-ubyte.gz", read_MNIST_bin_labels)
  )

  data <- data.frame(data, Class = factor(y))

  if (write) {
    saveRDS(data, file = file)
  }

  data
}

# Read an IDX image file (the format of the MNIST image files) from an open
# binary connection and return the pixels as an integer matrix.
#
# The stream starts with four big-endian 32-bit integers: a magic number, the
# number of images, and the number of rows and columns of each image. The
# pixel values follow as unsigned bytes, one image after the other.
#
# con: an open binary connection positioned at the start of the file. It is
#   not closed here; that stays the caller's job.
# Returns an integer matrix with one row per image and rows * columns
# columns, holding values in 0..255.
# Stops with an error when the header is incomplete, the magic number is not
# 2051, or fewer pixel bytes are available than the header announces.
read_MNIST_bin_images <- function(con) {
  header <- readBin(con, "integer", n = 4L, size = 4L, endian = "big")

  # 2051 (0x00000803) is the IDX magic number for unsigned bytes in three
  # dimensions, which is the only layout the code below can decode.
  if (length(header) != 4L || !identical(header[1L], 2051L)) {
    stop("Not an IDX image file: bad or truncated header", call. = FALSE)
  }

  n_images <- header[2L]
  # Work in doubles so an absurd header cannot overflow integer arithmetic.
  n_pixels <- as.numeric(header[3L]) * header[4L]
  n_values <- n_images * n_pixels

  x <- readBin(con, "integer", n = n_values, size = 1L, signed = FALSE)
  if (length(x) != n_values) {
    stop("Truncated IDX image file: expected ", n_values,
         " pixel values, got ", length(x), call. = FALSE)
  }

  matrix(x, nrow = n_images, ncol = n_pixels, byrow = TRUE)
}

# Read an IDX label file (the format of the MNIST label files) from an open
# binary connection and return the labels as an integer vector.
#
# The stream starts with two big-endian 32-bit integers: a magic number and
# the number of labels. The labels follow as one unsigned byte each.
#
# con: an open binary connection positioned at the start of the file. It is
#   not closed here; that stays the caller's job.
# Returns an integer vector with one element per label, values in 0..255.
# Stops with an error when the header is incomplete, the magic number is not
# 2049, or fewer label bytes are available than the header announces.
read_MNIST_bin_labels <- function(con) {
  header <- readBin(con, "integer", n = 2L, size = 4L, endian = "big")

  # 2049 (0x00000801) is the IDX magic number for unsigned bytes in one
  # dimension, which is the only layout the code below can decode.
  if (length(header) != 2L || !identical(header[1L], 2049L)) {
    stop("Not an IDX label file: bad or truncated header", call. = FALSE)
  }

  n_labels <- header[2L]
  y <- readBin(con, "integer", n = n_labels, size = 1L, signed = FALSE)
  if (length(y) != n_labels) {
    stop("Truncated IDX label file: expected ", n_labels,
         " labels, got ", length(y), call. = FALSE)
  }

  y
}
# jkrijthe/createdatasets documentation built on May 19, 2019, 12:44 p.m.