Example data set

In the following examples, we'll use a data set that has ten million rows and ten columns. The associated data frame is about 800 MB (ten million rows × ten columns × 8 bytes per number). To make things interesting, the data frame will include a mixture of numbers and missing values.

# Build the example data frame `dd`: `n_rows` rows by ten columns of uniform
# random numbers, with 10% of the values in each column set to NA.
#
# The timings quoted in the sections below refer to the full-size data set
# (n_rows = 1e7, about 800 MB). The small value used here keeps a trial run
# of this script fast; set n_rows to 1e7 to rebuild the full-size data set.
n_rows <- 1e3
x <- runif(n_rows)
x[sample(n_rows, n_rows / 10)] <- NA # 10% NAs
# replicate() evaluates `x` ten times, so the ten columns are identical copies
dd <- as.data.frame(replicate(10, x))

Timings

Base R: read.csv/write.csv

# Baseline: base R's CSV writer and reader, timed on the `dd` data frame.
# write.csv() is left at its default row.names = TRUE, so the file also
# holds a leading column of row names.
system.time(
  write.csv(dd, file = "dd.csv")
)
#   user  system elapsed
#101.012   1.096 104.153
system.time(
  read.csv(file = "dd.csv")
)
#   user  system elapsed
#151.568   0.728 152.134

Base R: readRDS/saveRDS

# Base R's native serialisation: saveRDS() writes a single R object to a
# file and readRDS() reads it back.
# The file uses the .rds extension; .RData conventionally marks files made
# by save(), which are opened with load() rather than readRDS().
system.time(saveRDS(dd, file = "dd.rds"))
#   user  system elapsed
# 101.77    0.28  102.69
system.time(readRDS(file = "dd.rds"))
#   user  system elapsed
#  5.052   0.104   5.148

The data.table package

# data.table's CSV writer (fwrite) and reader (fread), timed on `dd`.
library(data.table)
system.time(
  fwrite(dd, file = "dd.csv")
)
#   user  system elapsed
# 28.776   1.600   4.128
# NOTE(review): user time is well above elapsed time here -- presumably the
# write ran on several threads; confirm against the data.table documentation.
system.time(
  fread("dd.csv")
)
#   user  system elapsed
#  21.93    0.18   22.07

The readr package

# readr's CSV writer (write_csv) and reader (read_csv), timed on `dd`.
library(readr)
system.time(
  write_csv(dd, "dd.csv")
)
#   user  system elapsed
# 189.44    1.54  190.77
system.time(
  read_csv(file = "dd.csv")
)
#   user  system elapsed
# 18.476   0.252  18.588

The feather package

# The feather file format: write `dd` to "dd.feather" and read it back,
# timing each step.
library(feather)
system.time(
  write_feather(dd, path = "dd.feather")
)
#   user  system elapsed
#  0.620   0.540   1.319
system.time(
  read_feather(path = "dd.feather")
)
#   user  system elapsed
#  0.356   0.068   0.422

Personally, I think that feather looks very promising, but it is still very new.

Overview

# Elapsed times in seconds, copied from the system.time() output recorded
# in the sections above. All three vectors share the same order:
# base csv, RDS, data.table, readr, feather.
writes <- c(104, 102, 4.1, 190, 1.319)
reads <- c(152, 5.1, 22.07, 18.58, 0.422)
# The labels name the storage approach rather than one function, because
# the same labels are drawn on both the write chart and the read chart.
n <- c("base csv", "RDS", "data.table", "readr", "feather")

# Draw one bar chart of timings, one bar per element of `times`.
#
# times:  numeric vector of elapsed seconds.
# labels: character vector of bar labels, same length as `times`.
# main:   chart title, drawn right-aligned above the plot.
# ylim:   y-axis limits; white guide lines are drawn every 20 units
#         across this range.
#
# Returns the bar midpoints computed by barplot(), invisibly.
plot_timings <- function(times, labels, main, ylim = c(0, 200)) {
  stopifnot(length(times) == length(labels))
  # par() settings persist on the device, so put the old ones back on exit
  old_par <- par(mar = c(3, 3, 2, 1), mgp = c(2, 0.4, 0), tck = -.01,
                 cex.axis = 0.8, las = 1, xaxs = "i", yaxs = "i")
  on.exit(par(old_par), add = TRUE)
  mids <- barplot(times, names.arg = labels, col = "steelblue",
                  border = NA, space = 0.05, ylim = ylim,
                  ylab = "Time(s)")
  # White lines only show where they cross a bar, giving a segmented look
  abline(h = seq(ylim[1], ylim[2], by = 20), col = "white")
  title(main, adj = 1,
        cex.main = 1, font.main = 2, col.main = "black")
  invisible(mids)
}

plot_timings(writes, n, "Write times")
plot_timings(reads, n, "Read times")


jr-packages/jrBig documentation built on Jan. 1, 2020, 2:02 p.m.