Efficiency Relative to Storage and Time
In git2rdata: Store and Retrieve Data.frames in a Git Repository

library(knitr)
opts_chunk$set(
  fig.height = 4, fig.width = 6,
  collapse = TRUE,
  comment = "#>"
)
library(ggplot2)
inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517",
                  "#843860", "#C04384", "#C2C444", "#685457")
theme_inbo <- function(base_size = 12, base_family = "") {
  rect_bg <- "white"
  legend_bg <- "white"
  panel_bg <- "#F3F3F3"
  panel_grid <- "white"
  plot_bg <- "white"
  half_line <- base_size / 2
  ggplot2::theme(
    line = ggplot2::element_line(colour = "black", size = 0.5, linetype = 1,
                        lineend = "butt"),
    rect = ggplot2::element_rect(fill = rect_bg, colour = "black", size = 0.5,
                        linetype = 1),
    text = ggplot2::element_text(family = base_family, face = "plain",
                        colour = "#843860", size = base_size, hjust = 0.5,
                        vjust = 0.5, angle = 0, lineheight = 0.9,
                        margin = ggplot2::margin(), debug = FALSE),
    axis.line = ggplot2::element_blank(),
    axis.line.x = ggplot2::element_blank(),
    axis.line.y = ggplot2::element_blank(),
    axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)),
    axis.text.x = ggplot2::element_text(
      margin = ggplot2::margin(t = 0.8 * half_line / 2), vjust = 1
    ),
    axis.text.x.top = NULL,
    axis.text.y = ggplot2::element_text(
      margin = ggplot2::margin(r = 0.8 * half_line / 2), hjust = 1
    ),
    axis.text.y.right = NULL,
    axis.ticks = ggplot2::element_line(),
    axis.ticks.length = ggplot2::unit(0.15, "cm"),
    axis.title = ggplot2::element_text(colour = "black"),
    axis.title.x = ggplot2::element_text(
      margin = ggplot2::margin(t = 0.8 * half_line, b = 0.8 * half_line / 2)
    ),
    axis.title.x.top = NULL,
    axis.title.y = ggplot2::element_text(
      margin = ggplot2::margin(r = 0.8 * half_line, l = 0.8 * half_line / 2),
      angle = 90
    ),
    axis.title.y.right = NULL,
    legend.background = ggplot2::element_rect(colour = NA, fill = legend_bg),
    legend.key = ggplot2::element_rect(fill = panel_bg, colour = NA),
    legend.key.size = ggplot2::unit(1.2, "lines"),
    legend.key.height = NULL,
    legend.key.width = NULL,
    legend.margin = NULL,
    legend.spacing = ggplot2::unit(0.2, "cm"),
    legend.spacing.x = NULL,
    legend.spacing.y = NULL,
    legend.text = ggplot2::element_text(size = ggplot2::rel(0.8)),
    legend.text.align = NULL,
    legend.title = ggplot2::element_text(
      size = ggplot2::rel(0.8), face = "bold", hjust = 0, colour = "black"
    ),
    legend.title.align = NULL,
    legend.position = "right",
    legend.direction = NULL,
    legend.justification = "center",
    legend.box = NULL,
    legend.box.margin = ggplot2::margin(
      t = half_line, r = half_line, b = half_line, l = half_line
    ),
    legend.box.background = ggplot2::element_rect(
      colour = NA, fill = legend_bg
    ),
    legend.box.spacing = ggplot2::unit(0.2, "cm"),
    panel.background = ggplot2::element_rect(fill = panel_bg, colour = NA),
    panel.border = ggplot2::element_blank(),
    panel.grid = ggplot2::element_line(colour = panel_grid),
    panel.grid.minor = ggplot2::element_line(colour = panel_grid, size = 0.25),
    panel.spacing = ggplot2::unit(half_line, "pt"),
    panel.spacing.x = NULL,
    panel.spacing.y = NULL,
    panel.ontop = FALSE,
    strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA),
    strip.text = ggplot2::element_text(
      size = ggplot2::rel(0.8), colour = "#F3F3F3"
    ),
    strip.text.x = ggplot2::element_text(
      margin = ggplot2::margin(t = half_line, b = half_line)
    ),
    strip.text.y = ggplot2::element_text(
      margin = ggplot2::margin(r = half_line, l = half_line), angle = -90
    ),
    strip.switch.pad.grid = ggplot2::unit(0.1, "cm"),
    strip.switch.pad.wrap = ggplot2::unit(0.1, "cm"),
    strip.placement = "outside",
    plot.background = ggplot2::element_rect(colour = NA, fill = plot_bg),
    plot.title = ggplot2::element_text(
      size = ggplot2::rel(1.2), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.subtitle = ggplot2::element_text(
      size = ggplot2::rel(1), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.caption = ggplot2::element_text(
      size = ggplot2::rel(0.6), margin = ggplot2::margin(0, 0, half_line, 0)
    ),
    plot.margin = ggplot2::margin(
      t = half_line, r = half_line, b = half_line, l = half_line
    ),
    plot.tag = ggplot2::element_text(
      size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5
    ),
    plot.tag.position = "topleft",
    complete = TRUE
  )
}
theme_set(theme_inbo())
update_geom_defaults("line", list(colour = "#356196"))
update_geom_defaults("hline", list(colour = "#356196"))
update_geom_defaults("boxplot", list(colour = "#356196"))

Introduction

This vignette compares storage and retrieval of data by git2rdata with other standard R functionality. We consider write.table() and read.table() for data stored in a plain text format. saveRDS() and readRDS() use a compressed binary format.

To get some meaningful results, we will use the nassCDS dataset from the DAAG package. We'll avoid the dependency on the package by directly downloading the data.

airbag <- read.csv(
  "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/nassCDS.csv"
)
airbag$dead <- airbag$dead == "dead"
airbag$airbag <- airbag$airbag == "airbag"
airbag$seatbelt <- airbag$seatbelt == "belted"
airbag$dvcat <- as.ordered(airbag$dvcat)

if (system.file("efficiency", "airbag.rds", package = "git2rdata") == "") {
  saveRDS(airbag, file.path("..", "inst", "efficiency", "airbag.rds"))
} else {
  airbag <- readRDS(
    system.file("efficiency", "airbag.rds", package = "git2rdata")
  )
}

str(airbag)

Data Storage

On a File System

We start by writing the dataset as is with write.table(), saveRDS(), write_vc() and write_vc() without storage optimization. Note that write_vc() uses optimization by default. Since write_vc() creates two files for each data set, we take their combined file size into account.

library(git2rdata)
root <- tempfile("git2rdata-efficient")
dir.create(root)

write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t")
base_size <- file.size(file.path(root, "base_R.tsv"))

saveRDS(airbag, file.path(root, "base_R.rds"))
rds_size <- file.size(file.path(root, "base_R.rds"))

fn <- write_vc(airbag, "airbag_optimize", root, sorting = "X")
optim_size <- sum(file.size(file.path(root, fn)))

fn <- write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE)
verbose_size <- sum(file.size(file.path(root, fn)))

Since the data is highly compressible, saveRDS() yields the smallest file at the cost of having a binary file format. Both write_vc() formats yield smaller files than write.table(). Partly because write_vc() doesn't store row names and doesn't use quotes unless needed. The difference between the optimized and verbose version of write_vc() is, in this case, solely due to the way write_vc() stores factors in the data (tsv) file. The optimized version stores the indices of the factor whereas the verbose version stores the levels. For example: airbag$dvcat has 5 levels with short labels (on average 5 character), storing the index requires 1 character. This results in more compact files.

kable(
  data.frame(
    method = c("saveRDS()", "write_vc(), optimized", "write_vc(), verbose",
               "write.table()"),
    file_size = c(rds_size, optim_size, verbose_size, base_size) / 2 ^ 10,
    relative = c(rds_size, optim_size, verbose_size, base_size) / base_size
  ),
  caption = "Resulting file sizes (in kB) and file sizes relative to the size of
  write.table().",
  digits = 2
)

The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. The figure below illustrates the strong gain as soon as the level labels contain more than two characters. The gain is less pronounced when the factor has a large number of levels. The optimization fails in extreme cases with short factor labels and a high number of levels.

ratio <- function(label_length = 1:20, n_levels = 9, n_obs = 1000) {
  meta_length <- 63 + (5 + label_length + floor(log10(n_levels))) * n_levels
  optimized <- n_obs * mean(ceiling(log10(seq_len(n_levels) + 1)))
  verbose <- n_obs * label_length
  ifelse(
    62 ^ label_length >= n_levels,
    (optimized + meta_length) / (verbose + meta_length),
    NA
  )
}
lengths <- 1:50
f_ratio <- rbind(
  data.frame(
    label_length = lengths,
    levels = 1000,
    ratio = ratio(lengths, 1000, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 100,
    ratio = ratio(lengths, 100, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 10,
    ratio = ratio(lengths, 10, 1000)
  ),
  data.frame(
    label_length = lengths,
    levels = 3,
    ratio = ratio(lengths, 3, 1000)
  )
)
f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3))
ggplot(f_ratio, aes(x = label_length, y = ratio, colour = levels)) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  scale_x_continuous("label length (characters)") +
  scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "),
                     breaks = seq(0, 1.25, by = 0.25)) +
  scale_colour_manual("number of \nlevels", values = inbo_colours)

The effect of the number of observations is mainly due to the overhead of storing the metadata. The importance of this overhead increases when the number of observations is small.

n_obs <- 10 ^ seq(log10(1), log10(10000), length = 41)
f_ratio <- rbind(
  data.frame(
    observations = n_obs,
    levels = 3,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 3)
  ),
  data.frame(
    observations = n_obs,
    levels = 10,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 10)
  ),
  data.frame(
    observations = n_obs,
    levels = 100,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 100)
  ),
  data.frame(
    observations = n_obs,
    levels = 1000,
    ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 1000)
  )
)
f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3))
ggplot(f_ratio, aes(x = observations, y = ratio, colour = levels)) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  scale_x_log10() +
  scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "),
                     breaks = seq(0, 1.25, by = 0.25)) +
  scale_colour_manual("number of \nlevels", values = inbo_colours)

In Git Repositories

Here we will simulate how much space the data requires to store the history in a git repository. We will create a git repository for each method and store different subsets of the same data. Each commit contains a new version of the data. Each version is a random sample containing 90% of the observations of the airbag data. Two consecutive versions of the subset will have about 90% of the observations in common.

After writing each version, we commit the file, perform garbage collection (git gc) on the git repository and then calculate the size of the git history (git count-objects -v).

library(git2r)
tmp_repo <- function() {
  root <- tempfile("git2rdata-efficient-git")
  dir.create(root)
  repo <- git2r::init(root)
  git2r::config(repo, user.name = "me", user.email = "me@me.com")
  return(repo)
}
commit_and_size <- function(repo, filename) {
  add(repo, filename)
  commit(repo, "test", session = TRUE)
  git_size <- system(
    sprintf("cd %s\ngit gc\ngit count-objects -v", dirname(repo$path)),
    intern = TRUE
  )
  git_size <- git_size[grep("size-pack", git_size)]
  as.integer(gsub(".*: (.*)", "\\1", git_size))
}

repo_wt <- tmp_repo()
repo_wts <- tmp_repo()
repo_rds <- tmp_repo()
repo_wvco <- tmp_repo()
repo_wvcv <- tmp_repo()

repo_size <- replicate(
  100, {
    observed_subset <- rbinom(nrow(airbag), size = 1, prob = 0.9) == 1
    this <- airbag[
      sample(which(observed_subset)),
      sample(ncol(airbag))
    ]
    this_sorted <- airbag[observed_subset, ]
    fn_wt <- file.path(workdir(repo_wt), "base_R.tsv")
    write.table(this, fn_wt, sep = "\t")
    fn_wts <- file.path(workdir(repo_wts), "base_R.tsv")
    write.table(this_sorted, fn_wts, sep = "\t")
    fn_rds <- file.path(workdir(repo_rds), "base_R.rds")
    saveRDS(this, fn_rds)
    fn_wvco <- write_vc(this, "airbag_optimize", repo_wvco, sorting = "X")
    fn_wvcv <- write_vc(
      this, "airbag_verbose", repo_wvcv, sorting = "X", optimize = FALSE
    )
    c(
      write.table = commit_and_size(repo_wt, fn_wt),
      write.table.sorted = commit_and_size(repo_wts, fn_wts),
      saveRDS = commit_and_size(repo_rds, fn_rds),
      write_vc.optimized = commit_and_size(repo_wvco, fn_wvco),
      write_vc.verbose = commit_and_size(repo_wvcv, fn_wvcv)
    )
})

if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") {
  saveRDS(repo_size, file.path("..", "inst", "efficiency", "git_size.rds"))
} else {
  repo_size <- readRDS(
    system.file("efficiency", "git_size.rds", package = "git2rdata")
  )
}

Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest possible diff. We also test write.table() with a stable ordering of the observations and variables.

The randomised write.table() yields the largest git repository, converging to about r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100]) times the size of a git repository based on the sorted write.table(). saveRDS() yields a r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100]) reduction in repository size compared to the randomised write.table(), but still is r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100]) times larger than the sorted write.table(). Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because git compresses its history. The optimized write_vc() starts at r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1]) and converges toward r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100]), the verbose version starts at r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1]) and converges towards r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100]). Storage size is a lot smaller when using write_vc() with optimization. The verbose option of write_vc() has little the gain in storage size. Another advantage is that write_vc() stores metadata.

rs <- lapply(
  rownames(repo_size),
  function(x) {
    if (x == "saveRDS") {
      fun <- "saveRDS"
      optimized <- "yes"
    } else if (x == "write_vc.optimized") {
      fun <- "write_vc"
      optimized <- "yes"
    } else if (x == "write_vc.verbose") {
      fun <- "write_vc"
      optimized <- "no"
    } else if (x == "write.table") {
      fun <- "write.table"
      optimized <- "no"
    } else if (x == "write.table.sorted") {
      fun <- "write.table"
      optimized <- "yes"
    }
    data.frame(commit = seq_along(repo_size[x, ]), size = repo_size[x, ],
               rel_size = repo_size[x, ] / repo_size["write.table.sorted", ],
               fun = fun, optimized = optimized, stringsAsFactors = FALSE)
  }
)
rs <- do.call(rbind, rs)
rs$optimized <- factor(rs$optimized, levels = c("yes", "no"))
ggplot(
  rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized)
) +
  geom_line() +
  scale_y_continuous("repo size (in MiB)") +
  scale_colour_manual("function", values = inbo_colours)

ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) +
  geom_line() +
  scale_y_continuous("size relative to sorted write.table()", breaks = 0:10) +
  scale_colour_manual("function", values = inbo_colours)

Timings

The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hundred times and yields a distribution of timings for each expression.

Writing Data

library(microbenchmark)
mb <- microbenchmark(
  write.table = write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t"),
  saveRDS = saveRDS(airbag, file.path(root, "base_R.rds")),
  write_vc.optim = write_vc(airbag, "airbag_optimize", root, sorting = "X"),
  write_vc.verbose = write_vc(airbag, "airbag_verbose", root, sorting = "X",
                              optimize = FALSE)
)
mb$time <- mb$time / 1e6

if (
  system.file("efficiency", "file_timings.rds", package = "git2rdata") == ""
) {
  saveRDS(mb, file.path("..", "inst", "efficiency", "file_timings.rds"))
} else {
  mb <- readRDS(
    system.file("efficiency", "file_timings.rds", package = "git2rdata")
  )
}

median_time <- aggregate(time ~ expr, data = mb, FUN = median)
write_ratio <- 100 * median_time$time /
  median_time$time[median_time$expr == "write.table"]
names(write_ratio) <- median_time$expr

write_vc() takes r paste(sprintf("%.0f%%", -100 + write_ratio[grep("write_vc", names(write_ratio))]), collapse = " to ") more time than write.table() because it needs to prepare the metadata and sort the observations and variables. When overwriting existing data, write_vc() checks the new data against the existing metadata. saveRDS() requires r sprintf("%.0f%%", write_ratio["saveRDS"]) of the time that write.table() needs.

mb$expr <- reorder(mb$expr, mb$time, FUN = median)
levels(mb$expr) <- gsub("write_vc\\.", "write_vc\n", levels(mb$expr))
ggplot(mb, aes(x = expr, y = time)) +
  geom_boxplot() +
  scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) +
  theme(axis.title.x = ggplot2::element_blank())

Reading Data

mb <- microbenchmark(
  read.table = read.table(file.path(root, "base_R.tsv"), header = TRUE,
                          sep = "\t"),
  readRDS = readRDS(file.path(root, "base_R.rds")),
  read_vc.optim = read_vc("airbag_optimize", root),
  read_vc.verbose = read_vc("airbag_verbose", root)
)
mb$time <- mb$time / 1e6

if (
  system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""
) {
  saveRDS(mb, file.path("..", "inst", "efficiency", "read_timings.rds"))
} else {
  mb <- readRDS(
    system.file("efficiency", "read_timings.rds", package = "git2rdata")
  )
}

median_time <- aggregate(time ~ expr, data = mb, FUN = median)
read_ratio <- 100 * median_time$time /
  median_time$time[median_time$expr == "read.table"]
names(read_ratio) <- median_time$expr

The timings on reading the data is another story. Reading the binary format takes about r sprintf("%.0f%%", read_ratio["readRDS"]) of the time needed to read the standard plain text format using read.table(). read_vc() takes about r sprintf("%.0f%%", read_ratio["read_vc.optim"]) (optimized) and r sprintf("%.0f%%", read_ratio["read_vc.verbose"]) (verbose) of the time needed by read.table(), which at first seems strange because read_vc() calls read.table() to read the files and has some extra work to convert the variables to the correct data type. The main difference is that read_vc() knows the required data type a priori and passes this info to read.table(). Otherwise, read.table() has to guess the correct data type from the file.

mb$expr <- factor(
  mb$expr,
  levels = c("readRDS", "read.table", "read_vc.optim", "read_vc.verbose")
)
levels(mb$expr) <- gsub("read_vc\\.", "read_vc\n", levels(mb$expr))
ggplot(mb, aes(x = expr, y = time)) +
  geom_boxplot() +
  scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) +
  theme(axis.title.x = ggplot2::element_blank())