# R/group_by_in_memory.R

#' Time data.table GROUP BY operation
#'
#' @param fname name of the file to read
#' @param threads number of data.table threads to use
#' @param by_col name of the grouping column
#' @param data_col name of the column to aggregate
#' @param group_fun function to apply to each group
group_by_data.table = function(fname = default_csv_file(), threads = 2L, by_col = "g", data_col = "col1", group_fun = median)
{
    data.table::setDTthreads(threads)
    read_time = system.time(DT <- data.table::fread(fname))
    # .SD[[data_col]] extracts the data column as a vector within each group;
    # .SD[, data_col] would evaluate to the string itself, since data_col is
    # not a column name. .SDcols keeps .SD restricted to that one column.
    split_and_compute_time = system.time(DT[, group_fun(.SD[[data_col]]), by = by_col, .SDcols = data_col])
    list(read_time = read_time, split_and_compute_time = split_and_compute_time)
}
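
# Usage sketch (illustrative, not from the package): assumes default_csv_file()
# points at a CSV with a grouping column "g" and a numeric column "col1",
# matching the defaults above.
# timings = group_by_data.table(threads = 4L)
# timings$read_time["elapsed"]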


#' Time base R tapply GROUP BY operation
#'
#' @param fname name of the file to read
#' @param by_col name of the grouping column
#' @param data_col name of the column to aggregate
#' @param group_fun function to apply to each group
group_by_tapply = function(fname = default_csv_file(), by_col = "g", data_col = "col1", group_fun = median)
{
    # No point timing base R's IO functions; they're known to be around two
    # orders of magnitude slower than high-performance readers such as fread.
    DT = data.table::fread(fname)
    df = as.data.frame(DT)
    split_and_compute_time = system.time({
        out = tapply(df[, data_col], df[, by_col], group_fun)
    })
    list(split_and_compute_time = split_and_compute_time)
}
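
# Usage sketch (same CSV assumptions as above): tapply applies group_fun to
# df[, data_col] split by df[, by_col], returning one value per group.
# group_by_tapply()$split_and_compute_time["elapsed"]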


#' Time base R split/lapply GROUP BY operation
#'
#' @param fname name of the file to read
#' @param by_col name of the grouping column
#' @param data_col name of the column to aggregate
#' @param group_fun function to apply to each group
group_by_split_lapply = function(fname = default_csv_file(), by_col = "g", data_col = "col1", group_fun = median)
{
    DT = data.table::fread(fname)
    df = as.data.frame(DT)
    split_time = system.time({
        s = split(df[, data_col], df[, by_col])
    })
    lapply_time = system.time({
        out = lapply(s, group_fun)
    })

    list(split_time = split_time, lapply_time = lapply_time)
}
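

# End-to-end sketch (not part of the original benchmarks): write a small
# synthetic CSV in the layout the timers expect (a grouping column "g" and a
# numeric column "col1"), then run all three implementations on it. The helper
# name, file location, and data sizes here are illustrative assumptions.
run_group_by_sketch = function(fname = tempfile(fileext = ".csv"), n = 1e5L, n_groups = 100L)
{
    df = data.frame(g = sample.int(n_groups, n, replace = TRUE), col1 = rnorm(n))
    data.table::fwrite(df, fname)
    list(
        data.table = group_by_data.table(fname),
        tapply = group_by_tapply(fname),
        split_lapply = group_by_split_lapply(fname)
    )
}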