From: https://github.com/tidyverse/tidyr/issues/613.

require(tidyverse)
require(data.table)
data.test <- matrix(
  data = sample(
    x = c(0L, 1L, 2L, NA_integer_),#the genotypes
    size = 2e+07,
    replace = TRUE,
    prob = c(0.8, 0.10, 0.05, 0.05)
    ),
  nrow = 20000,#number of SNPs/markers
  ncol = 1000,#number of samples
  dimnames = list(rownames = seq(1, 20000, 1), colnames = seq(1, 1000, 1))
  ) %>%
  tibble::as_tibble(x = ., rownames = "MARKERS") 
library("cdata")
library("rqdatatable")

packageVersion("data.table")
packageVersion("tidyr")
packageVersion("dplyr")
packageVersion("cdata")
packageVersion("rqdatatable")
data.test <- data.frame(data.test)

test1: data.table::melt.data.table

system.time(
test1 <- data.table::as.data.table(data.test) %>%
  data.table::melt.data.table(
    data = .,
    id.vars = "MARKERS",
    variable.name = "INDIVIDUALS",
    value.name = "GENOTYPES",
    variable.factor = FALSE) 
)
# reported: #~0.41sec
test1 <- orderby(test1, qc(MARKERS, INDIVIDUALS, GENOTYPES)) 

test2: tidyr::gather

system.time(
test2 <- tidyr::gather(
  data = data.test,
  key = "INDIVIDUALS",
  value = "GENOTYPES",
  -MARKERS)
)
# reported: #~0.39sec
test2 <- orderby(test2, qc(MARKERS, INDIVIDUALS, GENOTYPES)) 
stopifnot(isTRUE(all.equal(test1, test2)))

test 2b: reverse

system.time({
  test2b <- tidyr::spread(test2, key = INDIVIDUALS, value = GENOTYPES)
})

test2b <- orderby(test2b, colnames(test2b)) 
data.test <- orderby(test2b, colnames(data.test)) 
stopifnot(isTRUE(all.equal(data.test, test2b)))

test3: latest tidyr::pivot_longer

run_pivot_longer <- exists('pivot_longer', 
                           where = 'package:tidyr', 
                           mode = 'function')
system.time(
test3 <- tidyr::pivot_longer(
  df = data.test,
  cols = -MARKERS,
  names_to = "INDIVIDUALS",
  values_to = "GENOTYPES")
)
# reported: #~90sec !!!
test3 <- orderby(test3, qc(MARKERS, INDIVIDUALS, GENOTYPES)) 
stopifnot(isTRUE(all.equal(test1, test3)))

test 4: cdata::unpivot_to_blocks() (with data.table)

system.time({
  cT <- build_unpivot_control(
    nameForNewKeyColumn = "INDIVIDUALS",
    nameForNewValueColumn = "GENOTYPES",
    columnsToTakeFrom = setdiff(colnames(data.test), 
                                c("MARKERS", "INDIVIDUALS", "GENOTYPES")))
  layout <- rowrecs_to_blocks_spec(
    cT,
    recordKeys = "MARKERS",
    allow_rqdatatable = TRUE)

  print(layout$allow_rqdatatable)

  test4 <- layout_by(layout, data.test)
})
test4 <- orderby(test4, qc(MARKERS, INDIVIDUALS, GENOTYPES)) 
stopifnot(isTRUE(all.equal(test1, test4)))

Slow.

system.time({
  inv_layout <- t(layout)

  print(inv_layout$allow_rqdatatable)

  back4 <- layout_by(inv_layout, test4)
})
back4 <- orderby(back4, colnames(back4)) 
data.test <- orderby(back4, colnames(data.test)) 
stopifnot(isTRUE(all.equal(data.test, back4)))

test 5: cdata::unpivot_to_blocks() (without data.table)

Slow.

system.time({
  cT <- build_unpivot_control(
    nameForNewKeyColumn = "INDIVIDUALS",
    nameForNewValueColumn = "GENOTYPES",
    columnsToTakeFrom = setdiff(colnames(data.test), 
                                c("MARKERS", "INDIVIDUALS", "GENOTYPES")))
  layout <- rowrecs_to_blocks_spec(
    cT,
    recordKeys = "MARKERS",
    allow_rqdatatable = FALSE)

  print(layout$allow_rqdatatable)

  test5 <- layout_by(layout, data.test)
})
test5 <- orderby(test5, qc(MARKERS, INDIVIDUALS, GENOTYPES)) 
stopifnot(isTRUE(all.equal(test1, test5)))

Slow.

system.time({
  inv_layout <- t(layout)

  print(inv_layout$allow_rqdatatable)

  back5 <- layout_by(inv_layout, test5)
})
back5 <- orderby(back5, colnames(back5)) 
data.test <- orderby(back5, colnames(data.test)) 
stopifnot(isTRUE(all.equal(data.test, back5)))


WinVector/cdata documentation built on Aug. 29, 2023, 3:56 a.m.