v4-comparisons.R
In nc: Named Capture to Data Tables

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------

library(data.table)
data.table(iris)


## -----------------------------------------------------------------------------

iris.parts <- list(
  nc=nc::capture_melt_multiple(
    iris,
    column=".*?",
    "[.]",
    dim=".*"),
  tidyr=if(requireNamespace("tidyr"))tidyr::pivot_longer(
    iris, 
    cols=1:4, 
    names_to=c(".value", "dim"),
    names_sep="[.]"),
  stats=stats::reshape(
    iris,
    direction="long",
    timevar="dim",
    varying=1:4,
    sep="."),
  "data.table::melt"=melt(
    data.table(iris),
    measure.vars=patterns(
      Sepal="^Sepal",
      Petal="^Petal")
  )[data.table(
    variable=factor(1:2), dim=c("Length", "Width")
  ), on=.(variable)],
  if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
    iris,
    controlTable=data.frame(
      dim=c("Length", "Width"),
      Petal=c("Petal.Length", "Petal.Width"),
      Sepal=c("Sepal.Length", "Sepal.Width"),
      stringsAsFactors=FALSE),
    columnsToCopy="Species"))
iris.parts$nc


## -----------------------------------------------------------------------------

if(require(ggplot2)){
  ggplot()+
    theme_bw()+
    theme(panel.spacing=grid::unit(0, "lines"))+
    facet_grid(dim ~ Species)+
    coord_equal()+
    geom_abline(slope=1, intercept=0, color="grey")+
    geom_point(aes(
      Petal, Sepal),
      data=iris.parts$nc)
}


## -----------------------------------------------------------------------------

iris.dims <- list(
  nc=nc::capture_melt_multiple(
    iris,
    part=".*?",
    "[.]",
    column=".*"),
  stats=stats::reshape(
    structure(iris, names=sub("(.*?)[.](.*)", "\\2.\\1", names(iris))),
    direction="long",
    timevar="part",
    varying=1:4,
    sep="."))
iris.dims$nc


## -----------------------------------------------------------------------------

if(require(ggplot2)){
  ggplot()+
    theme_bw()+
    theme(panel.spacing=grid::unit(0, "lines"))+
    facet_grid(part ~ Species)+
    coord_equal()+
    geom_abline(slope=1, intercept=0, color="grey")+
    geom_point(aes(
      Length, Width),
      data=iris.dims$nc)
}


## -----------------------------------------------------------------------------

TC <- data.table::data.table(
  age.treatment=c(1, 5),
  sex.control=c("M", "M"),
  sex.treatment=c("F", "F"),
  age.control=c(10, 50))


## -----------------------------------------------------------------------------

input.list <- list(
  "nc"=nc::capture_melt_multiple(
    TC,
    column=".*?",
    "[.]",
    group=".*"),
  "cdata"=if(requireNamespace("cdata"))cdata::rowrecs_to_blocks(
    TC,
    controlTable=data.frame(
      group=c("treatment", "control"),
      age=c("age.treatment", "age.control"),
      sex=c("sex.treatment", "sex.control"),
      stringsAsFactors=FALSE)),
  "data.table"=data.table::melt(TC, measure.vars=patterns(
    age="age",
    sex="sex")),
  "stats"=stats::reshape(
    TC,
    varying=1:4,
    direction="long"),
  "tidyr"=if(requireNamespace("tidyr"))tidyr::pivot_longer(
    TC, 1:4,
    names_to=c(".value", "group"),
    names_sep="[.]"))
output.list <- list()
for(pkg in names(input.list)){
  df.or.null <- input.list[[pkg]]
  if(is.data.frame(df.or.null)){
    output.list[[pkg]] <- data.table::data.table(df.or.null)[order(age)]
  }
}
output.list
sapply(output.list, function(DT)identical(DT$sex, c("F", "F", "M", "M")))


## -----------------------------------------------------------------------------

if(requireNamespace("tidyr")){
  data(who, package="tidyr")
}else{
  who <- data.frame(id=1, new_sp_m5564=2, newrel_f65=3)
}
names(who)


## -----------------------------------------------------------------------------

who.chr.list <- list(
  nc=nc::capture_melt_single(
    who,
    "new_?",
    diagnosis=".*",
    "_",
    gender=".",
    ages=".*"),
  tidyr=if(requireNamespace("tidyr"))tidyr::pivot_longer(
    who,
    new_sp_m014:newrel_f65,
    names_to=c("diagnosis", "gender", "ages"),
    names_pattern="new_?(.*)_(.)(.*)"))


## -----------------------------------------------------------------------------

who.pattern <- "new_?(.*)_(.)((0|[0-9]{2})([0-9]{0,2}))"
as.numeric.Inf <- function(y)ifelse(y=="", Inf, as.numeric(y))
who.typed.list <- list(
  nc=nc::capture_melt_single(
    who,
    "new_?",
    diagnosis=".*",
    "_",
    gender=".",
    ages=list(
      ymin.num="0|[0-9]{2}", as.numeric,
      ymax.num="[0-9]{0,2}", as.numeric.Inf),
    value.name="count",
    na.rm=TRUE),
  tidyr=if(requireNamespace("tidyr"))try(tidyr::pivot_longer(
    who,
    cols=grep(who.pattern, names(who)),
    names_transform=list(
      ymin.num=as.numeric,
      ymax.num=as.numeric.Inf),
    names_to=c("diagnosis", "gender", "ages", "ymin.num", "ymax.num"),
    names_pattern=who.pattern,
    values_drop_na=TRUE,
    values_to="count")))
str(who.typed.list)


## -----------------------------------------------------------------------------

if(requireNamespace("tidyr")){
  gather.result <- tidyr::gather(
    who,
    "variable",
    "count",
    grep(who.pattern, names(who)),
    na.rm=TRUE)
  extract.result <- tidyr::extract(
    gather.result,
    "variable",
    c("diagnosis", "gender", "ages", "ymin.int", "ymax.int"),
    who.pattern,
    convert=TRUE)
  transform.result <- base::transform(
    extract.result,
    ymin.num=as.numeric(ymin.int),
    ymax.num=ifelse(is.na(ymax.int), Inf, as.numeric(ymax.int)))
  str(transform.result)
}


## -----------------------------------------------------------------------------

reshape2.result <- if(requireNamespace("reshape2")){
  reshape2:::melt.data.frame(
    who,
    measure.vars=grep(who.pattern, names(who)),
    na.rm=TRUE,
    value.name="count")
}


## -----------------------------------------------------------------------------

dt.result <- data.table::melt.data.table(
  data.table(who),
  measure.vars=patterns(who.pattern),
  na.rm=TRUE,
  value.name="count")


## -----------------------------------------------------------------------------

who.df <- data.frame(who)
is.varying <- grepl(who.pattern, names(who))
names(who.df)[is.varying] <- paste0("count.", names(who)[is.varying])
stats.result <- stats::reshape(
  who.df,
  direction="long",
  timevar="variable",
  varying=is.varying)


## -----------------------------------------------------------------------------

if(requireNamespace("cdata")){
  cdata.result <- cdata::rowrecs_to_blocks(
    who, 
    cdata::build_unpivot_control(
      "variable",
      "count",
      grep(who.pattern, names(who), value=TRUE)),
    columnsToCopy=grep(who.pattern, names(who), value=TRUE, invert=TRUE))
}


## -----------------------------------------------------------------------------

## Example 1: melting a wider iris data back to original.
library(data.table)
iris.dt <- data.table(
  i=1:nrow(iris),
  iris[,1:4],
  Species=paste(iris$Species))
print(iris.dt)

## what if we had two observations on each row?
set.seed(1)
iris.rand <- iris.dt[sample(.N)]
iris.wide <- cbind(treatment=iris.rand[1:75], control=iris.rand[76:150])
print(iris.wide, topn=2, nrows=10)

## This is the usual data.table syntax for getting the original iris back.
iris.melted <- melt(iris.wide, value.factor=TRUE, measure.vars = patterns(
  i="i$",
  Sepal.Length="Sepal.Length$",
  Sepal.Width="Sepal.Width$",
  Petal.Length="Petal.Length$",
  Petal.Width="Petal.Width$",
  Species="Species$"))
identical(iris.melted[order(i), names(iris.dt), with=FALSE], iris.dt)

## nc can do the same thing -- you must define an R argument named
## column, and another named argument which identifies each group.
(nc.melted <- nc::capture_melt_multiple(
  iris.wide,
  group="[^.]+",
  "[.]",
  column=".*"))
identical(nc.melted[order(i), names(iris.dt), with=FALSE], iris.dt)

## This is how we do it using stats::reshape.
iris.wide.df <- data.frame(iris.wide)
names(iris.wide.df) <- sub("(.*?)[.](.*)", "\\2_\\1", names(iris.wide))
iris.reshaped <- stats::reshape(
  iris.wide.df,
  direction="long",
  timevar="group",
  varying=names(iris.wide.df),
  sep="_")
identical(data.table(iris.reshaped[, names(iris.dt)])[order(i)], iris.dt)

## get the parts columns and groups -- is there any difference
## between groups? of course not!
parts.wide <- nc::capture_melt_multiple(
  iris.wide,
  group=".*?",
  "[.]",
  column=".*?",
  "[.]",
  dim=".*")
if(require("ggplot2")){
  ggplot()+
    theme_bw()+
    theme(panel.spacing=grid::unit(0, "lines"))+
    facet_grid(dim ~ group)+
    coord_equal()+
    geom_abline(slope=1, intercept=0, color="grey")+
    geom_point(aes(
      Petal, Sepal),
      data=parts.wide)
}


## -----------------------------------------------------------------------------

## Example 2. Lots of column types, from example(melt.data.table).
DT <- data.table(
  i_1 = c(1:5, NA),
  i_2 = c(NA,6:10),
  f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
  f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
  c_1 = sample(c(letters[1:3], NA), 6, TRUE),
  d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
  d_2 = as.Date(6:1, origin="2012-01-01"))
## add a couple of list cols
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]

## original DT syntax is quite repetitive.
melt(DT, measure=patterns(
  i="^i",
  f="^f",
  d="^d",
  l="^l"
))

## nc syntax uses a single regex rather than four.
nc::capture_melt_multiple(
  DT,
  column="^[^c]",
  "_",
  number="[12]")

## id.vars can be specified using original DT syntax.
melt(DT, id=1:2, measure=patterns(
  f="^f",
  l="^l"
))

nc::capture_melt_multiple(
  DT,
  column="^[fl]",
  "_",
  number="[12]")

## reshape does not support list columns.
reshape(
  DT,
  varying=grep("^[fid]", names(DT)),
  sep="_",
  direction="long",
  timevar="number")

## tidyr does, but errors for combining ordered and un-ordered factors.
if(requireNamespace("tidyr")){
  tidyr::pivot_longer(
    DT, grep("[cf]", names(DT), invert=TRUE),
    names_pattern="(.)_(.)",
    names_to=c(".value", "number"))
}


## -----------------------------------------------------------------------------

## Example 3, three children, one family per row, from data.table
## vignette.
family.dt <- fread(text="
family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
1         30 1998-11-26 2000-01-29         NA             1             2            NA
2         27 1996-06-22         NA         NA             2            NA            NA
3         26 2002-07-11 2004-04-05 2007-09-02             2             2             1
4         32 2004-10-10 2009-08-27 2012-07-21             1             1             1
5         29 2000-12-05 2005-02-28         NA             2             1            NA")
(children.melt <- melt(family.dt, measure = patterns(
  dob="^dob", gender="^gender"
), na.rm=TRUE, variable.factor=FALSE))

## nc::field can be used to define group name and pattern at the
## same time, to avoid repetitive code.
(children.nc <- nc::capture_melt_multiple(
  family.dt,
  column="[^_]+",
  "_",
  nc::field("child", "", "[1-3]"),
  na.rm=TRUE))

## reshape works too.
stats::reshape(
  family.dt,
  varying=grep("child", names(family.dt)),
  direction="long",
  sep="_",
  timevar="child.str")


## -----------------------------------------------------------------------------

## Comparison with base R. 1. mfrow means parts on rows, mfcol means
## parts on columns. 2. same number of lines of code. 3. nc/ggplot2
## code has more names and fewer numbers.
imat <- as.matrix(iris[, 1:4])
ylim <- range(table(imat))
xlim <- range(imat)
old.par <- par(mfcol=c(2,2), mar=c(2,2,1,1))
for(col.i in 1:ncol(imat)){
  hist(
    imat[, col.i],
    breaks=seq(xlim[1], xlim[2], by=0.1),
    ylim=ylim,
    main=colnames(imat)[col.i])
}


## -----------------------------------------------------------------------------

pen.peaks.wide <- data.table::data.table(
  data.set=c("foo", "bar"),
  "10.1"=c(5L, 10L),
  "0.3"=c(26L, 39L))
pen.peaks.gather <- if(requireNamespace("tidyr"))tidyr::gather(
  pen.peaks.wide,
  "penalty",
  "peaks",
  -1,
  convert=TRUE)
str(pen.peaks.gather)

pen.peaks.nc <- nc::capture_melt_single(
  pen.peaks.wide,
  penalty="^[0-9.]+", as.numeric,
  value.name="peaks")
str(pen.peaks.nc)

pen.peaks.pivot <- if(requireNamespace("tidyr"))try(tidyr::pivot_longer(
  pen.peaks.wide,
  -1,
  names_to="penalty",
  names_transform=list(penalty=as.numeric),
  values_to="peaks"))
str(pen.peaks.pivot)

varying <- 2:3
pen.peaks.reshape.times <- stats::reshape(
  pen.peaks.wide,
  direction="long",
  varying=varying,
  times=as.numeric(names(pen.peaks.wide)[varying]),
  v.names="peaks",
  timevar="penalty")
str(pen.peaks.reshape.times)

pen.peaks.renamed <- pen.peaks.wide
names(pen.peaks.renamed) <- paste0(ifelse(
  grepl("^[0-9]", names(pen.peaks.wide)),
  "peaks_", ""),
  names(pen.peaks.wide))
pen.peaks.reshape.sep <- stats::reshape(
  pen.peaks.renamed,
  direction="long",
  varying=varying,
  sep="_",
  timevar="penalty")
str(pen.peaks.reshape.sep)


## -----------------------------------------------------------------------------
peaks.csv <- system.file(
  "extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
  package="nc", mustWork=TRUE)
peaks.wide <- data.table::fread(peaks.csv)
tidyr.long <- tidyr::pivot_longer(
  peaks.wide,
  grep(" [0-9]", names(peaks.wide)),
  names_pattern = "(.*) ([0-9]+)",
  names_to = c(".value", "peak"),
  names_transform = list(peak=as.integer))
peaks.tall <- nc::capture_melt_multiple(
  peaks.wide,
  column=".*",
  " ",
  peak="[0-9]+", as.integer)
options(old.par)