Nothing
library(testthat)
library(nc)
library(data.table)
context("multiple")
source(system.file("test_engines.R", package="nc", mustWork=TRUE), local=TRUE)
test_engines("only one input variable per column value is an error", {
expect_error({
nc::capture_melt_multiple(iris, column=".*[.].*", other=".*")
}, "only one input variable for each value captured in column group; typically this happens when the column group matches the entire input column name; fix by changing regex so that column group matches a strict substring (not the entire input column names)",
fixed=TRUE)
})
iris.dt <- data.table(i=1:nrow(iris), iris)
iris.dims <- capture_melt_multiple(
iris.dt,
part=".*?",
"[.]",
column=".*")
iris.dims.wide <- dcast(
iris.dims,
i + Species ~ part,
value.var=c("Length", "Width"))
names(iris.dims.wide) <- sub("(.*?)_(.*)", "\\2.\\1", names(iris.dims.wide))
should.be.orig <- iris.dims.wide[, names(iris.dt), with=FALSE]
setkeyv(should.be.orig, key(iris.dt))
test_engines("orig iris.dt and recast data are the same", {
expect_equal(should.be.orig, iris.dt)
})
iris.dt[, chr := paste(Species)]
compare.cols <- c(
"Sepal.Length", "Petal.Length",
"Sepal.Width", "Petal.Width",
"chr")
set.seed(1)
iris.rand <- iris.dt[sample(.N)]
iris.wide <- cbind(treatment=iris.rand[1:75], control=iris.rand[76:150])
test_engines("iris multiple column matches", {
nc.melted <- capture_melt_multiple(
iris.wide,
group="[^.]+",
"[.]",
column=".*")
expect_identical(nc.melted$group, rep(c("control", "treatment"), each=75))
nc.melted.orig <- nc.melted[order(i), ..compare.cols]
expect_identical(nc.melted.orig, iris.dt[, ..compare.cols])
})
set.seed(1)
iris.wide.rand <- iris.wide[, sample(names(iris.wide)), with=FALSE]
test_engines("iris multiple rand column matches", {
rand.melted <- capture_melt_multiple(
iris.wide.rand,
group="[^.]+",
"[.]",
column=".*"
)[order(i)]
expect_identical(rand.melted[, ..compare.cols], iris.dt[, ..compare.cols])
})
iris.wide.bad <- data.table(iris.wide)
names(iris.wide.bad)[2] <- "treatment.Sepal.bad"
test_engines("iris multiple rand column matches", {
expect_error({
capture_melt_multiple(
iris.wide.bad,
group="[^.]+",
"[.]",
column=".*")
}, "need column=same count for each value")
})
set.seed(45)
DT <- data.table(
i_1 = c(1:5, NA),
i_2 = c(NA,6:10),
f_1 = factor(sample(c(letters[1:3], NA), 6, TRUE)),
f_2 = factor(c("z", "a", "x", "c", "x", "x"), ordered=TRUE),
c_1 = sample(c(letters[1:3], NA), 6, TRUE),
d_1 = as.Date(c(1:3,NA,4:5), origin="2013-09-01"),
d_2 = as.Date(6:1, origin="2012-01-01"))
## add a couple of list cols
DT[, l_1 := DT[, list(c=list(rep(i_1, sample(5,1)))), by = i_1]$c]
DT[, l_2 := DT[, list(c=list(rep(c_1, sample(5,1)))), by = i_1]$c]
test_engines("melt multiple column types error if no other group", {
expect_error({
capture_melt_multiple(DT, column="^[^c]")
}, "need at least one group other than column")
})
test_engines("melt multiple column types", {
result <- capture_melt_multiple(
DT,
column="^[^c]",
"_",
.number="[0-9]", as.integer)
expect_identical(result$.number, rep(1:2, each=nrow(DT)))
expect_is(result$c_1, "character")
expect_is(result$i, "integer")
expect_is(result$f, "character")
expect_is(result$d, "Date")
expect_is(result$l, "list")
expect_identical(nrow(result), 12L)
})
family.dt <- fread(text="family_id age_mother dob_child1 dob_child2 dob_child3 gender_child1 gender_child2 gender_child3
1 30 1998-11-26 2000-01-29 NA 1 2 NA
2 27 1996-06-22 NA NA 2 NA NA
3 26 2002-07-11 2004-04-05 2007-09-02 2 2 1
4 32 2004-10-10 2009-08-27 2012-07-21 1 1 1
5 29 2000-12-05 2005-02-28 NA 2 1 NA")
test_engines("gender dob example", {
na.children <- capture_melt_multiple(
family.dt,
column="[^_]+",
between="_child",
number="[1-3]",
na.rm=FALSE)
expect_is(na.children$family_id, "integer")
expect_is(na.children$age_mother, "integer")
expect_is(na.children$dob, "IDate")
expect_is(na.children$gender, "integer")
expect_equal(sum(is.na(na.children$dob)), 4)
expect_equal(nrow(na.children), 15)
})
test_engines("error for unequal number of children", {
bad.groups <- data.table(family.dt, dob_child0="1999-01-01", gender_child4=2)
expect_error({
capture_melt_multiple(
bad.groups,
column="[^_]+",
between="_child",
number="[0-9]")
}, "need between,number=same count for each value")
})
test_engines("error for column conversion to factor", {
expect_error({
capture_melt_multiple(
family.dt,
column="[^_]+", as.factor,
between="_child",
number="[0-9]")
},
"column group must be character, but conversion function returned factor")
})
test_engines("error for column conversion to integer", {
expect_error({
expect_warning({
capture_melt_multiple(
family.dt,
column="[^_]+", as.integer,
between="_child",
number="[0-9]")
}, "NAs introduced by coercion")
},
"column group must be character, but conversion function returned integer")
})
test_engines("error for unequal number of columns", {
bad.cols <- data.table(family.dt, bar_child0="1999-01-01", foo_child0=2)
expect_error({
capture_melt_multiple(
bad.cols,
column="[^_]+",
between="_child",
number="[0-9]")
}, "need column=same count for each value")
})
test_engines("gender dob example na.rm=TRUE", {
children <- capture_melt_multiple(
family.dt,
column="[^_]+",
between="_child",
number="[1-3]",
na.rm=TRUE)
expect_is(children$family_id, "integer")
expect_is(children$age_mother, "integer")
expect_is(children$dob, "IDate")
expect_is(children$gender, "integer")
expect_equal(sum(is.na(children$dob)), 0)
expect_equal(nrow(children), 11)
})
test_engines("variable group ok in melt_multiple", {
result <- capture_melt_multiple(
family.dt,
column="[^_]+",
variable="_child",
number="[1-3]")
exp.names <- c(
"family_id", "age_mother",
"variable", "number",
"dob", "gender")
expect_identical(names(result), exp.names)
})
test_engines("multiple error if subject not df", {
expect_error({
capture_melt_multiple("foobar")
}, "first argument (subject) must be a data.frame", fixed=TRUE)
})
test_engines("multiple error if no arg named variable", {
expect_error({
capture_melt_multiple(family.dt, baz="family")
}, "pattern must define group named column")
})
test_engines("multiple error if no matching column names", {
expect_error({
capture_melt_multiple(family.dt, column="foobar")
}, "no column names match regex")
})
## what if input df has repeated names?
bad.dt <- data.table(family_id=LETTERS[1:5], family.dt)
test_engines("multiple df with same col names is an error", {
expect_error({
capture_melt_multiple(
bad.dt, column=".*", "_", field("child", "", "[0-9]"))
}, "input must have columns with unique names, problems: family_id")
})
## what if there are two groups with the same name?
test_engines("multiple groups with the same name is an error", {
expect_error({
capture_melt_multiple(
family.dt, column=".*", child="_", field("child", "", "[0-9]"))
}, "duplicate capture group names are only allowed in alternatives, problem: child")
})
## what if a capture group has the same name as an input column?
test_engines("err mult capture group same as input col", {
expect_error({
capture_melt_multiple(
family.dt, column=".*", family_id="_", field("child", "", "[0-9]"))
},
"some capture group names (family_id) are the same as input column names that did not match the pattern; please change either the pattern or the capture group names so that all output column names will be unique",
fixed=TRUE)
})
## what if value.name is the same as input col?
bad2 <- data.table(family.dt)
names(bad2)[1] <- "dob"
test_engines("err change value.name if same as input col", {
expect_error({
capture_melt_multiple(
bad2, column=".*", family_id="_", field("child", "", "[0-9]"))
},
"unable to create unique output column names; some values (dob) captured by the regex group named column are the same as input column names which do not match the pattern; please change either the pattern or the input column names which do not match the pattern so that output column names will be unique",
fixed=TRUE)
})
test_engines("err mult value.name same as group names", {
expect_error({
capture_melt_multiple(
family.dt, column=".*", dob="_", field("child", "", "[0-9]"))
},
"unable to create unique output column names; some values (dob) captured by the regex group named column are the same as other regex group names; please change either the pattern or the other regex group names so that output column names will be unique",
fixed=TRUE)
})
i.vec <- 1:10000
c.vec <- paste(i.vec)
one.row <- data.frame(I=t(i.vec), C=t(c.vec), stringsAsFactors=FALSE)
test_engines("multiple melting lots of columns is OK", {
out <- capture_melt_multiple(
one.row, column=".", "[.]", int="[0-9]+", as.integer)
expect_identical(out$int, i.vec)
expect_identical(out$I, i.vec)
expect_identical(out$C, c.vec)
})
wide.metrics <- data.table(
FP.possible=8202, FN.possible=1835,
FP.count=0, FN.count=1835)
test_engines("count is either 0 or 1835", {
tall.metrics <- capture_melt_multiple(
wide.metrics,
metric=".*?",
"[.]",
column=".*")[order(metric)]
expect_identical(tall.metrics$metric, c("FN", "FP"))
expect_identical(tall.metrics$count, c(1835, 0))
expect_identical(tall.metrics$possible, c(1835, 8202))
})
test_engines("melt multiple with count group ok", {
iris.count <- nc::capture_melt_multiple(
iris,
column=".*?",
"[.]",
count=".*")
expect_identical(names(iris.count), c("Species", "count", "Petal", "Sepal"))
})
## apparently there are some data with missing columns
## https://github.com/Rdatatable/data.table/issues/4027
wide.input <- data.table(
id = 1, a.1 = 1, a.3 = 3, b.1 = 1, b.2 = 2, b.3 = 3)
ix.pattern <- list(column="[a-z]", "[.]", ix="[0-9]", as.integer)
test_engines("error by default for missing column", {
expect_error({
capture_melt_multiple(wide.input, ix.pattern)
}, "need ix=same count for each value, but have: 1=2 2=1 3=2; please change pattern, edit input column names, or use fill=TRUE to output missing values")
})
##But what I really want is the ix of a.3 to be 3, not 2.
## id ix a b
## 1: 1 1 1 1
## 2: 1 2 NA 2
## 3: 1 3 3 3
tall.expected <- data.table(
id=1,
ix=1:3,
a=c(1, NA, 3),
b=1:3)
test_engines("NA for missing column when fill=TRUE", {
tall.output <- capture_melt_multiple(
wide.input, ix.pattern, fill=TRUE)
expect_equal(data.frame(tall.output), data.frame(tall.expected))
})
test_engines("NA for missing columns as in data table example", {
DT.missing.cols <- DT[, .(d_1, d_2, c_1, f_2)]
computed <- capture_melt_multiple(
DT.missing.cols,
column="[a-z]",
"_",
int="[0-9]", as.integer,
fill=TRUE)
expected <- DT.missing.cols[, data.table(
int=rep(1:2, each=.N),
c=c(c_1, rep(NA, .N)),
d=c(d_1, d_2),
f=c(rep(NA, .N), paste(f_2)))]
expect_identical(data.frame(computed), data.frame(expected))
})
family4.dt <- data.table(family.dt)
family4.dt[, `:=`(dob_child4=NA_character_, gender_child4=NA_integer_)]
test_engines("no families with 4 children", {
child.pattern <- list(
column="[^_]+",
between="_child",
number="[1-4]")
na.rm.T <- suppressWarnings({
capture_melt_multiple(
family4.dt,
child.pattern,
na.rm=TRUE)
})
expect_equal(sum(is.na(na.rm.T$dob)), 0)
})
test_engines("lots of missing columns ok with chr capture col small", {
tall.dt <- data.table(
family=1,
child=10:1,
num=c(10, 9, rep(NA, 8)),
chr=c("ten", "nine", rep(NA, 8)))
wide.dt <- dcast(
tall.dt,
family ~ child,
value.var=c("num", "chr"))
tall.again <- capture_melt_multiple(
wide.dt,
column=".*",
"_",
child="[0-9]+",
na.rm=TRUE)[order(num)]
expect_identical(tall.again$child, paste(9:10))
})
test_engines("lots of missing columns ok with chr capture col big", {
PROVEDIt.csv <- system.file(
"extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
package="nc", mustWork=TRUE)
PROVEDIt.wide <- data.table::fread(PROVEDIt.csv)
PROVEDIt.tall <- suppressWarnings({
nc::capture_melt_multiple(
PROVEDIt.wide,
column=".*",
" ",
peak="[0-9]+",
na.rm=TRUE)
})
peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak)))
expect_identical(peak.int, seq_along(peak.int))
})
test_engines("lots of missing columns chr col big na.rm=FALSE", {
PROVEDIt.csv <- system.file(
"extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
package="nc", mustWork=TRUE)
PROVEDIt.wide <- data.table::fread(PROVEDIt.csv)
PROVEDIt.tall <- suppressWarnings({
nc::capture_melt_multiple(
PROVEDIt.wide,
column=".*",
" ",
peak="[0-9]+",
na.rm=FALSE)[!is.na(Size)]
})
## Why does this test pass? No missing values are removed so data
## table assigns peak numbers 1-100 correctly and then we filter the
## NAs.
peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak)))
expect_identical(peak.int, seq_along(peak.int))
})
test_engines("lots of missing columns int col big na.rm=TRUE", {
PROVEDIt.csv <- system.file(
"extdata", "RD12-0002_PP16HS_5sec_GM_F_1P.csv",
package="nc", mustWork=TRUE)
PROVEDIt.wide <- data.table::fread(PROVEDIt.csv)
PROVEDIt.tall <- suppressWarnings({
nc::capture_melt_multiple(
PROVEDIt.wide,
column=".*",
" ",
peak="[0-9]+", as.integer,
na.rm=TRUE)
})
## Why does this test pass? With buggy data.table the missing values
## are removed and then the peak IDs are assigned. Because the
## missing values are at the end (for peak>36) and the data are
## sorted numerically by integers, the assigned IDs match the peak
## numbers exactly.
peak.int <- sort(as.integer(unique(PROVEDIt.tall$peak)))
expect_identical(peak.int, seq_along(peak.int))
})
test_engines("missing first child", {
wide.dt <- data.table(
family=1,
age_childA=NA_real_, sex_childA=NA_character_,
age_childB=5, sex_childB="m")
child.pattern <- list(
column=".*",
"_",
nc::field("child", "", "[A-Z]"))
tall.dt <- nc::capture_melt_multiple(wide.dt, child.pattern, na.rm=TRUE)
expect_identical(tall.dt$child, "B")
expect_identical(tall.dt$age, 5)
expect_identical(tall.dt$sex, "m")
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.