context("handle missing data")
library(dplyr)
# randomly replace values with NA's
create_na <- function (x, prop = 0.1) {
if (is.data.frame(x)) {
n = nrow(x)
p = ncol(x)
} else if (is.vector(x)) {
n = length(x)
p = 1
}
NAloc = rep(FALSE, times = n * p)
NAloc[sample.int(n * p, floor(n * p * prop))] = TRUE
x[matrix(NAloc, nrow = n, ncol = p)] = NA
return(x)
}
# create example data
make_data <- function() {
N_PLATES = 5
wells = rep(platetools::num_to_well(1:96), N_PLATES)
plate_id = rep(c("plate_1", "plate_2", "plate_3", "plate_4", "plate_5"),
each = 96)
val1 = rnorm(96 * N_PLATES, 10, 10)
val2 = rnorm(96 * N_PLATES, 1, 100)
val3 = rnorm(96 * N_PLATES, 5, 10)
val4 = rnorm(96 * N_PLATES, 20, 0.1)
comps = c(rep("cmpd", 80), rep("DMSO", 16))
compound = rep(comps, N_PLATES)
df = data.frame(Metadata_well = wells,
Metadata_plate_id = plate_id,
Metadata_compound = compound,
val1, val2)
df2 = cbind(df, val3, val4)
return(df2)
}
df2 = make_data()
# create dataframe with one column (val1) containing 90% missing data
df2_m = df2
df2_m[["val1"]] = create_na(df2_m[["val1"]], 0.5)
test_that("drop_missing_cols removes column", {
output = drop_missing_cols(df2_m, threshold=0.4)
expect_true(! "val1" %in% colnames(output))
})
test_that("drop_missing_cols respects threshold value", {
output = drop_missing_cols(df2_m, threshold=0.99)
expect_equal(colnames(output), colnames(df2_m))
})
test_that("drop_missing_rows removes rows", {
output = drop_missing_rows(df2_m, threshold=0.8)
expect_lt(nrow(output), nrow(df2_m)) # expect_less_than hadley!
expect_true(all(complete.cases(output)))
})
test_that("impute_missing errors with invalid method", {
expect_error(impute_missing(df2_m, method="invalid"))
})
test_that("impute_missing mean returns correct value", {
test_df = data.frame(
x = c(1L, 2L, 3L, 400L, NA),
y = c(100L, 200L, 300L, 4000L, NA),
Metadata_col = c(rep("string", 4), NA)
)
output = impute_missing(test_df, method = "mean")
expect_equal(as.numeric(output[nrow(output), "x"]), 101.5)
expect_equal(as.numeric(output[nrow(output), "y"]), 1150)
# check it's not imputed metadata columns
expect_equal(test_df[["Metadata_col"]], output[["Metadata_col"]])
})
test_that("impute_missing median returns correct value", {
test_df = data.frame(
x = c(1L, 2L, 3L, 400L, NA),
y = c(100L, 200L, 300L, 4000L, NA),
Metadata_col = c(rep("string", 4), NA)
)
output = impute_missing(test_df, method = "median")
expect_equal(as.numeric(output[nrow(output), "x"]), 2.5)
expect_equal(as.numeric(output[nrow(output), "y"]), 250)
# check it's not imputed metadata columns
expect_equal(test_df[["Metadata_col"]], output[["Metadata_col"]])
})
test_that("impute_missing works with grouped dataframe", {
# dataframe with two groups
test_df = data.frame(
x = c(1L, 2L, 3L, 400L, NA, 100L, 200L, 300L, 4000L, NA),
y = c(100L, 200L, 300L, 4000L, NA, 1000L, 2000L, 3000L, 40000L, NA),
Metadata_col = rep(c("A", "B"), each=5)
)
grouped_median = test_df %>%
group_by(Metadata_col) %>%
impute_missing(method = "median") %>%
ungroup()
grouped_mean = test_df %>%
group_by(Metadata_col) %>%
impute_missing(method = "mean") %>%
ungroup()
expect_equal(
as.numeric(grouped_median[["x"]]),
c(1, 2, 3, 400, 2.5, 100, 200, 300, 4000, 250)
)
expect_equal(
as.numeric(grouped_median[["y"]]),
c(100, 200, 300, 4000, 250, 1000, 2000, 3000, 40000, 2500)
)
expect_equal(
as.numeric(grouped_mean[["x"]]),
c(1, 2, 3, 400, 101.5, 100, 200, 300, 4000, 1150)
)
expect_equal(
as.numeric(grouped_mean[["y"]]),
c(100, 200, 300, 4000, 1150, 1000, 2000, 3000, 40000, 11500)
)
})
test_that("impute_missing nn returns correct value", {
# TODO: write test
expect_true(TRUE)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.