context("PSD gridding operations")
library(popcycle)
library(dplyr)
source("helper.R")
test_that("Grid two files", {
x <- setUp()
vct_dir <- x$psd.vct.dir
db <- x$psd.db
bins <- 5
quantile_ <- 97.5
qstr <- "q97.5"
qsuffix <- "_q97.5"
vct_files <- list.files(vct_dir, "\\.parquet$", full.names=T)
meta <- create_meta(db, quantile_)
refracs <- refracs <- tibble::tibble(
prochloro="mid", synecho="mid", picoeuk="lwr", croco="lwr", beads="lwr", unknown="lwr"
)
calib <- tibble::tibble(pop=c("prochloro", "synecho"), a=c(2, 3), b=c(0, 1))
grid <- create_grid(bins, log_base=2, log_answers=FALSE)
# Create the gridded data
psd <- create_PSD(
vct_files, quantile_, refracs, grid, log_base=NULL, use_data.table=TRUE
)
# Don't use data.table
psd_no_dt <- create_PSD(
vct_files, quantile_, refracs, grid, log_base=NULL, use_data.table=FALSE
)
# Pass a vector of dates to ignore
ignore_dates <- meta %>% dplyr::filter(flag != 0) %>% pull(date)
psd_ignore_dates <- create_PSD(
vct_files, quantile_, refracs, grid, log_base=NULL, use_data.table=TRUE,
ignore_dates=ignore_dates
)
# Make sure using data.table and not using data.table yields the same result.
# Exclude Qc_sum because of floating-point imprecision
expect_true(all.equal(
psd %>% select(-c(Qc_sum)),
psd_no_dt %>% select(-c(Qc_sum)),
check.attributes=FALSE
))
# Test Qc_sum with near
expect_true(all(near(psd$Qc_sum, psd_no_dt$Qc_sum)))
# Get the actual VCT data
vct <- lapply(vct_files, function(f) {
vct <- arrow::read_parquet(f) %>%
filter(get(qstr)) %>%
select(date, fsc_small, chl_small, pe, ends_with(qsuffix)) %>%
rename_with( # Remove quantile suffix from columns
function(x) { stringr::str_replace(x, paste0(qsuffix, "$"), "") },
ends_with(qsuffix)
) %>%
mutate(
diam=if_else(pop == "prochloro" | pop == "synecho", diam_mid, diam_lwr),
Qc=if_else(pop == "prochloro" | pop == "synecho", Qc_mid, Qc_lwr)
) %>%
select(-c(contains("lwr"), contains("mid"), contains("upr")))
}) %>% bind_rows()
# Total particles should be equal
expect_equal(sum(psd$n), nrow(vct))
# Total Qc sum should be equal
expect_equal(sum(psd$Qc_sum), sum(vct$Qc))
# There should not be any NAs
expect_true(all(!is.na(psd)))
# All columns are present
want_cols <- c("date", sapply(names(grid), function(n) { paste0(n, "_coord") }), "pop", "n", "Qc_sum")
names(want_cols) <- NULL
expect_equal(names(psd), want_cols)
# All dates accounted for
expect_equal(sort(unique(psd$date)), sort(unique(vct$date)))
# All coordinate indices within range
expect_true(min(psd %>% reframe(across(ends_with("_coord"), range))) >= 1)
expect_true(max(psd %>% reframe(across(ends_with("_coord"), range))) <= bins)
# Dates are properly ignored
expect_equal(sum(psd_ignore_dates$n), vct %>% filter(! (date %in% ignore_dates)) %>% nrow())
expect_true(length(intersect(psd_ignore_dates$date, ignore_dates)) == 0)
# Test a large random subset of rows to make sure n and Qc_sum add up properly
# This is slow
filter_vct <- function(vct, grid, r) {
# Find all VCT particles that match one PSD gridded data row
selection <- (
vct$fsc_small >= grid$fsc_small[r$fsc_small_coord] &
vct$fsc_small < grid$fsc_small[r$fsc_small_coord + 1] &
vct$chl_small >= grid$chl_small[r$chl_small_coord] &
vct$chl_small < grid$chl_small[r$chl_small_coord + 1] &
vct$pe >= grid$pe[r$pe_coord] &
vct$pe < grid$pe[r$pe_coord + 1] &
vct$diam >= grid$diam[r$diam_coord] &
vct$diam < grid$diam[r$diam_coord + 1] &
vct$Qc >= grid$Qc[r$Qc_coord] &
vct$Qc < grid$Qc[r$Qc_coord + 1] &
vct$pop == r$pop &
vct$date == r$date
)
return(vct[selection, ])
}
rows_to_test <- as.integer(nrow(psd) * 0.10) # 10% of test gridded data
set.seed(1)
psd_i_to_test <- sample(seq(nrow(psd)), rows_to_test)
n_answers <- sapply(psd_i_to_test, function(i) {
psd[[i, "n"]] == filter_vct(vct, grid, psd[i, ]) %>% nrow()
})
expect_true(all(n_answers))
# Qc_sum may not be exact because of floating-point imprecision, so use near
# rather than ==
Qc_sum_answers <- sapply(psd_i_to_test, function(i) {
near(psd[[i, "Qc_sum"]], filter_vct(vct, grid, psd[i, ]) %>% pull(Qc) %>% sum())
})
expect_true(all(Qc_sum_answers))
# Make sure boundary points are removed
psd_no_boundaries <- create_PSD(
vct_files, quantile_, refracs, grid, log_base=NULL, use_data.table=TRUE,
max_boundary_proportion=0.2
)
n_removed <- vct %>%
group_by(date) %>%
group_modify( ~ {
.x %>%
filter(
fsc_small == min(fsc_small) | fsc_small == max(fsc_small) |
chl_small == min(chl_small) | chl_small == max(chl_small) |
pe == min(pe) | pe == max(pe) |
diam == min(diam) | diam == max(diam) |
Qc == min(Qc) | Qc == max(Qc)
)
}) %>%
group_split() %>%
bind_rows() %>%
nrow()
expect_equal(sum(psd_no_boundaries$n), nrow(vct) - n_removed)
# Make sure boundary point cutoff is observed
# No data should be returned
psd_no_boundaries_empty <- create_PSD(
vct_files, quantile_, refracs, grid, log_base=NULL, use_data.table=TRUE,
max_boundary_proportion=0.01
)
expect_equal(nrow(psd_no_boundaries_empty), 0)
# Hourly data tests for one hour
hourly <- group_psd_by_time(psd, time_expr="1 hours")
expect_equal(length(unique(hourly$date)), 2) # 2 hours of data
seven_vct_n <- vct %>%
filter(
date >= lubridate::ymd_hms("2016-08-08 19:00:00"),
date < lubridate::ymd_hms("2016-08-08 20:00:00")
) %>% nrow()
expect_equal(
hourly %>% filter(date == lubridate::ymd_hms("2016-08-08 19:00:00")) %>% pull(n) %>% sum(),
seven_vct_n
)
seven_vct_Qc_sum <- vct %>%
filter(
date >= lubridate::ymd_hms("2016-08-08 19:00:00"),
date < lubridate::ymd_hms("2016-08-08 20:00:00")
) %>% pull(Qc) %>% sum()
expect_equal(
hourly %>% filter(date == lubridate::ymd_hms("2016-08-08 19:00:00")) %>% pull(Qc_sum) %>% sum(),
seven_vct_Qc_sum
)
# Test random rows in hourly data, slow
filter_vct_hourly <- function(vct, grid, r) {
# Find all VCT particles that match one PSD gridded data row for hourly data
selection <- (
vct$fsc_small >= grid$fsc_small[r$fsc_small_coord] &
vct$fsc_small < grid$fsc_small[r$fsc_small_coord + 1] &
vct$chl_small >= grid$chl_small[r$chl_small_coord] &
vct$chl_small < grid$chl_small[r$chl_small_coord + 1] &
vct$pe >= grid$pe[r$pe_coord] &
vct$pe < grid$pe[r$pe_coord + 1] &
vct$diam >= grid$diam[r$diam_coord] &
vct$diam < grid$diam[r$diam_coord + 1] &
vct$Qc >= grid$Qc[r$Qc_coord] &
vct$Qc < grid$Qc[r$Qc_coord + 1] &
vct$pop == r$pop &
lubridate::floor_date(vct$date, "1 hour") == r$date
)
return(vct[selection, ])
}
rows_to_test <- as.integer(nrow(hourly) * 0.10) # 10% of test gridded data
set.seed(1)
hourly_i_to_test <- sample(seq(nrow(hourly)), rows_to_test)
n_answers <- sapply(hourly_i_to_test, function(i) {
hourly[[i, "n"]] == filter_vct_hourly(vct, grid, hourly[i, ]) %>% nrow()
})
expect_true(all(n_answers))
# Qc_sum may not be exact because of floating-point imprecision, so use near
# rather than ==
Qc_sum_answers <- sapply(hourly_i_to_test, function(i) {
near(hourly[[i, "Qc_sum"]], filter_vct_hourly(vct, grid, hourly[i, ]) %>% pull(Qc) %>% sum())
})
expect_true(all(Qc_sum_answers))
tearDown(x)
})
test_that("Volume table creation", {
meta <- tibble::tibble(
date=c(
lubridate::ymd_hms("2016-08-08 19:33:41"),
lubridate::ymd_hms("2016-08-08 19:36:41"),
lubridate::ymd_hms("2016-08-08 20:33:41"),
lubridate::ymd_hms("2016-08-08 20:36:41")
),
volume=c(10000, 5000, 20000, 2000),
opp_evt_ratio=c(0.01, 0.02, 0.03, 0.04)
)
# No time aggregation
volumes <- popcycle::create_volume_table(meta, time_expr=NULL)
want <- meta %>%
mutate(
volume_virtualcore=c(250, 125, 500, 50),
volume_virtualcore_by_file=c(100, 100, 600, 80)
) %>%
select(-c(opp_evt_ratio))
expect_equal(volumes, want)
# Hourly
hourly <- popcycle::create_volume_table(meta)
want <- tibble::tibble(
date=c(
lubridate::ymd_hms("2016-08-08 19:00:00"),
lubridate::ymd_hms("2016-08-08 20:00:00")
),
volume=c(15000, 22000),
volume_virtualcore=c(375, 550),
volume_virtualcore_by_file=c(200, 680)
)
expect_equal(hourly, want)
})
test_that("Abundance calculation", {
psd <- tibble::tibble(
date=c(
lubridate::ymd_hms("2016-08-08 19:00:00"),
lubridate::ymd_hms("2016-08-08 19:00:00"),
lubridate::ymd_hms("2016-08-08 19:00:00"),
lubridate::ymd_hms("2016-08-08 20:00:00")
),
fsc_small_coord=c(1,1,2,1), pe_coord=c(1,1,1,1), chl_small_coord=c(1,1,1,1),
Qc_coord=c(1,1,1,1), diam_coord=c(1,1,1,1),
pop=c("prochloro", "unknown", "prochloro", "synecho"),
n=c(1, 2, 3, 4),
Qc_sum=c(10, 20, 40, 40)
)
volumes <- tibble::tibble(
date=c(
lubridate::ymd_hms("2016-08-08 19:00:00"),
lubridate::ymd_hms("2016-08-08 20:00:00")
),
volume=c(15000, 22000),
volume_virtualcore=c(375, 550),
volume_virtualcore_by_file=c(200, 680)
)
# No calibration to influx data
answers <- popcycle::add_abundance(psd, volumes)
want <- psd %>%
mutate(
n_per_uL=c(1 / 200, 2 / 375, 3 / 200, 4 / 550),
Qc_sum_per_uL=c(10 / 200, 20 / 375, 40 / 200, 40 / 550)
) %>%
select(-c(n, Qc_sum))
expect_equal(answers, want)
# Test calibration
calib <- tibble::tibble(pop=c("prochloro", "synecho"), a=c(2, 3))
answers <- popcycle::add_abundance(psd, volumes, calib=calib)
want <- psd %>%
mutate(
n_per_uL=c(2 * 1 / 200, 2 / 375, 2 * 3 / 200, 3 * 4 / 550),
Qc_sum_per_uL=c(2 * 10 / 200, 20 / 375, 2 * 40 / 200, 3 * 40 / 550)
) %>%
select(-c(n, Qc_sum))
expect_equal(answers, want)
})
test_that("Boundary point removal for two columns", {
df <- make_boundary_df()
answer_idx <- c(
c(2, 5, 6, 8),
c(12, 13, 16, 17, 19, 20)
)
expect_equal(
popcycle::remove_boundary_points(df, c("a", "b")),
df[answer_idx, ]
)
})
test_that("Boundary point removal for one column", {
df <- make_boundary_df()
answer_idx <- c(
c(2, 3, 4, 5, 6, 8),
c(12, 13, 15, 16, 17, 19, 20)
)
expect_equal(
popcycle::remove_boundary_points(df, c("a")),
df[answer_idx, ]
)
})
test_that("Boundary point removal, max saturated only, for two columns", {
df <- make_boundary_df()
answer_idx <- c(
c(1, 2, 3, 5, 6, 8, 10),
c(12, 13, 16, 17, 18, 19, 20)
)
expect_equal(
popcycle::remove_boundary_points(df, c("a", "b"), max_only = TRUE),
df[answer_idx, ]
)
})
test_that("Boundary point removal, max saturated only, for one column", {
df <- make_boundary_df()
answer_idx <- c(
c(1, 2, 3, 4, 5, 6, 8, 10),
c(11, 12, 13, 15, 16, 17, 18, 19, 20)
)
expect_equal(
popcycle::remove_boundary_points(df, c("a"), max_only = TRUE),
df[answer_idx, ]
)
})
test_that("Boundary point removal, error on bad column", {
df <- make_boundary_df()
expect_error(popcycle::remove_boundary_points(df, c("a", "d")))
})
test_that("Boundary point removal, error when on date column", {
df <- make_boundary_df()
df$date <- NULL
expect_error(popcycle::remove_boundary_points(df, c("a")))
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.