library(testthat)
real_data_binned_file_name <- system.file(file.path("extdata", "ZD_150bins_50sampled.Rda"), package = "NeuroDecodeR")
# test the ds_basic -----------------------------------------------------------
test_that("ds_basic() conforms to the NDR datasouce interface", {
ds <- ds_basic(real_data_binned_file_name, 'stimulus_ID', 18, 0)
expect_null(test_valid_datasource(ds))
})
test_that("ds_basic: the get_data() functions returns unique points in training and test sets (no data leakage)", {
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10)
the_data <- get_data(ds)
long_data <- the_data %>%
dplyr::select(starts_with("site")) %>%
tidyr::gather(site, activity)
expect_equal(length(unique(long_data$activity)), dim(long_data)[1])
})
test_that("ds_basic: if the labels_levels_to_use is set, only those label levels are returned", {
label_levels_to_use <- c("flower", "guitar", "kiwi")
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10, label_levels = label_levels_to_use)
the_data <- get_data(ds)
expect_equal(as.character(unique(the_data$train_labels)), label_levels_to_use)
expect_equal(as.character(unique(the_data$test_labels)), label_levels_to_use)
})
test_that("basic_ds: if only specific sites to be used, only those sites are returned", {
# use only a subset of sites
sites_to_use <- sites_to_use <- seq(1, 100, 3)
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10, site_IDs_to_use = sites_to_use)
the_data <- get_data(ds)
site_names <- the_data %>%
dplyr::select(starts_with("site")) %>%
names(.)
site_numbers <- as.numeric(stringr::str_sub(site_names, 6, 9))
expect_equal(sites_to_use, site_numbers)
# exclude a subset of sites
sites_to_exclude <- seq(2, 100, 5)
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10, site_IDs_to_exclude = sites_to_exclude)
remaining_sites <- setdiff(1:100, sites_to_exclude)
the_data <- get_data(ds)
site_names <- the_data %>%
dplyr::select(starts_with("site")) %>%
names(.)
site_numbers <- as.numeric(stringr::str_sub(site_names, 6, 9))
expect_equal(remaining_sites, site_numbers)
# use and exclude a subset of sites
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10,
site_IDs_to_use = sites_to_use, site_IDs_to_exclude = sites_to_exclude)
remaining_sites_used <- setdiff(sites_to_use, sites_to_exclude)
the_data <- get_data(ds)
site_names <- the_data %>%
dplyr::select(starts_with("site")) %>%
names(.)
site_numbers <- as.numeric(stringr::str_sub(site_names, 6, 9))
expect_equal(remaining_sites_used, site_numbers)
})
test_that("basic_ds: the correct number of resampled sites is returned", {
num_resample_sites <- 50
ds <- ds_basic("fake_binned_data.Rda", "stim_names", 10,
num_resample_sites = num_resample_sites)
the_data <- get_data(ds)
site_names <- the_data %>%
dplyr::select(starts_with("site")) %>%
names(.)
expect_equal(num_resample_sites, length(site_names))
# check that an error message is given if a negative value is provided
# for the number of resample sites
expect_error(ds_basic("fake_binned_data.Rda", "stim_names", 10,
num_resample_sites = -1))
})
test_that("basic_ds: the correct number of repeats per CV block are returned", {
num_CV <- 5
num_reps <- 2
ds <- ds_basic("fake_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps)
the_data <- get_data(ds)
total_num_labels <- the_data %>%
group_by(time_bin, train_labels) %>%
summarize(n = n())
expect_equal(total_num_labels$n, rep(num_CV * num_reps, 7 * length(unique(the_data$time_bin))))
})
test_that("basic_ds: simultaneously recorded data is returned correctly", {
# all whole numbers on each row should be the same this data when gotten simultaneously
ds <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", 10,
create_simultaneous_populations = TRUE)
the_data <- get_data(ds)
the_site_data <- the_data %>%
select(starts_with('site'))
expect_equal(sum(abs(diff(t(round(data.matrix(the_site_data)))))), 0)
# the fractional part should match the site digit names
data_as_a_matrix = data.matrix(the_site_data)
whole = floor(data_as_a_matrix)
fraction = data_as_a_matrix - whole
site_name_fraction <- as.numeric(paste0('.', substr(names(the_site_data), 6, 9)))
site_name_matrix <- matrix(rep(site_name_fraction, each = dim(the_site_data)[1]),
nrow = dim(the_site_data)[1],
ncol = dim(the_site_data)[2])
expect_lt(max(site_name_matrix - fraction), 10^-10)
# all whole numbers on each row should NOT be the same this data when data is NOT gotten simultaneously
ds <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", 10,
create_simultaneous_populations = FALSE)
the_data <- get_data(ds)
the_site_data <- the_data %>%
select(starts_with('site'))
expect_false(sum(abs(diff(t(round(data.matrix(the_site_data)))))) == 0)
# test that the simultaneous argument works even when trial number is a not variable in
# the binned_data data frame
load("fake_simultaneous_binned_data.Rda")
binned_data <- binned_data %>%
dplyr::select(-trial_number)
expect_warning(ds_basic(binned_data, "stim_names", 10,
create_simultaneous_populations = TRUE))
})
test_that("basic_ds: shuffling labels works", {
num_CV <- 5
num_reps <- 2
# check 1: when the label levels are not shuffled and we pull all 10 sites, there
# should always be the same mapping for sites and labels everytime we get a
# new set of data
ds1 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps)
unshuffled_data1 <- get_data(ds1) %>%
select(site_0001, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
ds2 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps)
unshuffled_data2 <- get_data(ds2) %>%
select(site_0001, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
expect_true(identical(unshuffled_data1, unshuffled_data2))
# check 2a: when the label levels are shuffled before running, and we pull all 10 sites,
# there be a different mapping for sites and labels everytime we get a new set
# of data
ds1 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps,
randomly_shuffled_labels = TRUE)
shuffled_data1 <- get_data(ds1) %>%
select(site_0001, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
ds2 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps,
randomly_shuffled_labels = TRUE)
shuffled_data2 <- get_data(ds2) %>%
select(site_0001, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
expect_false(identical(shuffled_data1, shuffled_data2))
# check 2b: since the shuffling happens before using the get_data() method
# a second pull should give rise to the same label site ID shuffling
shuffled_data1_second_pull <- get_data(ds1) %>%
select(site_0001, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
expect_true(identical(shuffled_data1, shuffled_data1_second_pull))
# check 3: when suffling simulatenous data, all sites should have the same
# random trial to label mapping...
ds_simul1 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps,
randomly_shuffled_labels = TRUE,
create_simultaneous_populations = TRUE)
simul_shuffled_data1 <- get_data(ds_simul1) %>%
mutate(site_0001 = round(site_0001), site_0002 = round(site_0002)) %>%
select(site_0001, site_0002, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
expect_equal(simul_shuffled_data1$site_0001, simul_shuffled_data1$site_0002)
# check 4: when one creates a new data source and get the data a second time using
# simulatenous populations the mapping should be different from the first
# time.
ds_simul2 <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps,
randomly_shuffled_labels = TRUE,
create_simultaneous_populations = TRUE)
simul_shuffled_data2 <- get_data(ds_simul2) %>%
mutate(site_0001 = round(site_0001), site_0002 = round(site_0002)) %>%
select(site_0001, site_0002, train_labels, test_labels) %>%
arrange(train_labels, test_labels, site_0001)
expect_false(identical(simul_shuffled_data1$site_0001, simul_shuffled_data2$site_0001))
})
test_that("basic_ds: different cross-validation arrangements of data are returned each time get_data() is called", {
num_CV <- 5
num_reps <- 2
ds_pseudo <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps)
data1 <- get_data(ds_pseudo)
data2 <- get_data(ds_pseudo)
expect_false(identical(data1, data2))
ds_simul <- ds_basic("fake_simultaneous_binned_data.Rda", "stim_names", num_CV,
num_label_repeats_per_cv_split = num_reps,
create_simultaneous_populations = TRUE)
data1 <- get_data(ds_simul)
data2 <- get_data(ds_simul)
expect_false(identical(data1, data2))
})
# test the ds_generalization -------------------------------------------------
# generate the training and test levels for the ds_generalization
id_levels <- c("hand", "flower", "guitar", "face", "kiwi", "couch", "car")
position_levels <- c("lower", "middle", "upper")
train_label_levels <- NULL
test_label_levels <- NULL
for (i in seq_along(id_levels)){
train_label_levels[[i]] <- c(paste(id_levels[i], "upper",sep = '_'),
paste(id_levels[i], "middle",sep = '_'))
test_label_levels[[i]] <- list(paste(id_levels[i], "lower",sep = '_'))
}
test_that("generalization_ds constructor and get data work", {
ds <- ds_generalization(real_data_binned_file_name,
'combined_ID_position', 18,
train_label_levels,
test_label_levels)
expect_null(test_valid_datasource(ds))
the_data <- get_data(ds)
# test that if the same levels are assigned to different classes this gives an
# error (otherwise there could be data leakage)
test_label_levels2 <- train_label_levels
temp <- test_label_levels2[[1]]
test_label_levels2[[1]] <- test_label_levels2[[2]]
test_label_levels2[[2]] <- temp
expect_error(ds_generalization(real_data_binned_file_name,
'combined_ID_position', 18,
train_label_levels,
test_label_levels2))
})
# test that ds_generalization leads to results at chance in baseline
# and above chance in stimulus period (more of an integration test than a unit test)
test_that("generalization_ds classification accuracy seem reasonable", {
# get firing rate data
ds <- ds_generalization(real_data_binned_file_name, 'combined_ID_position',
num_cv_splits = 3, num_label_repeats_per_cv_split = 6,
train_label_levels = train_label_levels,
test_label_levels = test_label_levels)
cv_data <- get_data(ds)
training_set <- dplyr::filter(cv_data, time_bin == "time.200_350", CV_1 == "train") %>%
select(starts_with("site"), train_labels = train_labels)
test_set <- dplyr::filter(cv_data, time_bin %in% c("time.-350_-200", "time.200_350"), CV_1 == "test") %>%
select(starts_with("site"), test_labels = test_labels, time_bin)
levels(test_set$time_bin)[levels(test_set$time_bin)=="time.-350_-200"] <- "baseline"
levels(test_set$time_bin)[levels(test_set$time_bin)=="time.200_350"] <- "stimulus"
cl <- cl_max_correlation()
# prediction_results <- get_predictions(cl, normalized_training_set, normalized_test_set)
prediction_results <- get_predictions(cl, training_set, test_set)
accuracies <- prediction_results %>%
dplyr::group_by(test_time) %>%
dplyr::summarize(mean_accuracy = mean(actual_labels == predicted_labels))
expect_gt(dplyr::filter(accuracies, test_time == "stimulus")$mean_accuracy, .49)
expect_lt(dplyr::filter(accuracies, test_time == "baseline")$mean_accuracy, .3)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.