context("episcout cleaning and utility")
######################
library(episcout)
library(testthat)
library(dplyr)
library(tibble)
library(lubridate)
library(purrr)
library(stringi)
library(stringr)
library(magrittr)
library(data.table)
######################
######################
# Working directory for informal tests, should be from pkg/tests/testthat/:
# setwd('/Users/antoniob/Documents/github.dir/AntonioJBT/episcout/tests/testthat/')
######################
######################
# Test set df:
set.seed(12345)
n <- 20
df <- data.frame(var_id = rep(1:(n / 2), each = 2),
var_to_rep = rep(c("Pre", "Post"), n / 2),
x = rnorm(n),
y = rbinom(n, 1, 0.50),
z = rpois(n, 2)
)
# df
# And save to disk with:
# epi_write(df, 'tests/testthat/df.tsv')
# df.tsv is then used for tests below
######################
######################
# Get all duplicates:
print("Function being tested: epi_clean_get_dups")
test_that("Test expected output after epi_clean_get_dups", {
# Should contain all the df (because each row is duplicated
# if looking at 'var_id' only)
check_dups <- epi_clean_get_dups(df, 'var_id', 1)
# str(dim(check_dups))
# check_dups
expect_output(str(dim(check_dups)), ' 20 5')
expect_output(str(head(check_dups)), 'var_id : int 1 1 2 2 3 3')
# Should be empty:
check_dups <- epi_clean_get_dups(df, 'var_id', 2)
# str(dim(check_dups))
# str(check_dups)
expect_output(str(dim(check_dups)), ' 0 5')
expect_output(str(check_dups), '0 obs. of 5 variables:')
}
)
######################
######################
print("Function being tested: epi_clean_compare_dup_rows")
test_that("Test expected output after epi_clean_compare_dup_rows", {
# Check a few duplicated individuals:
check_dups <- epi_clean_get_dups(df, 'var_id', 1)
val_id <- '2' # TO DO: '1' matches '1' and '10' despite fixed = TRUE
comp <- epi_clean_compare_dup_rows(check_dups, val_id, 'var_id', 1, 2)
# comp
# View(t(check_dups[comp$duplicate_indices, ]))
# View(t(check_dups[comp$duplicate_indices, comp$differing_cols]))
expect_output(str(comp$differing_cols), ' 3 5')
expect_output(str(comp$col_names), ' "x" "z"')
expect_output(str(comp$duplicate_indices), ' 3 4')
}
)
######################
######################
print("Function being tested: epi_clean_compare_str")
test_that("Test expected output after epi_clean_compare_str", {
# test data:
letts <- paste(letters, collapse = ' ')
other_letts <- toupper(paste(letters, collapse = ' '))
df_comp <- data.frame ('sub' = rep(x = substr(letts, 1, 5), 10),
'str' = rep(x = substr(letts, 1, 5), 10),
stringsAsFactors = FALSE)
df2_comp <- data.frame ('sub' = rep(x = substr(letts, 1, 5), 10),
'str' = rep(x = substr(other_letts, 6, 10), 10),
stringsAsFactors = FALSE)
df3 <- rbind(df_comp, df2_comp)
col_1 <- 'sub'
col_2 <- 'str'
# df3[1, c(col_1, col_2)]
# df3
# Test output expectations:
expect_output(str(epi_clean_compare_str(df3, 1, col_1, col_2)), ' TRUE')
expect_output(str(epi_clean_compare_str(df3, 11, col_1, col_2)), ' FALSE')
}
)
######################
######################
print("Function being tested: epi_clean_cond_numeric")
test_that("Test expected output after epi_clean_cond_numeric", {
get_cols <- df %>%
# select_if(is.integer) %>%
# select_if(is.numeric) %>%
select_if(~ epi_clean_cond_numeric(.))
expect_output(str(head(get_cols, 1)), 'x : num 0.586')
expect_output(str(tail(get_cols, 1)), 'z : int 1')
expect_output(str(tail(get_cols, 1)), 'var_id: int 10')
expect_output(str(epi_clean_cond_numeric(df[[2]])), 'FALSE')
expect_output(str(epi_clean_cond_numeric(df[, 'x'])), 'TRUE')
}
)
######################
######################
print("Function being tested: epi_clean_cond_chr_fct")
test_that("Test expected output after epi_clean_cond_chr_fct", {
col_chr <- data.frame('chr1' = rep(c('A', 'B')),
'chr2' = rep(c('C', 'D'))
)
df_cont_chr <- as.tibble(cbind(df, col_chr))
get_cols <- df_cont_chr %>% select_if(~ epi_clean_cond_chr_fct(.))
# Tests:
expect_output(str(head(get_cols, 1)), 'Factor w/ 2 levels "Post","Pre"')
expect_output(str(tail(get_cols, 1)), 'Factor w/ 2 levels "C","D"')
expect_output(str(epi_clean_cond_chr_fct(df_cont_chr[[2]])), 'TRUE')
expect_output(str(epi_clean_cond_chr_fct(df_cont_chr[, 'x'])), 'FALSE')
}
)
######################
######################
print("Function being tested: epi_clean_cond_date")
test_that("Test expected output after epi_clean_cond_date", {
df_date <- df
df_date$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
get_cols <- df_date %>% select_if(~ epi_clean_cond_date(.))
# Tests:
expect_output(str(head(get_cols, 1)), 'date_col: Date, format:')
expect_output(str(tail(get_cols, 1)), 'date_col: Date, format:')
expect_output(str(epi_clean_cond_date(df_date[[2]])), 'FALSE')
expect_output(str(epi_clean_cond_date(df_date[, 'date_col'])), 'TRUE')
}
)
######################
######################
print("Function being tested: epi_clean_count_classes")
test_that("epi_clean_count_classes", {
df$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
expect_output(str(epi_clean_count_classes(df)), ' 1 1 3 1')
expect_output(str(head(epi_clean_count_classes(df))), '"Date" "factor" "integer" "numeric"')
}
)
######################
######################
print("Function being tested: epi_clean_class_to_factor")
test_that("epi_clean_class_to_factor", {
df_factor <- df
df_factor$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)#nrow(df_factor))
# lapply(df_factor, class)
# lapply(df_factor, function(x) length(unique(x)))
# Check conditions:
i <- 'date_col'
cutoff_unique <- 10
# if num. of unique values is less than cut-off:
expect_output(str(
length(unique(df_factor[[i]])) < cutoff_unique), # should be TRUE
'TRUE')
# and the class is not already a date:
expect_output(str(
class(df_factor[[i]]) != "Date"), # should be FALSE
'FALSE')
# Test column class changed, should return y and z as factors, date_col as Date:
df_factor <- epi_clean_class_to_factor(df_factor, cutoff_unique = 10)
get_classes <- lapply(df_factor, class)
expect_output(str(head(get_classes, 1)), 'chr "integer"')
expect_output(str(head(get_classes, 2)), 'chr "factor"')
expect_output(str(head(get_classes, 3)), 'chr "numeric"')
expect_output(str(head(get_classes, 4)), 'chr "factor"')
expect_output(str(head(get_classes, 5)), 'chr "factor"')
expect_output(str(head(get_classes, 6)), 'chr "Date"')
}
)
######################
######################
print("Function being tested: epi_clean_replace_value")
test_that("epi_clean_replace_value", {
df_factor <- df
df_factor$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
# Convert to character first:
df_factor$date_col <- as.character(df_factor$date_col)
# lapply(df_factor, class)
patterns <- c('2018', '2022')
# match values starting with string
pattern <- pattern <- sprintf('^%s', patterns[1])
# Convert first pattern to NA
# In a separate vector:
# convert_NA <- epi_clean_replace_value(df_factor, 'date_col', pattern, NA)
# convert_NA
# df_factor$date_col
# In place:
df_factor[['date_col']] <- epi_clean_replace_value(df_factor,
'date_col',
pattern,
NA)
# df_factor$date_col
expect_output(str(head(df_factor$date_col)), 'NA "2019-01-01" "2020-01-01"')
expect_output(str(tail(df_factor$date_col)), '"2022-01-01" NA "2019-01-01"')
}
)
######################
######################
print("Function being tested: epi_clean_add_rep_num")
test_that("epi_clean_add_rep_num", {
var_id <- 'var_id'
var_to_rep <- 'var_to_rep'
reps <- epi_clean_add_rep_num(df, 'var_id', 'var_to_rep')
# reps
# Sanity check:
expect_output(str(identical(as.character(reps[[var_id]]),
as.character(df[[var_id]]))), # should be TRUE
'TRUE')
# Bind:
df2 <- as.tibble(cbind(df, 'rep_num' = reps$rep_num))
# merge() adds all rows from both data frames as there are duplicates
# so use cbind after making sure order is exact
# epi_head_and_tail(df2, rows = 3, last_cols = TRUE)
expect_output(str(head(df2)), ' x : num 0.586 0.709 -0.109 -0.453 0.606')
expect_output(str(head(df2)), 'rep_num : num 1 2 1 2 1 2')
expect_output(str(tail(df2)), 'rep_num : num 1 2 1 2 1 2')
}
)
######################
######################
print("Function being tested: epi_clean_add_colname_suffix")
test_that("epi_clean_add_colname_suffix", {
# names(df)
# Add .0 as suffix to all column names starting from column 2 (skip id col):
id_col <- 1
new_colnames <- epi_clean_add_colname_suffix(df, id_col, '.0')
# new_colnames
# Rename them in my data frame:
names(df)[-id_col] <- new_colnames
# names(df)
expect_output(str(names(df)), '"var_id" "var_to_rep.0" "x.0" "y.0"')
}
)
######################
######################
print("Function being tested: epi_clean_spread_repeated")
test_that("epi_clean_spread_repeated", {
df_spread <- epi_clean_spread_repeated(df, 'var_to_rep', 1)
# df_spread
expect_output(str(df_spread[1]), '0.586 -0.109 0.606 0.63 -0.284')
expect_output(str(df_spread[1]), '1 1 0 0 0 1 0 1 1 0')
expect_output(str(df_spread[2]), '0.709 -0.453 -1.818 -0.276 -0.919')
expect_output(str(df_spread[2]), '1 3 1 1 4 2 1 2 2 1')
}
)
######################
######################
print("Function being tested: epi_clean_merge_nested_dfs")
test_that("epi_clean_merge_nested_dfs", {
# Create a nested list of dataframes using the repeated measurements variable:
df_spread <- epi_clean_spread_repeated(df, 'var_to_rep', 1)
# Returns a nested list
# Run an example with epi_clean_merge_nested_dfs()
# to create a single dataframe with repeated observations spread and
# no duplicate IDs (create a wide instead of a long dataframe):
nested_list_dfs <- purrr::flatten(list(df_spread, df_spread, df_spread))
id_col <- 'var_id'
# epi_list_head(nested_list_dfs, 2, 3)
# epi_list_tail(nested_list_dfs, 2, 3)
all_merged <- epi_clean_merge_nested_dfs(nested_list_dfs, id_col)
# dim(all_merged)
# as.tibble(all_merged)
# names(all_merged)
# str(all_merged)
# epi_head_and_tail(all_merged, rows = 5, cols = 3)
expect_output(str(all_merged), '10 obs. of 25 variables')
expect_output(str(all_merged), '0.586 -0.109 0.606 0.63 -0.284') # x.Pre
expect_output(str(all_merged), '0.709 -0.453 -1.818 -0.276 -0.919') # x.Post_4
expect_output(str(all_merged), '1 3 1 1 4 2 1 2 2 1') # z.Post_6
}
)
######################
######################
print("Function being tested: epi_clean_transpose")
test_that("epi_clean_transpose", {
df$id_col <- rownames(df)
# df
id_col <- 6
df_t <- epi_clean_transpose(df, id_col)
# class(df_t)
# dim(df)
# dim(df_t)
# df_t
# names(df_t)
# df_t[, 1] # should contain the original column headers
expect_output(str(class(df_t)), 'data.frame')
expect_output(str(dim(df)), '20 6')
expect_output(str(dim(df_t)), '5 21')
expect_output(str(names(df_t)), '"V1" "1" "2" "3" "4" "5" "6" "7"')
expect_output(str(df_t[, 1]), '"var_to_rep" "x" "y" "z" "id_col"')
expect_output(str(epi_head_and_tail(df_t)), 'chr "1" "Pre" "0.585528817843856" "1"')
expect_output(str(epi_head_and_tail(df_t)), 'chr "2" "Post" "-0.453497173462763" "1" ...')
}
)
######################
######################
print("Function being tested: epi_clean_make_names")
test_that("epi_clean_make_names", {
string <- c('mean', '.j_j', '...', 'if',
'while', 'TRUE', 'NULL', '_jj',
' j', '.2way'
)
valid_names <- epi_clean_make_names(string)
# valid_names
str_ref <- c('mean', 'X_j_j', 'X___', 'if_',
'while_', 'TRUE_', 'NULL_', 'X_jj',
'X__j', 'X_2way'
)
expect_identical(valid_names, str_ref)
}
)
######################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.