tests/testthat/test-cleaning_functions.R

context("episcout cleaning and utility")

######################
library(episcout)
library(testthat)
library(dplyr)
library(tibble)
library(lubridate)
library(purrr)
library(stringi)
library(stringr)
library(magrittr)
library(data.table)
######################

######################
# Working directory for informal tests, should be from pkg/tests/testthat/:
# setwd('/Users/antoniob/Documents/github.dir/AntonioJBT/episcout/tests/testthat/')
######################

######################
# Test set df:
set.seed(12345)
n <- 20
df <- data.frame(var_id = rep(1:(n / 2), each = 2),
                 var_to_rep = rep(c("Pre", "Post"), n / 2),
                 x = rnorm(n),
                 y = rbinom(n, 1, 0.50),
                 z = rpois(n, 2)
                 )
# df

# And save to disk with:
# epi_write(df, 'tests/testthat/df.tsv')
# df.tsv is then used for tests below
######################

######################
# Get all duplicates:
print("Function being tested: epi_clean_get_dups")

test_that("Test expected output after epi_clean_get_dups", {
  # Should contain all the df (because each row is duplicated
  # if looking at 'var_id' only)
  check_dups <- epi_clean_get_dups(df, 'var_id', 1)
  # str(dim(check_dups))
  # check_dups
  expect_output(str(dim(check_dups)), ' 20 5')
  expect_output(str(head(check_dups)), 'var_id    : int  1 1 2 2 3 3')
  # Should be empty:
  check_dups <- epi_clean_get_dups(df, 'var_id', 2)
  # str(dim(check_dups))
  # str(check_dups)
  expect_output(str(dim(check_dups)), ' 0 5')
  expect_output(str(check_dups), '0 obs. of  5 variables:')
  }
  )
######################

######################
print("Function being tested: epi_clean_compare_dup_rows")

test_that("Test expected output after epi_clean_compare_dup_rows", {
  # Check a few duplicated individuals:
  check_dups <- epi_clean_get_dups(df, 'var_id', 1)
  val_id <- '2' # TO DO: '1' matches '1' and '10' despite fixed = TRUE
  comp <- epi_clean_compare_dup_rows(check_dups, val_id, 'var_id', 1, 2)
  # comp
  # View(t(check_dups[comp$duplicate_indices, ]))
  # View(t(check_dups[comp$duplicate_indices, comp$differing_cols]))
  expect_output(str(comp$differing_cols), ' 3 5')
  expect_output(str(comp$col_names), ' "x" "z"')
  expect_output(str(comp$duplicate_indices), ' 3 4')
  }
  )
######################

######################
print("Function being tested: epi_clean_compare_str")

test_that("Test expected output after epi_clean_compare_str", {
  # test data:
  letts <- paste(letters, collapse = ' ')
  other_letts <- toupper(paste(letters, collapse = ' '))
  df_comp <- data.frame ('sub' = rep(x = substr(letts, 1, 5), 10),
                    'str' = rep(x = substr(letts, 1, 5), 10),
                    stringsAsFactors = FALSE)
  df2_comp <- data.frame ('sub' = rep(x = substr(letts, 1, 5), 10),
                    'str' = rep(x = substr(other_letts, 6, 10), 10),
                    stringsAsFactors = FALSE)
  df3 <- rbind(df_comp, df2_comp)
  col_1 <- 'sub'
  col_2 <- 'str'
  # df3[1, c(col_1, col_2)]
  # df3
  # Test output expectations:
  expect_output(str(epi_clean_compare_str(df3, 1, col_1, col_2)), ' TRUE')
  expect_output(str(epi_clean_compare_str(df3, 11, col_1, col_2)), ' FALSE')
  }
  )
######################

######################
print("Function being tested: epi_clean_cond_numeric")

test_that("Test expected output after epi_clean_cond_numeric", {
  get_cols <- df %>%
    # select_if(is.integer) %>%
    # select_if(is.numeric) %>%
    select_if(~ epi_clean_cond_numeric(.))
  expect_output(str(head(get_cols, 1)), 'x     : num 0.586')
  expect_output(str(tail(get_cols, 1)), 'z     : int 1')
  expect_output(str(tail(get_cols, 1)), 'var_id: int 10')
  expect_output(str(epi_clean_cond_numeric(df[[2]])), 'FALSE')
  expect_output(str(epi_clean_cond_numeric(df[, 'x'])), 'TRUE')
  }
  )
######################

######################
print("Function being tested: epi_clean_cond_chr_fct")

test_that("Test expected output after epi_clean_cond_chr_fct", {
  col_chr <- data.frame('chr1' = rep(c('A', 'B')),
                        'chr2' = rep(c('C', 'D'))
                        )
  df_cont_chr <- as.tibble(cbind(df, col_chr))
  get_cols <- df_cont_chr %>% select_if(~ epi_clean_cond_chr_fct(.))
  # Tests:
  expect_output(str(head(get_cols, 1)), 'Factor w/ 2 levels "Post","Pre"')
  expect_output(str(tail(get_cols, 1)), 'Factor w/ 2 levels "C","D"')
  expect_output(str(epi_clean_cond_chr_fct(df_cont_chr[[2]])), 'TRUE')
  expect_output(str(epi_clean_cond_chr_fct(df_cont_chr[, 'x'])), 'FALSE')
  }
  )
######################

######################
print("Function being tested: epi_clean_cond_date")

test_that("Test expected output after epi_clean_cond_date", {
  df_date <- df
  df_date$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
  get_cols <- df_date %>% select_if(~ epi_clean_cond_date(.))
  # Tests:
  expect_output(str(head(get_cols, 1)), 'date_col: Date, format:')
  expect_output(str(tail(get_cols, 1)), 'date_col: Date, format:')
  expect_output(str(epi_clean_cond_date(df_date[[2]])), 'FALSE')
  expect_output(str(epi_clean_cond_date(df_date[, 'date_col'])), 'TRUE')
  }
  )
######################

######################
print("Function being tested: epi_clean_count_classes")

test_that("epi_clean_count_classes", {
  df$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
  expect_output(str(epi_clean_count_classes(df)), ' 1 1 3 1')
  expect_output(str(head(epi_clean_count_classes(df))), '"Date" "factor" "integer" "numeric"')
  }
  )
######################

######################
print("Function being tested: epi_clean_class_to_factor")

test_that("epi_clean_class_to_factor", {
  df_factor <- df
  df_factor$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)#nrow(df_factor))
  # lapply(df_factor, class)
  # lapply(df_factor, function(x) length(unique(x)))
  # Check conditions:
  i <- 'date_col'
  cutoff_unique <- 10
  # if num. of unique values is less than cut-off:
  expect_output(str(
    length(unique(df_factor[[i]])) < cutoff_unique), # should be TRUE
    'TRUE')
  # and the class is not already a date:
  expect_output(str(
    class(df_factor[[i]]) != "Date"), # should be FALSE
    'FALSE')
  # Test column class changed, should return y and z as factors, date_col as Date:
  df_factor <- epi_clean_class_to_factor(df_factor, cutoff_unique = 10)
  get_classes <- lapply(df_factor, class)
  expect_output(str(head(get_classes, 1)), 'chr "integer"')
  expect_output(str(head(get_classes, 2)), 'chr "factor"')
  expect_output(str(head(get_classes, 3)), 'chr "numeric"')
  expect_output(str(head(get_classes, 4)), 'chr "factor"')
  expect_output(str(head(get_classes, 5)), 'chr "factor"')
  expect_output(str(head(get_classes, 6)), 'chr "Date"')
  }
  )
######################

######################
print("Function being tested: epi_clean_replace_value")

test_that("epi_clean_replace_value", {
  df_factor <- df
  df_factor$date_col <- seq(as.Date("2018/1/1"), by = "year", length.out = 5)
  # Convert to character first:
  df_factor$date_col <- as.character(df_factor$date_col)
  # lapply(df_factor, class)
  patterns <- c('2018', '2022')
  # match values starting with string
  pattern <- pattern <- sprintf('^%s', patterns[1])
  # Convert first pattern to NA
  # In a separate vector:
  # convert_NA <- epi_clean_replace_value(df_factor, 'date_col', pattern, NA)
  # convert_NA
  # df_factor$date_col
  # In place:
  df_factor[['date_col']] <- epi_clean_replace_value(df_factor,
                                                     'date_col',
                                                     pattern,
                                                     NA)
  # df_factor$date_col
  expect_output(str(head(df_factor$date_col)), 'NA "2019-01-01" "2020-01-01"')
  expect_output(str(tail(df_factor$date_col)), '"2022-01-01" NA "2019-01-01"')
  }
  )
######################

######################
print("Function being tested: epi_clean_add_rep_num")

test_that("epi_clean_add_rep_num", {
  var_id <- 'var_id'
  var_to_rep <- 'var_to_rep'
  reps <- epi_clean_add_rep_num(df, 'var_id', 'var_to_rep')
  # reps
  # Sanity check:
  expect_output(str(identical(as.character(reps[[var_id]]),
            as.character(df[[var_id]]))), # should be TRUE
            'TRUE')
  # Bind:
  df2 <- as.tibble(cbind(df, 'rep_num' = reps$rep_num))
  # merge() adds all rows from both data frames as there are duplicates
  # so use cbind after making sure order is exact
  # epi_head_and_tail(df2, rows = 3, last_cols = TRUE)
  expect_output(str(head(df2)), ' x         : num  0.586 0.709 -0.109 -0.453 0.606')
  expect_output(str(head(df2)), 'rep_num   : num  1 2 1 2 1 2')
  expect_output(str(tail(df2)), 'rep_num   : num  1 2 1 2 1 2')
  }
  )
######################

######################
print("Function being tested: epi_clean_add_colname_suffix")

test_that("epi_clean_add_colname_suffix", {
  # names(df)
  # Add .0 as suffix to all column names starting from column 2 (skip id col):
  id_col <- 1
  new_colnames <- epi_clean_add_colname_suffix(df, id_col, '.0')
  # new_colnames
  # Rename them in my data frame:
  names(df)[-id_col] <- new_colnames
  # names(df)
  expect_output(str(names(df)), '"var_id" "var_to_rep.0" "x.0" "y.0"')
  }
  )
######################

######################
print("Function being tested: epi_clean_spread_repeated")

test_that("epi_clean_spread_repeated", {
  df_spread <- epi_clean_spread_repeated(df, 'var_to_rep', 1)
  # df_spread
  expect_output(str(df_spread[1]), '0.586 -0.109 0.606 0.63 -0.284')
  expect_output(str(df_spread[1]), '1 1 0 0 0 1 0 1 1 0')
  expect_output(str(df_spread[2]), '0.709 -0.453 -1.818 -0.276 -0.919')
  expect_output(str(df_spread[2]), '1 3 1 1 4 2 1 2 2 1')
  }
  )
######################

######################
print("Function being tested: epi_clean_merge_nested_dfs")

test_that("epi_clean_merge_nested_dfs", {
  # Create a nested list of dataframes using the repeated measurements variable:
  df_spread <- epi_clean_spread_repeated(df, 'var_to_rep', 1)
  # Returns a nested list
  # Run an example with epi_clean_merge_nested_dfs()
  # to create a single dataframe with repeated observations spread and
  # no duplicate IDs (create a wide instead of a long dataframe):
  nested_list_dfs <- purrr::flatten(list(df_spread, df_spread, df_spread))
  id_col <- 'var_id'
  # epi_list_head(nested_list_dfs, 2, 3)
  # epi_list_tail(nested_list_dfs, 2, 3)
  all_merged <- epi_clean_merge_nested_dfs(nested_list_dfs, id_col)
  # dim(all_merged)
  # as.tibble(all_merged)
  # names(all_merged)
  # str(all_merged)
  # epi_head_and_tail(all_merged, rows = 5, cols = 3)
  expect_output(str(all_merged), '10 obs. of  25 variables')
  expect_output(str(all_merged), '0.586 -0.109 0.606 0.63 -0.284') # x.Pre
  expect_output(str(all_merged), '0.709 -0.453 -1.818 -0.276 -0.919') # x.Post_4
  expect_output(str(all_merged), '1 3 1 1 4 2 1 2 2 1') # z.Post_6
  }
  )
######################


######################
print("Function being tested: epi_clean_transpose")

test_that("epi_clean_transpose", {
  df$id_col <- rownames(df)
  # df
  id_col <- 6
  df_t <- epi_clean_transpose(df, id_col)
  # class(df_t)
  # dim(df)
  # dim(df_t)
  # df_t
  # names(df_t)
  # df_t[, 1] # should contain the original column headers
  expect_output(str(class(df_t)), 'data.frame')
  expect_output(str(dim(df)), '20 6')
  expect_output(str(dim(df_t)), '5 21')
  expect_output(str(names(df_t)), '"V1" "1" "2" "3" "4" "5" "6" "7"')
  expect_output(str(df_t[, 1]), '"var_to_rep" "x" "y" "z" "id_col"')
  expect_output(str(epi_head_and_tail(df_t)), 'chr  "1" "Pre" "0.585528817843856" "1"')
  expect_output(str(epi_head_and_tail(df_t)), 'chr  "2" "Post" "-0.453497173462763" "1" ...')
  }
  )
######################

######################
print("Function being tested: epi_clean_make_names")

test_that("epi_clean_make_names", {
  string <- c('mean', '.j_j', '...', 'if',
              'while', 'TRUE', 'NULL', '_jj',
              '  j', '.2way'
              )
  valid_names <- epi_clean_make_names(string)
  # valid_names
  str_ref <- c('mean', 'X_j_j', 'X___', 'if_',
               'while_', 'TRUE_', 'NULL_', 'X_jj',
               'X__j', 'X_2way'
               )
  expect_identical(valid_names, str_ref)
  }
)
######################
AntonioJBT/episcout documentation built on April 28, 2024, 2:03 p.m.