scrutiny: Error Detection in Science

# Original function by Allard ---------------------------------------------

aGrimmer <- function(n, mean, SD, decimals_mean = 2, decimals_SD = 2){

  # if(n>10^decimals_mean){
  #   print("The sample size is too big compared to the precision of the reported mean, it is not possible to apply GRIM.")
  # }

  #Applies the GRIM test, and computes the possible mean.

  sum <- mean*n
  realsum <- round(sum)
  realmean <- realsum/n



  # Creates functions to round a number consistently up or down, when the last digit is 5

  round_down <- function(number, decimals=2){
    is_five <- number*10^(decimals+1)-floor(number*10^(decimals))*10
    number_rounded <- ifelse(is_five==5, floor(number*10^decimals)/10^decimals, round(number, digits = decimals))
    return(number_rounded)
  }

  round_up <- function(number, decimals=2){
    is_five <- number*10^(decimals+1)-floor(number*10^(decimals))*10
    number_rounded <- ifelse(is_five==5, ceiling(number*10^decimals)/10^decimals, round(number, digits = decimals))
    return(number_rounded)
  }

  # Applies the GRIM test, to see whether the reconstituted mean is the same as the reported mean (with both down and up rounding)

  consistency_down <- round_down(number = realmean, decimals = decimals_mean)==mean
  consistency_up <- round_up(number = realmean, decimals = decimals_mean)==mean

  if(consistency_down+consistency_up==0){
    return("GRIM inconsistent")
  }


  #Computes the lower and upper bounds for the sd.

  Lsigma <- ifelse(SD<5/(10^decimals_SD), 0, SD-5/(10^decimals_SD))
  Usigma <- SD+5/(10^decimals_SD)

  #Computes the lower and upper bounds for the sum of squares of items.

  Lowerbound <- (n-1)*Lsigma^2+n*realmean^2
  Upperbound <- (n-1)*Usigma^2+n*realmean^2

  #Checks that there is at least an integer between the lower and upperbound

  FirstTest<- ifelse(ceiling(Lowerbound)>floor(Upperbound), FALSE, TRUE)

  if(FirstTest==FALSE){
    return("GRIMMER inconsistent (test 1)")
  }

  #Takes a vector of all the integers between the lowerbound and upperbound

  Possible_Integers <- ceiling(Lowerbound):floor(Upperbound)

  #Creates the predicted variance and sd

  Predicted_Variance <- (Possible_Integers-n*realmean^2)/(n-1)
  Predicted_SD <- sqrt(Predicted_Variance)

  #Computes whether one Predicted_SD matches the SD (trying to round both down and up)

  Rounded_SD_down <- round_down(Predicted_SD, decimals_SD)
  Rounded_SD_up <- round_up(Predicted_SD, decimals_SD)

  Matches_SD <- Rounded_SD_down==SD | Rounded_SD_up==SD

  if(sum(Matches_SD)==0){
    return("GRIMMER inconsistent (test 2)")
  }

  #Computes first whether there is any integer between lower and upper bound, and then whether there is
  #an integer of the correct oddness between the lower and upper bounds.
  oddness <- realsum%%2
  Matches_Oddness <- Possible_Integers%%2==oddness
  Third_Test <- Matches_SD&Matches_Oddness
  return(ifelse(
    sum(Third_Test)==0, "GRIMMER inconsistent (test 3)", "The mean and SD are consistent.")
  )
}


# Modified function in rsprite2 -------------------------------------------

# (Note the few changes I made here, explained at the appropriate places within
# the function in comments starting on "IN SCRUTINY".)

GRIMMER_test <- function(mean, sd, n_obs, m_prec = NULL, sd_prec = NULL, n_items = 1, min_val = NULL, max_val = NULL) {
  if (is.null(m_prec)) {
    m_prec <- max(nchar(sub("^[0-9]*", "", mean)) - 1, 0)
  }

  if (is.null(sd_prec)) {
    sd_prec <- max(nchar(sub("^[0-9]*", "", sd)) - 1, 0)
  }

  # IN SCRUTINY: removed calls to functions from the checkmate package --
  # scrutiny shouldn't depend on it, and they only checked input formats that
  # were a given here anyway.

  effective_n = n_obs * n_items

  # Applies the GRIM test, and computes the possible mean.
  sum <- mean * effective_n
  realsum <- round(sum)
  realmean <- realsum / effective_n

  #Checks whether mean and SD are within possible range
  if (!is.null(min_val) & !is.null(max_val)) {
    if (mean < min_val | mean > max_val) {
      warning("The mean must be between the scale minimum and maximum")
      return(FALSE)
    }
    sd_limits <- .sd_limits(n_obs, mean, min_val, max_val, sd_prec, n_items)
    if (sd < sd_limits[1] | sd > sd_limits[2]) {
      warning("Given the scale minimum and maximum, the standard deviation has to be between ", sd_limits[1], " and ", sd_limits[2], ".")
      return(FALSE)
    }
  }
  # Creates functions to round a number consistently up or down, when the last digit is 5
  round_down <- function(number, decimals = 2) {
    to_round <- number * 10^(decimals + 1) - floor(number * 10^(decimals)) * 10
    number_rounded <- ifelse(to_round == 5,
                             floor(number * 10^decimals) / 10^decimals,
                             round(number, digits = decimals))
    return(number_rounded)
  }

  round_up <- function(number, decimals = 2) {
    to_round <- number * 10^(decimals + 1) - floor(number * 10^(decimals)) * 10
    number_rounded <- ifelse(to_round == 5,
                             ceiling(number * 10^decimals) / 10^decimals,
                             round(number, digits = decimals))
    return(number_rounded)
  }

  # Applies the GRIM test, to see whether the reconstituted mean is the same as the reported mean (with both down and up rounding)

  consistent_down <- round_down(number = realmean, decimals = m_prec) == mean
  consistent_up <- round_up(number = realmean, decimals = m_prec) == mean

  if (!consistent_down & !consistent_up) {
    # IN SCRUTINY: outcommented the below warning (and turned it into a message)
    # that was thrown when the inputs were GRIM-inconsistent. I think one
    # inconsistency should (essentially) be treated like any other, and such a
    # warning is not desirable when testing. It can be incommented to check when
    # this function thinks the inputs are GRIM-inconsistent!

    # message("GRIM inconsistent - so GRIMMER test cannot be run. See ?GRIM_test")
    return(FALSE)
  }

  # Computes the lower and upper bounds for the sd.

  Lsigma <- ifelse(sd < 5 / (10^(sd_prec+1)), 0, sd - 5 / (10^(sd_prec+1)))
  Usigma <- sd + 5 / (10^(sd_prec+1))

  # Computes the lower and upper bounds for the sum of squares of items.

  lower_bound <- ((n_obs - 1) * Lsigma^2 + n_obs * realmean^2)*n_items^2
  upper_bound <- ((n_obs - 1) * Usigma^2 + n_obs * realmean^2)*n_items^2

  # Checks that there is at least an integer between the lower and upperbound

  if (ceiling(lower_bound) > floor(upper_bound)) {
    # # IN SCRUTINY: added message
    # message("Failed test 1")
    return(FALSE)
  }

  # Takes a vector of all the integers between the lowerbound and upperbound

  possible_integers <- ceiling(lower_bound):floor(upper_bound)

  # Creates the predicted variance and sd

  Predicted_Variance <- (possible_integers/n_items^2 - n_obs * realmean^2) / (n_obs - 1)
  Predicted_SD <- sqrt(Predicted_Variance)

  # Computes whether one Predicted_SD matches the SD (trying to round both down and up)

  Rounded_SD_down <- round_down(Predicted_SD, sd_prec)
  Rounded_SD_up <- round_up(Predicted_SD, sd_prec)

  Matches_SD <- Rounded_SD_down == sd | Rounded_SD_up == sd

  if (!any(Matches_SD)) {
    # # IN SCRUTINY: added message
    # message("Failed test 2")
    return(FALSE)
  }

  # Computes whether there is an integer of the correct oddness between the lower and upper bounds.
  oddness <- realsum %% 2
  Matches_Oddness <- possible_integers %% 2 == oddness

  if (!any(Matches_SD & Matches_Oddness)) {
    # # IN SCRUTINY: added message
    # message("Failed test 3")
    return(FALSE)
  }

  return(TRUE)
}



# Preparations ------------------------------------------------------------

tested_cases_orig <- 7500

# Randomly generating a great number of values in the first step leaves nearly
# as many values with exactly 2 decimal places in the second. There are the
# first values by that number greater than 1 that have exactly two decimal
# places and where the second decimal place is not zero (so it counts as a
# decimal place even without a string transformation):
df1_mean <- seq(1, length.out = tested_cases_orig, by = 0.01)
df1_mean <- df1_mean[decimal_places(df1_mean) == 2]

length(df1_mean)

# Random `n` values with the same number as the mean values, truncated because
# they can only be whole numbers:
df1_n <- runif(length(df1_mean), 10, 150)
df1_n <- trunc(df1_n)

# Create an example data frame:
df1 <- tibble::tibble(
  n    = df1_n,
  mean = as.character(df1_mean),
  sd   = as.character(round(as.numeric(mean) * (2 / 5), 2))
)

# # The same data frame but with a different name for the `sd` column; this is
# # just due to a naming difference between the two functions:
df1 <- df1 %>%
  dplyr::rename(SD = sd) %>%
  dplyr::mutate(mean = as.numeric(mean), SD = as.numeric(SD))

df2 <- df1

# Helper that turns the original function's string output into `TRUE` if
# consistent and `FALSE` if inconsistent:
as_logical_consistency <- function(x) {
  !stringr::str_detect(x, "inconsistent")
}



names(df1) <- c("n", "x", "sd")
df1 <- df1 %>%
  dplyr::mutate(
    x  = restore_zeros(x, width = 2),
    sd = restore_zeros(sd, width = 2)
  )



# Testing -----------------------------------------------------------------

# Apply both functions, modified and original, to data frames containing the
# same data but (possibly) different column names:

start1 <- Sys.time()
out1 <- purrr::pmap_lgl(df1, grimmer_scalar)
end1 <- Sys.time()
diff1 <- difftime(end1, start1, units = "secs")
# message("\nApplying `grimmer_scalar()` took:\n", round(diff1, 2), " seconds\n")

start2 <- Sys.time()
out2 <- purrr::pmap_chr(df2, aGrimmer)
end2 <- Sys.time()
diff2 <- difftime(end2, start2, units = "secs")
# message("Applying `aGrimmer()` took:\n", round(diff2, 2), " seconds\n")

# Convert the original function's string output to logical so that it will be
# comparable to scrutiny-style Boolean output:
out2 <- as_logical_consistency(out2)


df_out <- tibble::tibble(df1, out1, out2)
df_out <- dplyr::mutate(df_out, digits_sd = decimal_places(sd))

# The problem seems to be restricted to cases where `out1` is consistent and
# `out2` is not, and where `n` is either `40` or `80`:
df_disagree <- df_out %>%
  dplyr::filter(out1 != out2)


df_disagree


disagree_rate <- nrow(df_disagree) / nrow(df_out)

disagree_rate

# message("The rate of disagreement between implementations is ", round(disagree_rate, 2))


df_disagree_out1_true <- df_disagree %>%
  dplyr::filter(out1)

# Proportion of cases within the disagreements where `grimmer_scalar()` thinks
# the inputs are consistent but `aGrimmer()` thinks they are not:
disagree_new_impl_true_rate <- nrow(df_disagree_out1_true) / nrow(df_disagree)

disagree_new_impl_true_rate


# The reason behind this test is that `grimmer_scalar()`, and thereby all of
# scrutiny's GRIMMER implementation, is somewhat different from the original
# `aGrimmer()` function -- circa 70 percent of the disagreements are due to
# `grimmer_scalar()` being more lenient. The overall rate of disagreement
# revolves around 0.3 percent, but due to the element of randomness and the
# relatively low number of tested cases (equal to `nrow(df_out)`), the test only
# requires the rate to be below 5 percent. This will be met absent some
# significant changes in `grimmer_scalar()`. In such a situation, it would be
# better for the test to fail.
test_that("the two functions disagree on less than 3 percent of cases", {
  disagree_rate %>% expect_lt(0.05)
})



# Resolve disagreements ---------------------------------------------------

# TODO: resolve disagreements between the implementations! Here are the only
# disagreements that occurred in hundreds of thousands of simulated test cases:
# (Note that `out1` is the result of `grimmer_scalar()`, and `out2` of
# `aGrimmer()`. Most important is that `n` is always 40 or 80!)
c(n = "40", x = "16.03",  sd = "6.41",   out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "40", x = "64.73",  sd = "25.89",  out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "80", x = "64.73",  sd = "25.89",  out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "80", x = "32.68",  sd = "13.07",  out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "40", x = "64.27",  sd = "25.71",  out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "80", x = "16.22",  sd = "6.49",   out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "40", x = "256.03", sd = "102.41", out1 = "TRUE", out2 = "FALSE", digits_sd = "2")
c(n = "40", x = "519.93", sd = "207.97", out1 = "TRUE", out2 = "FALSE", digits_sd = "2")

# # Use this to get a vector such as above:
# df_disagree %>% dplyr::slice(1) %>% unlist() %>% constructive::construct(one_liner = TRUE)

# Here they are in tibble form. Run `GRIMMER_test()` on them and see whether
# this is all due to GRIM's 40/80 leniency!
df_disagree_all <- tibble::tibble(
  n = c(40, 40, 80, 80, 40, 80, 40, 40, 40, 40, 80, 40, 40, 40),
  x = c(
    "16.03", "64.73", "64.73", "32.68", "64.27", "16.22", "256.03", "519.93",
    "32.32", "256.03", "512.33", "512.93", "513.07", "518.93"
  ),
  sd = c(
    "6.41", "25.89", "25.89", "13.07", "25.71", "6.49", "102.41", "207.97",
    "12.93", "102.41", "204.93", "205.17", "205.23", "207.57"
  ),
) %>%
  dplyr::relocate(x, sd, n) %>%
  dplyr::mutate(n = as.numeric(n))

# See if there are warnings about GRIM (!) when mapping `GRIMMER_test()`:
df_disagree_all %>%
  dplyr::rename(mean = x, n_obs = n) %>%
  dplyr::mutate(mean = as.numeric(mean), sd = as.numeric(sd)) %>%
  purrr::pmap(GRIMMER_test)



# New implementation from rsprite2 ----------------------------------------

test_that("GRIMMER works correctly by default", {
  expect_true(grimmer_scalar("5.21", "1.6", 28))
  expect_false(grimmer_scalar("3.44", "2.47", 18))
})

test_that("GRIMMER works correctly when compared to the rsprite2 implementation", {
  grimmer_scalar("1.2", "0.3",  57) %>% expect_equal(GRIMMER_test(1.2, 0.3,  57))
  grimmer_scalar("8.3", "7.5", 103) %>% expect_equal(GRIMMER_test(8.3, 7.5, 103))

  # Dealing with test-3 inconsistencies:
  grimmer_scalar("5.23", "2.55", 35)  %>% expect_equal(GRIMMER_test(5.23, 2.55, 35))
  grimmer_scalar("5.23", "2.55", 127) %>% expect_equal(GRIMMER_test(5.23, 2.55, 127))
  grimmer_scalar("5.2" , "2.5" , 35)  %>% expect_equal(GRIMMER_test(5.2 , 2.5 , 35))

  # This value set is from `pigs5`. It used to be flagged as a test-3
  # inconsistency by `grimmer_scalar()`, but it is consistent according to both
  # the new version and rsprite2:
  grimmer_scalar("2.57", "2.57", 30) %>% expect_equal(GRIMMER_test(2.57, 2.57, 30))

  # Some finer variations:
  grimmer_scalar("3.756", "4.485", 89) %>% expect_equal(GRIMMER_test(3.756, 4.485, 89))
  grimmer_scalar("3.756", "4.485", 12) %>% expect_equal(GRIMMER_test(3.756, 4.485, 12))
  grimmer_scalar("3.75",  "4.48",  12) %>% expect_equal(GRIMMER_test(3.75, 4.48, 12))
  grimmer_scalar("3.75",  "4.48",  89) %>% expect_equal(GRIMMER_test(3.75, 4.48, 89))
})

test_that("GRIMMER works correctly with `items = 2`", {
  grimmer_scalar("5.21", "1.60", 28, items = 2) %>% expect_equal(GRIMMER_test(5.21, 1.6 , 28, n_items = 2))
  grimmer_scalar("3.44", "2.47", 18, items = 2) %>% expect_equal(GRIMMER_test(3.44, 2.47, 18, n_items = 2))
})

test_that("GRIMMER works correctly with `items = 3`", {
  grimmer_scalar("5.21", "1.60", 28, items = 3) %>% expect_equal(GRIMMER_test(5.21, 1.6 , 28, n_items = 3))
  grimmer_scalar("3.44", "2.47", 18, items = 3) %>% expect_equal(GRIMMER_test(3.44, 2.47, 18, n_items = 3))
})



# test_that("sd_bounds_measure works", {
#   expect_equal(c(.45, 3.03), sd_bounds_measure(n = 5, x = 4.2, min_val = 1, max_val = 7, sd_prec = 2))
#   expect_equal(c(.27, 3.03), sd_bounds_measure(n = 5, x = 4.2, min_val = 1, max_val = 7, sd_prec = 2, items = 2))
#   expect_equal(c(0, 0), sd_bounds_measure(n = 100, x = 1, min_val = 1, max_val = 7))
# })