special-scripts/grimmer-rsprite2-original/core-functions.R

#' Define parameters for SPRITE algorithm
#'
#' The SPRITE algorithm aims to construct possible distributions that conform to
#' observed/reported parameters. This function performs some checks and returns a list of these
#' parameters that can then be passed to the functions that actually generate
#' the distributions (e.g. \code{\link{find_possible_distribution}})
#'
#' Restrictions can be used to define how often a specific value should appear in the sample.
#' They need to be passed as a list in the form `value = frequency`. Thus, to specify that
#' there should be no 3s and five 4s in the distribution, you would pass
#' `restrictions_exact = list("3" = 0, "4" = 5)`. To specify that there should be at least
#' one 1 and one 7, you would pass `restrictions_minimum = list("1" = 1, "7" = 1)`. If you just want to
#' specify that the minimum and maximum values appear at least once (for instance when they are the
#' reported rather than possible range), you can use the shortcut `restrictions_minimum = "range"`. Finally,
#' if you work with multi-item scales that result in decimal responses, round the values used as names to two decimal places, e.g.,
#' when `n_items = 3` you could specify `list("1.67" = 0)`.
#'
#' @param mean The mean of the distribution
#' @param sd The standard deviation of the distribution
#' @param n_obs The number of observations (sample size)
#' @param min_val The minimum value
#' @param max_val The maximum value
#' @param m_prec The precision of the mean, as the number of digits after the decimal point.
#' If not provided, it is inferred from the number of decimal places of `mean` - so it only needs to be specified if the reported mean ends in 0.
#' @param sd_prec The precision of the standard deviation, again only needed if the
#' reported standard deviation ends in 0.
#' @param n_items Number of items in scale, if distribution represents scale averages.
#' Defaults to 1, which represents any single-item measure.
#' @param restrictions_exact Restrictions on the exact frequency of specific responses, see Details
#' @param restrictions_minimum Restrictions on the minimum frequency of specific responses, see Details
#' @param dont_test By default, this function tests whether the mean is possible given the sample size (GRIM test) and whether
#' the standard deviation is possible given the mean and sample size (GRIMMER test), and fails if either test fails. If you want to override this
#' and run SPRITE anyway, you can set this to TRUE.
#'
#' @return A named list of parameters, pre-processed for further rsprite2 functions.
#'
#' @examples
#'
#' set.seed(1234) #To get reproducible results
#'
#' # Simple case
#' sprite_parameters <- set_parameters(mean = 2.2, sd = 1.3, n_obs = 20, min_val = 1, max_val = 5)
#' find_possible_distribution(sprite_parameters)
#'
#' # With restrictions
#' sprite_parameters <- set_parameters(mean = 1.95, sd = 1.55, n_obs = 20,
#'                                     min_val = 1, max_val = 5, n_items = 3,
#'                                     restrictions_exact = list("3"=0, "3.67" = 2),
#'                                     restrictions_minimum = "range")
#' find_possible_distribution(sprite_parameters)
#'
#' @export

set_parameters <- function(mean, sd, n_obs, min_val, max_val,
                           m_prec = NULL, sd_prec = NULL,
                           n_items = 1, restrictions_exact = NULL,
                           restrictions_minimum = NULL,
                           dont_test = FALSE) {
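  # If the precision is not supplied, infer it from the number of decimal places
  # of the reported value (e.g. 2.2 -> 1, 1.95 -> 2, integers -> 0).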
  if (is.null(m_prec)) {
    m_prec <- max(nchar(sub("^[0-9]*", "", mean)) - 1, 0)
  }

  if (is.null(sd_prec)) {
    sd_prec <- max(nchar(sub("^[0-9]*", "", sd)) - 1, 0)
  }

  assert_count(m_prec)
  assert_count(sd_prec)
  assert_count(n_obs)
  assert_count(n_items)
  assert_int(min_val)
  assert_int(max_val)
  assert_number(mean)
  assert_number(sd)

  if (min_val >= max_val) {
    stop("max_val needs to be larger than min_val")
  }

  if (!dont_test) {

    if (n_obs * n_items <= 10 ^ m_prec) {
      if (!GRIM_test(mean, n_obs, m_prec, n_items)) {
        stop("The mean is not consistent with this number of observations (fails GRIM test).
             You can use GRIM_test() to identify the closest possible mean and try again.")
      }
    }

    if (!GRIMMER_test(mean, sd, n_obs, m_prec, sd_prec, n_items)) {
      stop("The standard deviation is not consistent with this mean and number of observations (fails GRIMMER test).
         For details, see ?GRIMMER_test.")
    }
  }

  sd_limits <- .sd_limits(n_obs, mean, min_val, max_val, sd_prec, n_items)

  if (!(sd >= sd_limits[1] & sd <= sd_limits[2])) {
    stop("The standard deviation is outside the possible range, given the other parameters.
         It should be between ", sd_limits[1], " and ", sd_limits[2], ".")
  }

  if (!(mean >= min_val & mean <= max_val)) {
    stop("The mean is outside the possible range, which is impossible - please check inputs.")
  }

  if (isTRUE(checkmate::check_choice(restrictions_minimum, "range"))) {
    restrictions_minimum <- list(1, 1)
    names(restrictions_minimum) <- c(min_val, max_val)
  }

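  # Enumerate all achievable response values: averages over n_items items, i.e.
  # steps of 1/n_items from min_val to max_val. A copy rounded to two decimal
  # places is used to match against the (character) names of the restriction lists.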
  poss_values <- max_val
  for (i in seq_len(n_items)) {
    poss_values <- c(poss_values, min_val:(max_val-1) + (1 / n_items) * (i - 1))
  }
  poss_values <- sort(poss_values)

  poss_values_chr <- round(poss_values, 2)

  fixed_responses <- numeric()
  fixed_values <- NA
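  # fixed_responses will collect values that must appear in every candidate
  # distribution (from both exact and minimum restrictions); fixed_values are
  # the values whose frequency is fully fixed and which are therefore excluded
  # from random sampling later on.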


  if (!is.null(restrictions_minimum) & !is.null(restrictions_exact)) {
    restricted_values <- c(round(as.numeric(names(restrictions_exact)), 2),
                           round(as.numeric(names(restrictions_minimum)), 2))
    if (any(duplicated(restricted_values))) {
      duplicates <- restricted_values[duplicated(restricted_values)]
      stop("Several restrictions for same value found. Ensure there is only one restriction (exact or minimum) for: ", duplicates)
    }
  }
  if (!is.null(restrictions_minimum)) {

    if (any(!(round(as.numeric(names(restrictions_minimum)), 2) %in% poss_values_chr))) {
      no_match <- names(restrictions_minimum)[!(round(as.numeric(names(restrictions_minimum)), 2) %in% poss_values_chr)]
      stop("Invalid names in restrictions_minimum. The following could not be matched to possible response values: ", no_match)
    }

    # Ensure restrictions are ordered
    restrictions_minimum <- restrictions_minimum[as.character(poss_values_chr[poss_values_chr %in% names(restrictions_minimum)])]

    fixed_responses <- c(fixed_responses, rep(poss_values[poss_values_chr %in% names(restrictions_minimum)], unlist(restrictions_minimum)))
  }

  if (!is.null(restrictions_exact)) {


    if(any(!(round(as.numeric(names(restrictions_exact)), 2) %in% poss_values_chr))) {
      no_match <- names(restrictions_exact)[!(round(as.numeric(names(restrictions_exact)), 2) %in% poss_values_chr)]
      stop("Invalid names in restrictions_exact. The following could not be matched to possible response values: ", no_match)
    }


    #Ensure restrictions are ordered
    restrictions_exact <- restrictions_exact[as.character(poss_values_chr[poss_values_chr %in% names(restrictions_exact)])]

    fixed_responses <- c(fixed_responses, rep(poss_values[poss_values_chr %in% names(restrictions_exact)], unlist(restrictions_exact)))

    fixed_values <- poss_values[poss_values_chr %in% names(restrictions_exact)]
  }

  possible_values <- setdiff(poss_values, fixed_values)
  n_fixed <- length(fixed_responses)

  out <- .named_list(mean, sd, n_obs, min_val, max_val, m_prec, sd_prec, n_items, restrictions_minimum, restrictions_exact, possible_values, fixed_values, fixed_responses, n_fixed)

  class(out) <- c("sprite_parameters", class(out))

  out
}

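# Helper to build a list whose element names are taken from the caller's
# argument expressions via match.call(), e.g. .named_list(mean, sd) returns
# list(mean = mean, sd = sd).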
.named_list <- function(...) {
  out <- list(...)
  names(out) <- as.list(match.call())[-1]
  out
}

#' Find several possible distributions.
#'
#' This function aims to find several possible distributions that would give rise to
#' the observed sample parameters. For that, you need to pass a list of parameters,
#' created with \code{\link{set_parameters}}
#'
#' @param parameters List of parameters, see \code{\link{set_parameters}}
#' @param n_distributions The target number of distributions to return.
#' @param seed An integer to use as the seed for random number generation. Set this in scripts to ensure reproducibility.
#' @param return_tibble Should a tibble, rather than a list, be returned? Requires the `tibble`-package, ignored if that package is not available.
#' @param return_failures Should distributions that failed to produce the desired SD be returned? Defaults to false
#'
#' @return A tibble or list (depending on the `return_tibble` argument) with:
#' \item{outcome}{success or failure - character}
#' \item{distribution}{The distribution that was found (if success) / that had the closest variance (if failure) - numeric}
#' \item{mean}{The exact mean of the distribution - numeric}
#' \item{sd}{The SD of the distribution that was found (success) / that came closest (failure) - numeric}
#' \item{iterations}{The number of iterations required to achieve the specified SD when this distribution was first found - numeric}
#'
#' @examples
#'
#' sprite_parameters <- set_parameters(mean = 2.2, sd = 1.3, n_obs = 20,
#'                                     min_val = 1, max_val = 5)
#'
#' find_possible_distributions(sprite_parameters, 5, seed = 1234)
#'
#' @export


find_possible_distributions <- function(parameters, n_distributions = 10, seed = NULL, return_tibble = TRUE, return_failures = FALSE) {

  if (!is.null(seed)) {
    assert_int(seed)
    set.seed(seed)
  }

  assert_count(n_distributions)
  assert_logical(return_tibble)

  outcome <- character()
  distributions <- list()
  found_sd <- numeric()
  found_mean <- numeric()
  iterations <- numeric()

    duplications <- 0

    for (i in 1:(n_distributions * rSprite.maxDupLoops)) {

      n_found <- sum(outcome == "success")

      #This break should possibly be earlier?
      if(length(outcome) - max(c(which(outcome == "success"),0)) >= 10) {
        warning("No successful distribution found in last 10 attempts. Exiting.", if (n_found == 0) " There might not be any possible distribution, but you can try running the search again.")
        break
      }
      if (n_found >= n_distributions) break

      # Calculate the maximum number of consecutive duplicates we will accept before deciding to give up.
      # The value of 0.00001 below is our nominal acceptable chance of missing a valid solution;
      #  however, it's extremely likely that all possible solutions are not all equally likely to be found.
      # So we also set a floor of 100 attempts.
      max_duplications <- max(round(log(0.00001) / log(n_found / (n_found + 1))), 100)
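      # e.g. once n_found = 10, solving (10/11)^k = 0.00001 gives k ~ 121, so up to
      # 121 consecutive duplicates are tolerated before the search is abandoned.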

      res <- find_possible_distribution(parameters, seed = NULL)

      res$values <- sort(res$values) # sorting lets us find duplicates more easily

      distributions <- c(list(res$values), distributions)
      if(head(duplicated(distributions, fromLast = TRUE), 1)) {
        distributions <- distributions[-1]
        duplications <- duplications + 1
        if (duplications > max_duplications) {
          break
        }
      } else {
        outcome <- c(res$outcome, outcome)
        found_sd <- c(res$sd, found_sd)
        found_mean <- c(res$mean, found_mean)
        iterations <- c(res$iterations, iterations)
      }

    }

    n_found <- sum(outcome == "success")
    if (n_found < n_distributions) message("Only ", n_found, " matching distributions could be found. You can try again - given that SPRITE is based on random number generation, more distributions might be found then.")

    if (return_tibble & suppressWarnings(requireNamespace("tibble", quietly = TRUE))) {
      out <- tibble::tibble(id = seq_along(outcome), outcome = outcome, distribution = distributions, mean = found_mean, sd = found_sd, iterations = iterations)
      class(out) <- c("sprite_distributions", class(out))
      attr(out, "parameters") <- parameters
      if(!return_failures) return(out[out$outcome == "success",])
      out
    } else {
      if(!return_failures) {
        successes <- outcome == "success"
      return(list(outcome = outcome[successes], distribution = distributions[successes], mean = found_mean[successes], sd = found_sd[successes], iterations = iterations[successes]))
      }
      list(outcome = outcome, distribution = distributions, mean = found_mean, sd = found_sd, iterations = iterations)
    }

}

#' Find a possible distribution.
#'
#' This function aims to find a possible distribution that would give rise to
#' the observed sample parameters. For that, you need to pass a list of parameters,
#' best created with \code{\link{set_parameters}}
#'
#' @param parameters List of parameters, see \code{\link{set_parameters}}
#' @param seed An integer to use as the seed for random number generation. Set this in scripts to ensure reproducibility.
#' @param values_only Should only the values be returned, rather than a more informative list? See the Value section.
#'
#' @return Unless `values_only = TRUE`, a list with:
#' \item{outcome}{success or failure - character}
#' \item{distribution}{The distribution that was found (if success) / that had the closest variance (if failure) - numeric}
#' \item{mean}{The exact mean of the distribution - numeric}
#' \item{sd}{The SD of the distribution that was found (success) / that came closest (failure) - numeric}
#' \item{iterations}{The number of iterations required to achieve the specified SD - numeric}
#' If `values_only = TRUE`, then the distribution is returned if one was found, and NULL if it failed.
#'
#' @examples
#' sprite_parameters <- set_parameters(mean = 2.2, sd = 1.3, n_obs = 20,
#'                                     min_val = 1, max_val = 5)
#' find_possible_distribution(sprite_parameters)
#'
#' @export
#'


find_possible_distribution <- function(parameters, seed = NULL, values_only = FALSE) {

  assert_class(parameters, "sprite_parameters")

  if (!is.null(seed)) {
    assert_int(seed)
    set.seed(seed)
  }


  # Generate some random starting data.
  rN <- parameters$n_obs -  parameters$n_fixed
  vec <- sample(parameters$possible_values, rN, replace = TRUE)

  # Adjust mean of starting data.
  max_loops <- parameters$n_obs * length(parameters$possible_values)
  vec <- .adjust_mean(max_loops, vec, parameters$fixed_responses, parameters$mean, parameters$m_prec, parameters$possible_values)


  # Find distribution that also matches SD
  maxLoops <- min(max(round(parameters$n_obs * (length(parameters$possible_values)^2)), rSprite.maxDeltaLoopsLower), rSprite.maxDeltaLoopsUpper)
  granule_sd <- ((0.1^parameters$sd_prec) / 2) + rSprite.dust # allow for rounding errors

  result <- NULL

  for (i in seq_len(maxLoops)) {

    #Should one break out of loop when vec no longer changes? Prob not worth all the comparisons?
    current_sd <- sd(c(vec, parameters$fixed_responses))
    if (abs(current_sd - parameters$sd) <= granule_sd) {
      result <- c(vec, parameters$fixed_responses)
      iter <- i
      break
    }
    vec <- .shift_values(vec, parameters$mean, parameters$sd, parameters$min_val, parameters$max_val, parameters$m_prec, parameters$sd_prec, parameters$fixed_responses, parameters$possible_values, parameters$fixed_values)
  }

  if (!is.null(result)) {
    if(values_only) return(result)
    return(list(outcome = "success", values = result, mean = mean(result), sd = current_sd, iterations = iter))
  } else {
    if(values_only) return(NULL)
    return(list(outcome = "failure", values = c(vec, parameters$fixed_responses), mean = mean(c(vec, parameters$fixed_responses)), sd = current_sd, iterations = maxLoops))
  }
}

.adjust_mean <- function(max_iter, vec, fixed_vals, target_mean, m_prec, poss_values) { # poss_values excludes values with exact restrictions

  meanOK <- FALSE

  for (i in 1:max_iter) {
    fullVec <- c(vec, fixed_vals)
    current_mean <- mean(fullVec)
    if ((round(current_mean, m_prec) == target_mean)) {
      meanOK <- TRUE
      break
    }

    increaseMean <- (current_mean < target_mean)
    if (increaseMean) {
      filter <- (vec < (poss_values[length(poss_values)]))
    } else {
      filter <- (vec > (poss_values[1]))
    }

    possible_bump <- which(filter)
    bumpMean <- possible_bump[as.integer(runif(1) * length(possible_bump)) + 1] # randomly select one element to bump
    vec[bumpMean] <- poss_values[which(poss_values == vec[bumpMean]) + ifelse(increaseMean, 1, -1)]
  }
  if (!meanOK) {
    if (length(fixed_vals)>0) {
      stop("Couldn't initialize data with correct mean. This *might* be because the restrictions cannot be satisfied.")
    } else {
      stop("Couldn't initialize data with correct mean") # this probably indicates a coding error, if the mean is in range
    }
  }
  return(vec)
}

.shift_values <- function(vec, target_mean, target_sd, min_val, max_val, m_prec = 2, sd_prec, fixed_responses, poss_non_restricted, fixed_vals) { # poss_non_restricted contains only values without exact restrictions

  # Backup
  vec_original <- vec

  # Decide if we want to increment or decrement first.
  incFirst <- sample(c(TRUE, FALSE), 1)

  # Decide if we need to increase or decrease the SD.
  fullVec <- c(vec, fixed_responses)
  increaseSD <- (sd(fullVec) < target_sd)

  poss_values <- sort(c(poss_non_restricted, fixed_vals))

  maxToInc <- poss_values[length(poss_values) - 1] # maximum value that we can increment
  minToDec <- poss_values[2] # minimum value that we can decrement

  # Select an element to increment or decrement.
  # For better performance, we select from unique elements only; this means that any number that appears in the vector is
  #  equally likely to be chosen regardless of how often it appears. I'm not sure if this is good or bad.
  # TK - check performance impact.
  uniqueCanBump1 <- !duplicated(vec)

  # The element that we change should be less than the maximum (increment) or greater than the minimum (decrement).
  notEdge1 <- if (incFirst) (vec <= maxToInc) else (vec >= minToDec)
  indexCanBump1 <- uniqueCanBump1 & notEdge1

  # If we can't find an element to change, just return the original vector and let our caller sort it out.
  if (sum(indexCanBump1) == 0) {
    return(vec_original)
  }

  # Unless we have no other choice:
  # - If we want to make the SD larger, there is no point in incrementing the smallest element, or decrementing the largest
  # - If we want to make the SD smaller, there is no point in decrementing the smallest element, or incrementing the largest
  if (increaseSD) {
    noPoint1 <- if (incFirst) (vec == min(vec)) else (vec == max(vec))
  } else {
    noPoint1 <- if (incFirst) (vec == maxToInc) else (vec == minToDec)
  }
  indexCanBump1Try <- indexCanBump1 & (!noPoint1)
  if (any(indexCanBump1Try)) {
    indexCanBump1 <- indexCanBump1Try
  }

  whichCanBump1 <- which(indexCanBump1)
  whichWillBump1 <- whichCanBump1[as.integer(runif(1) * length(whichCanBump1)) + 1]
  willBump1 <- vec[whichWillBump1]
  new1 <- poss_non_restricted[which(poss_non_restricted == willBump1) + ifelse(incFirst, 1, -1)]
  gap1 <- new1 - vec[whichWillBump1] # Note when restricted values have been skipped
  vec[whichWillBump1] <- new1

  # At this point we can decide to only change one of the elements (decrement one without incrementing another, or vice versa).
  # This enables us to explore different means that still round to the same target value.
  # So here we perform the first increment or decrement first, and see if the mean is still GRIM-consistent with the target mean.
  # If it is, then in a proportion of cases we don't adjust the other cell.
  newFullVec <- c(vec, fixed_responses)
  newMean <- mean(newFullVec)
  meanChanged <- (round(newMean, m_prec) != target_mean) # new mean is no longer GRIM-consistent

  if (meanChanged || (runif(1) < 0.4)) {
    vecBump2 <- vec # make a scratch copy of the input vector so we can change it
    vecBump2[whichWillBump1] <- NA # remove the element chosen in part 1...
    uniqueCanBump2 <- !duplicated(vecBump2) # ... but if there was more than one copy of that, it's still a candidate
    notEdge2 <- if (!incFirst) (vecBump2 <= maxToInc) else (vecBump2 >= minToDec)
    indexCanBump2 <- uniqueCanBump2 & notEdge2 & (!is.na(vecBump2))

    # If we can't find an element to change in the opposite direction to the first, then, if the mean with the first change is still OK,
    #  we return the vector with that change. Otherwise we return the original vector and let our caller sort it out.
    if (sum(indexCanBump2) == 0) {
      return(if (meanChanged) vec_original else vec)
    }

    # Unless we have no other choice:
    # - If we want to make the SD larger:
    #   - If in step 1 we chose an element to increment, there is no point in now changing (decrementing) a larger one
    #   - If in step 1 we chose an element to decrement, there is no point in now changing (incrementing) a smaller one
    # - If we want to make the SD smaller:
    #   - If in step 1 we chose an element to increment, there is no point in now changing (decrementing) a smaller one
    #   - If in step 1 we chose an element to decrement, there is no point in now changing (incrementing) a larger one
    # There is also no point in incrementing an element that is equal to the new value of the one that we have already chosen.
    noPoint2 <- ((if (increaseSD == incFirst) (vec > willBump1) else (vec < willBump1)) |
      (vec == (new1))
    )
    indexCanBump2Try <- indexCanBump2 & (!noPoint2)
    if (any(indexCanBump2Try)) {
      indexCanBump2 <- indexCanBump2Try
    }

    whichCanBump2 <- which(indexCanBump2)
    whichWillBump2 <- whichCanBump2[as.integer(runif(1) * length(whichCanBump2)) + 1]
    willBump2 <- vec[whichWillBump2]
    new2 <- poss_non_restricted[which(poss_non_restricted == willBump2) + ifelse(incFirst, -1, 1)]
    gap2 <- new2 - vec[whichWillBump2] # Note when restricted values have been skipped

    gap_resolved <- NA
    # Go into restricted values handling only when necessary - should be good for performance, but
    # leads to more complex backtracking here.
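    # If the two steps differ in size (because excluded values were skipped over),
    # applying both would shift the sum and hence the mean. In that case, rather than
    # applying the second change as is, look for other elements that can be moved in
    # the opposite direction across gaps that exactly offset the first step - first as
    # a single move, then split across several smaller moves; otherwise backtrack.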
    if (!.equalish(abs(gap1), abs(gap2))) {
      gap_resolved <- FALSE
      poss <- which(.equalish(diff(poss_non_restricted), abs(gap1)))
      if (length(poss) > 1) {
        low <- poss_non_restricted[poss]
        up <- poss_non_restricted[poss + 1]
        if (incFirst) {
          # Should not now move down from target - otherwise we are simply reversing course
          target <- which(up == new1)
        } else {
          target <- which(low == new1)
        }
        up <- up[-target]
        low <- low[-target]

        if (incFirst) {
          from <- up[up %in% vec]
          to <- low[up %in% vec]
        } else {
          to <- up[low %in% vec]
          from <- low[low %in% vec]
        }
        if (length(from) > 0) {
          replace <- sample(seq_along(from), 1)
          vec[cumsum(cumsum(vec == from[replace])) == 1] <- to[replace]
          gap_resolved <- TRUE
        }
      }

      if (!gap_resolved) {
        # Cannot do a single second replacement without undoing change so far
        restricted_runs <- rle(poss_values %in% poss_non_restricted)
        longest_run <- max(restricted_runs$lengths[restricted_runs$values == FALSE])
        vec_backup <- vec
        for (i in seq_len(longest_run)) {
          vec <- vec_backup
          replaced <- 0
          i <- i + 1 # Gap of 1 suggests 2 steps might be needed
          poss <- which(.equalish(diff(poss_non_restricted), gap1 / i))
          if (length(poss) > 0) {
            low <- poss_non_restricted[poss]
            up <- poss_non_restricted[poss + 1]

            for (j in seq_len(i)) {
              if (incFirst) {
                from <- up[up %in% vec]
                to <- low[up %in% vec]
              } else {
                to <- up[low %in% vec]
                from <- low[low %in% vec]
              }
              if (length(from) > 0) {
                replaced <- replaced + 1
                replace <- sample(seq_along(from), 1)
                vec[cumsum(cumsum(vec == from[replace])) == 1] <- to[replace]
              } else {
                break
              }
            }
            if (replaced == i) {
              gap_resolved <- TRUE
              break
            }
          }
        }


        if (!gap_resolved) {
          # No way to get this done with multiple replacements - so, exit
          return(if (meanChanged) vec_original else vec)
        }
      }
    } else {
      vec[whichWillBump2] <- new2
    }
  }

  newFullVec <- c(vec, fixed_responses)
  newMean <- mean(newFullVec)
  meanChanged <- (round(newMean, m_prec) != target_mean) # new mean is no longer GRIM-consistent

  # Floating point issues lead to mean drift with multi-item scales - curtail this straight-away
  if (meanChanged) return(vec_original)

  #if(!.equalish(sum(added), sum(removed)) & meanChanged) browser()

    #message(gap_resolved, "Mean diff detected when adding ", added, " while removing ", removed)

  return(vec)

}

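# Approximate equality within `tol` (rSprite.dust, a small tolerance constant
# defined elsewhere in the package), used to absorb floating-point rounding error.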
.equalish <- function(x, y, tol = rSprite.dust) {
  x <= (y + tol) &
    x >= (y - tol)
}

#' GRIM test for mean
#'
#' This function tests whether a given mean (with a specific precision) can
#' result from a sample of a given size based on integer responses to one or more
#' items. The test is based on Brown & Heathers (2017).
#' If `return_values = TRUE`, all precise means compatible with the given parameters
#' are returned. In that case, if the given mean is not consistent, the closest
#' consistent mean is returned instead, with a warning.
#'
#' @param return_values Should all means consistent with the given parameters be returned?
#' @inheritParams set_parameters
#'
#' @return Either TRUE/FALSE, or all possible means (if test passes)/closest consistent mean (if test fails)
#' @export
#'
#' @examples
#' # A sample of 28 integers cannot result in a mean of 5.19. This is shown by
#' GRIM_test(5.19, 28)
#'
#' # To find the closest possible mean, set return_values to TRUE
#' GRIM_test(5.19, 28, return_values = TRUE)
#'
#' @references
#' \insertRef{brown2017grim}{rsprite2}


GRIM_test <- function(mean, n_obs, m_prec = NULL, n_items = 1, return_values = FALSE) {
  if (is.null(m_prec)) {
    m_prec <- max(nchar(sub("^[0-9]*", "", mean)) - 1, 0)
  }

  assert_count(m_prec)
  assert_count(n_obs)
  assert_count(n_items)
  assert_logical(return_values)
  assert_number(mean)

  if (n_obs * n_items > 10 ^ m_prec) {
    warning("The sample size (x number of items) is too big compared to the precision of the reported mean. The GRIM test is only meaningful when N < 10 ^ precision (e.g. N < 100 for means reported to two decimal places).")
  }

  int <- round(mean * n_obs * n_items) # nearest integer
  frac <- int / (n_obs * n_items) # mean resulting from nearest integer
  dif <- abs(mean - frac)
  granule <- ((0.1^m_prec) / 2) + rSprite.dust # allow for rounding errors
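  # Worked example (cf. @examples): mean = 5.19, n_obs = 28, m_prec = 2:
  # round(5.19 * 28) = 145 and 145 / 28 = 5.1786, so the difference of 0.0114
  # exceeds the granule of 0.005 and the reported mean fails the test.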
  if (dif > granule) {
    if (!return_values) {
      return(FALSE)
    }
    valid_mean <- round(frac, m_prec)
    prec_format <- paste("%.", m_prec, "f", sep = "")
    warning("Mean ", sprintf(prec_format, mean), " fails GRIM test - closest consistent value: ", sprintf(prec_format, valid_mean))
    return(valid_mean)
  } else {
    if (!return_values) {
      return(TRUE)
    }
    possible_means <- frac
    i <- 1
    original_int <- int
    while (TRUE) {
      int <- int + i
      frac <- int / (n_obs * n_items) # mean resulting from nearest integer
      dif <- abs(mean - frac)
      if (dif > granule) break()
      possible_means <- c(possible_means, frac)
      i <- i + 1
    }
    i <- 1
    int <- original_int
    while (TRUE) {
      int <- int - i
      frac <- int / (n_obs * n_items) # mean resulting from nearest integer
      dif <- abs(mean - frac)
      if (dif > granule) break()
      possible_means <- c(possible_means, frac)
      i <- i + 1
    }
    return(possible_means)
  }

  stop("Branching error - should not get here")
}

# Determine minimum and maximum SDs for given scale ranges, N, and mean.
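# The minimum SD is achieved by concentrating all responses on the two possible
# values adjacent to the mean (aMin/bMin below); the maximum SD by splitting the
# responses between the scale extremes (aMax/bMax), in both cases while
# preserving the target mean.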
.sd_limits <- function(n_obs, mean, min_val, max_val, sd_prec = NULL, n_items = 1) {

  if (is.null(sd_prec)) {
    sd_prec <- max(nchar(sub("^[0-9]*", "", mean)) - 1, 0)
  }

  result <- c(-Inf, Inf)

  aMax <- min_val                                # "aMax" means "value of a to produce the max SD"
  aMin <- floor(mean*n_items)/n_items
  bMax <- max(max_val, min_val + 1, aMin + 1)   # sanity check (just max_val would normally be ok)
  bMin <- aMin + 1/n_items
  total <- round(mean * n_obs * n_items)/n_items

  poss_values <- max_val
  for (i in seq_len(n_items)) {
    poss_values <- c(poss_values, min_val:(max_val-1) + (1 / n_items) * (i - 1))
  }
  poss_values <- sort(poss_values)

  for (abm in list(c(aMin, bMin, 1), c(aMax, bMax, 2))) {

    a <- abm[1]
    b <- abm[2]
    m <- abm[3]


    k <- round((total - (n_obs * b)) / (a - b))
    k <- min(max(k, 1), n_obs - 1)               # ensure there is at least one of each of two numbers
    vec <- c(rep(a, k), rep(b, n_obs - k))
    diff <- sum(vec) - total

    if ((diff < 0)) {
      vec <- c(rep(a, k - 1), a + abs(diff), rep(b, n_obs - k))
    }
    else if ((diff > 0)) {
      vec <- c(rep(a, k), b - diff, rep(b, n_obs - k - 1))
    }

    if (round(mean(vec), sd_prec) != round(mean, sd_prec) | !all(floor(vec*10e9) %in% floor(poss_values*10e9))) {
      stop("Error in calculating range of possible standard deviations")
    }

    result[m] <- round(sd(vec), sd_prec)
  }

  return(result)
}


#' GRIMMER test for standard deviation
#'
#' This function tests whether a given standard deviation (with a specific precision)
#' can result from a sample of a given size based on integer responses to one or more
#' items. The test was first proposed by [Anaya (2016)](https://peerj.com/preprints/2400/); here, the algorithm
#' developed by [Allard (2018)](https://aurelienallard.netlify.app/post/anaytic-grimmer-possibility-standard-deviations/) is used,
#' extended by Aurélien Allard to support multi-item scales.
#'
#' @inheritParams set_parameters
#' @param min_val (Optional) Scale minimum. If provided alongside max_val, the function checks whether the SD is consistent with that range.
#' @param max_val (Optional) Scale maximum.
#'
#' @return Logical TRUE/FALSE indicating whether given standard deviation is possible, given the other parameters
#' @export
#'
#' @examples
#' # A sample of 18 integers with mean 3.44 cannot have an SD of 2.47. This is shown by
#' GRIMMER_test(mean = 3.44, sd = 2.47, n_obs = 18)
#'
#'
#' @references
#' \insertRef{anaya2016grimmer}{rsprite2}

# ToDos:
# - add return_values argument to return possible SDs


# # Example inputs:
# mean    <- 1.03
# sd      <- 0.41
# n_obs   <- 40
# n_items <- 1
# m_prec  <- 2
# sd_prec <- 2
# min_val <- NULL
# max_val <- NULL


GRIMMER_test <- function(mean, sd, n_obs, m_prec = NULL, sd_prec = NULL, n_items = 1, min_val = NULL, max_val = NULL) {
  if (is.null(m_prec)) {
    m_prec <- max(nchar(sub("^[0-9]*", "", mean)) - 1, 0)
  }

  if (is.null(sd_prec)) {
    sd_prec <- max(nchar(sub("^[0-9]*", "", sd)) - 1, 0)
  }

  # IN SCRUTINY: specified checkmate namespace
  checkmate::assert_count(m_prec)
  checkmate::assert_count(sd_prec)
  checkmate::assert_count(n_obs)
  checkmate::assert_count(n_items)
  checkmate::assert_number(mean)
  checkmate::assert_number(sd)

  effective_n <- n_obs * n_items

  # Applies the GRIM test, and computes the possible mean.
  sum <- mean * effective_n
  realsum <- round(sum)
  realmean <- realsum / effective_n

  #Checks whether mean and SD are within possible range
  if (!is.null(min_val) & !is.null(max_val)) {
    if (mean < min_val | mean > max_val) {
      warning("The mean must be between the scale minimum and maximum")
      return(FALSE)
    }
    sd_limits <- .sd_limits(n_obs, mean, min_val, max_val, sd_prec, n_items)
    if (sd < sd_limits[1] | sd > sd_limits[2]) {
      warning("Given the scale minimum and maximum, the standard deviation has to be between ", sd_limits[1], " and ", sd_limits[2], ".")
      return(FALSE)
    }
  }
  # Creates functions to round a number consistently up or down, when the last digit is 5
  round_down <- function(number, decimals = 2) {
    to_round <- number * 10^(decimals + 1) - floor(number * 10^(decimals)) * 10
    number_rounded <- ifelse(to_round == 5,
                             floor(number * 10^decimals) / 10^decimals,
                             round(number, digits = decimals))
    return(number_rounded)
  }

  round_up <- function(number, decimals = 2) {
    to_round <- number * 10^(decimals + 1) - floor(number * 10^(decimals)) * 10
    number_rounded <- ifelse(to_round == 5,
                             ceiling(number * 10^decimals) / 10^decimals,
                             round(number, digits = decimals))
    return(number_rounded)
  }

  # Applies the GRIM test, to see whether the reconstituted mean is the same as the reported mean (with both down and up rounding)

  consistent_down <- round_down(number = realmean, decimals = m_prec) == mean
  consistent_up <- round_up(number = realmean, decimals = m_prec) == mean

  if (!consistent_down & !consistent_up) {
    warning("GRIM inconsistent - so GRIMMER test cannot be run. See ?GRIM_test")
    return(FALSE)
  }

  # Computes the lower and upper bounds for the sd.

  Lsigma <- ifelse(sd < 5 / (10^(sd_prec+1)), 0, sd - 5 / (10^(sd_prec+1)))
  Usigma <- sd + 5 / (10^(sd_prec+1))

  # Computes the lower and upper bounds for the sum of squares of items.
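  # For the integer item sums x_i = n_items * response_i, sum(x_i^2) equals
  # (n_obs - 1) * var(x) + n_obs * mean(x)^2. Since var(x) = n_items^2 * sd^2 and
  # mean(x) = n_items * realmean, plugging in the SD bounds above gives the
  # admissible range for this integer sum of squares.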

  lower_bound <- ((n_obs - 1) * Lsigma^2 + n_obs * realmean^2)*n_items^2
  upper_bound <- ((n_obs - 1) * Usigma^2 + n_obs * realmean^2)*n_items^2

  # Checks that there is at least one integer between the lower and upper bound

  if (ceiling(lower_bound) > floor(upper_bound)) {
    return(FALSE)
  }

  # Takes a vector of all the integers between the lower and upper bound

  possible_integers <- ceiling(lower_bound):floor(upper_bound)

  # Creates the predicted variance and sd

  Predicted_Variance <- (possible_integers/n_items^2 - n_obs * realmean^2) / (n_obs - 1)
  Predicted_SD <- sqrt(Predicted_Variance)

  # Computes whether one Predicted_SD matches the SD (trying to round both down and up)

  Rounded_SD_down <- round_down(Predicted_SD, sd_prec)
  Rounded_SD_up <- round_up(Predicted_SD, sd_prec)

  Matches_SD <- Rounded_SD_down == sd | Rounded_SD_up == sd

  if (!any(Matches_SD)) {
    return(FALSE)
  }

  # Computes whether there is an integer of the correct oddness between the lower and upper bounds.
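  # For integers, x^2 and x always have the same parity, so the candidate sum of
  # squares must match the parity of the reconstructed integer sum (realsum).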
  oddness <- realsum %% 2
  Matches_Oddness <- possible_integers %% 2 == oddness
  return(any(Matches_SD & Matches_Oddness))
}