cNORM: Continuous Norming

Documented in checkWeights computeWeights standardizeRakingWeights

#' Weighting of cases through iterative proportional fitting (Raking)
#'
#' Computes and standardizes weights via raking to compensate for non-stratified
#' samples. It is based on the implementation in the survey R package. It reduces
#' data collection #' biases in the norm data by the means of post stratification,
#' thus reducing the effect of unbalanced data in percentile estimation and norm
#' data modeling.
#'
#' This function computes standardized raking weights to overcome biases in norm
#' samples. It generates weights, by drawing on the information of population
#' shares (e. g. for sex, ethnic group, region ...) and subsequently reduces the
#' influence of over-represented groups or increases underrepresented cases. The
#' returned weights are either raw or standardized and scaled to be larger than 0.
#'
#' Raking in general has a number of advantages over post stratification and it
#' additionally allows cNORM to draw on larger datasets, since less cases have
#' to be removed during stratification. To use this function, additionally to the
#' data, a data frame with stratification variables has to be specified. The data
#' frame should include a row with (a) the variable name, (b) the level of the
#' variable and (c) the according population proportion.
#'
#' @param data data.frame with norm sample data.
#' @param population.margins A data.frame including three columns, specifying the
#' variable name in the original dataset used for data stratification, the factor
#' level of the variable and the according population share. Please ensure, the
#' original data does not include factor levels, not present in the
#' population.margins. Additionally, summing up the shares of the different
#' levels of a variable should result in a value near 1.0. The first column must
#' specify the name of the stratification variable, the second the level and
#' the third the proportion
#' @param standardized If TRUE (default), the raking weights are scaled to
#' weights/min(weights)
#' @return a vector with the standardized weights
#' @examples
#' # cNORM features a dataset on vocabulary development (ppvt)
#' # that includes variables like sex or migration. In order
#' # to weight the data, we have to specify the population shares.
#' # According to census, the population includes 52% boys
#' # (factor level 1 in the ppvt dataset) and 70% / 30% of persons
#' # without / with a a history of migration (= 0 / 1 in the dataset).
#' # First we set up the popolation margins with all shares of the
#' # different levels:
#'
#' margins <- data.frame(variables = c("sex", "sex",
#'                                     "migration", "migration"),
#'                       levels = c(1, 2, 0, 1),
#'                       share = c(.52, .48, .7, .3))
#' head(margins)
#'
#' # Now we use the population margins to generate weights
#' # through raking
#'
#' weights <- computeWeights(ppvt, margins)
#'
#'
#' # There are as many different weights as combinations of
#' # factor levels, thus only four in this specific case
#'
#' unique(weights)
#'
#'
#' # To include the weights in the cNORM modelling, we have
#' # to pass them as weights. They are then used to set up
#' # weighted quantiles and as weights in the regession.
#'
#' model <- cnorm(raw = ppvt$raw,
#'                group=ppvt$group,
#'                weights = weights)
#' @export
computeWeights <- function(data, population.margins, standardized = TRUE) {
  data <- as.data.frame(data)
  names(population.margins) <- c("sv", "level", "proportion")
  sv <- unique(population.margins[,1])

  # check if all variables are present in the dataset
  missing.variables <- sv[!sv %in% names(data)]
  if(length(missing.variables) > 0) {
    stop(paste0("Weighting aborted due to missing variable(s): ",
                paste(missing.variables, collapse = ", "),
                " not present in the dataset."))
  }

  props <- as.data.frame(prop.table(xtabs(formula(paste0("~",
                                                         paste(sv, collapse = " + "))),
                                          data = data)))
  props.adjusted <- props

  # Check if every level of every sv in the marginals is contained
  # in the data set and vice versa.
  for(lev in 1:length(sv)) {
    l1 <- unique(props[sv[lev]])
    l2 <- unique(data[sv[lev]])

    if(!all(as.character(l1[,1]) %in% as.character(l2[,1])) ||
       !all(as.character(l2[,1]) %in% as.character(l1[,1]))) {
      stop("Levels of the data and the population marginals in variable ", sv[lev],
           " do not match.\n  Please make sure, every level of every variable is contained in the data set\n at least once and vice versa.")
    }
  }

  # Check if sum of proportions lies between 0.95 and 1.05
  for(marg in 1:length(sv)) {
    marg_sum <- abs(sum(population.margins[population.margins$sv == sv[marg],]$proportion) - 1)
    if(marg_sum > 0.05) {
      warning(paste("Sum of proportions of variable", sv[marg],
                    "is not within [0.95;1.05].\nPlease make sure, the proportions for every single stratification variable sum up to almost 1.00"))
    }
  }

  weights <- rep(1, length.out = nrow(props))
  w <- rep(1, length.out = nrow(props))

  stop.loop <- FALSE
  for(repetitions in 1:100) {
    for(i in 1:length(sv)) {
      sv.sum <- aggregate(formula(paste0("Freq ~ ", sv[i])), data = props.adjusted, FUN = sum)
      tmp <- population.margins[which(population.margins$sv %in% sv[i]), ]

      for(j in 1:nrow(sv.sum)) {
        # Convert current level to character for consistent comparison
        current_level <- as.character(sv.sum[j, 1])

        # Find matching row in population margins by comparing character values
        match_idx <- which(as.character(tmp$level) == current_level)

        if(length(match_idx) == 0) {
          warning(paste("No matching level found for", current_level, "in variable", sv[i],
                        "- check your population.margins data"))
          next
        }

        indizes <- which(as.character(props[, sv[i]]) == current_level)

        if(length(indizes) == 0) {
          warning(paste("No matching observations found for level", current_level,
                        "in variable", sv[i]))
          next
        }

        weights[indizes] <- weights[indizes] *
          tmp$proportion[match_idx] /
          sv.sum[j, 2]

        props.adjusted$Freq <- props$Freq * weights

        if(sum(abs(w - weights)) < 0.000001) {
          cat(paste0("Raking converged normally after ", repetitions, " iterations.\n"))
          stop.loop <- TRUE
          break
        }

        w <- weights
      }
      if(stop.loop)
        break
    }
    if(stop.loop)
      break
  }

  checkWeights(weights)

  if(standardized)
    weights <- standardizeRakingWeights(weights)

  stratification.weights <- rep(1, length.out = nrow(data))

  if(length(sv) > 1) {
    # For multiple stratification variables
    data_combined <- do.call("paste", c(lapply(sv, function(v) as.character(data[[v]])), sep="|"))
    props_combined <- do.call("paste", c(lapply(sv, function(v) as.character(props[[v]])), sep="|"))

    for(i in 1:length(weights)) {
      pattern <- props_combined[i]
      stratification.weights[data_combined == pattern] <- weights[i]
    }
  } else {
    # For a single stratification variable
    sv_name <- sv[1]

    # Get unique levels from props
    unique_levels <- unique(props[[sv_name]])

    # For each level, assign the appropriate weight
    for(i in 1:length(unique_levels)) {
      level_val <- unique_levels[i]
      level_idx <- which(props[[sv_name]] == level_val)[1]  # Get first matching index

      # Convert both to character for comparison
      stratification.weights[as.character(data[[sv_name]]) == as.character(level_val)] <- weights[level_idx]
    }
  }

  return(stratification.weights)
}



#' Check, if NA or values <= 0 occur and issue warning
#' @param weights Raking weights
checkWeights <- function(weights){
  # Get weights from the survey object
  if(min(weights<=0)){
    warning("Negative values or zeros occured during raking. Using the raking weights is not recommended.")
  }else if(sum(is.na(weights)>0)){
    warning("Undefined value occured during raking. Using the raking weights is not recommended.")
  }
}

#' Function for standardizing raking weights
#' Raking weights get divided by the smallest weight. Thereby, all weights
#' become larger or equal to 1 without changing the ratio of the weights
#' to each other.
#' @param weights Raking weights computed by computeRakingWeightsStandardized()
#' @param weights Raking weights computed by computeWeights()
#' @return the standardized weights
standardizeRakingWeights <- function(weights){
  weights_min <- min(weights)
  if(min(weights) <=0){
    warning("Smallest raking weight is zero or below. Weights can not be standardized.\nTherefore, non-standardized weights are used.")
  }
  weights <- weights/weights_min
  return(weights)
}