R/ipf.R

Defines functions sample_ipf subset_pums align_pums fill_cont_table update_freqs get_targets sample_with_cont samp_ipf_inds assign_weights calc_dists get_ord_dists get_cat_dists

Documented in align_pums assign_weights calc_dists fill_cont_table get_targets sample_ipf sample_with_cont subset_pums update_freqs

#' Sample households PUMS accoording to IPF 
#' 
#' @param n_house number of households to sample 
#' @param pums_h household pums 
#' @param pums_p people pums, for appending on certain variables 
#' @param marginals list with characteristics of marginals 
#' @param alpha number between 0 and 1, weight of categorical variables 
#' @param k number between 0 and 1, weight of orginal variables 
#' @param puma_id id indicating the current puma 
#' @param place_id id indicating the current region
#' 
#' @export
#' 
#' @return households data frame with re-sampled pums_h households 
sample_ipf <- function(n_house, pums_h, pums_p, marginals, alpha = 0, k = .001, 
                       puma_id = NULL, place_id = NULL) {
    if (!requireNamespace("mipfp", quietly = TRUE)) {
      stop("mipfp package needed for sample_ipf function to work.", call. = FALSE)
    }
  
    # Step 1: Align PUMS with Marginals
    pums <- subset_pums(pums_h = pums_h, pums_p = pums_p, marginals = marginals, puma_id = puma_id)
    pums <- align_pums(pums, marginals)

    # Step 2: Fill in the contingency table
    table <- fill_cont_table(pums = pums, marginals = marginals, place_id = place_id, n_house = n_house)
    # Write out the contingency table HERE.
    
    # Step 3: Sample with contingency table weights 
    households <- sample_with_cont(pums = pums, table = table, alpha = alpha, k = k, marginals = marginals)
  
  return(households)
}

#' Align pums with marginal totals 
#' 
#' @param pums_h dataframe of household pums 
#' @param pums_p dataframe of people pums
#' @param marginals list of marginals totals 
#' @param puma_id id for subsetting the pums 
#' @return pums with only relevant 
subset_pums <- function(pums_h, pums_p, marginals, puma_id) {
  # Subset PUMS to only marginal and ID variables ----
  var_names <- names(marginals)
  id_names <- c("SERIALNO", "puma_id", "RELP")
  household_inds <- which(names(pums_h) %in% c(id_names, var_names))
  people_inds <- which(names(pums_p) %in% c(id_names, var_names))
  stopifnot(length(c(household_inds, people_inds)) == length(var_names) + (2 * length(id_names)) - 1)
  
  pums_h_ipf <- pums_h[, household_inds]
  pums_p_ipf <- pums_p[, people_inds]
  stopifnot(all.equal(pums_h_ipf, pums_h[, household_inds]))
  
  # Only keep reference people in person level PUMS 
  pums_p_ipf <- pums_p_ipf[which(pums_p_ipf$RELP == 0), ]
  
  # Join the people variables onto the household vars 
  pums_h_ipf$row_index <- 1:nrow(pums_h_ipf)
  pums <- plyr::join(x = pums_h_ipf, y = pums_p_ipf, type = "left", by = "SERIALNO")
  stopifnot(pums$row_index == 1:nrow(pums_h_ipf))
  stopifnot(all.equal(pums[, 1:length(household_inds)], pums_h[, household_inds]))

  # Remove row index and return the subsetted pums 
  pums <- pums[, -which(duplicated(names(pums)))]
  pums <- pums[, which(names(pums) %in% c(var_names, id_names))]
  
  return(pums)
}

#' Match the pums variables with marginal totals 
#' 
#' @param pums dataframe subsetted to only the marginal vars
#' @param marginals list containing all of the marginal totals 
#' @param suffix what we add to the variable name to create the new variable name.  Default is "_marg"
#' @export
#' @return pums dataframe with the marginal columns binded on
align_pums <- function(pums, marginals, suffix="_marg") {
  var_names <- names(marginals)
  # Loop through each marginal variable, extract the lookup 
  # table, create a new column with the marginal totals 
  # corresponding to the lookup, and add this column to pums 
  for (var in var_names) {
    marg <- marginals[[var]]
    marg_lookup <- marg$lookup
    num_cats <- nrow(marg_lookup)
    new_col <- rep(NA, nrow(pums))    
    
    # Create new column based on the marginal lookup 
    for (row in 1:num_cats) {
      lookup_row <- marg_lookup[row, ]
      marg_rows <- which(pums[, var] >= lookup_row$lower & pums[, var] <= lookup_row$upper)
      new_col[marg_rows] <- lookup_row$marg_names
    }
    # Turn the new column into an ordered factor to match 
    # the ordering of the marginal data-frame 
    new_col <- factor(new_col, levels = marg_lookup$marg_names)

    # Append new column to the pums 
    pums <- data.frame(pums, new_col)
    names(pums)[ncol(pums)] <- paste0(var, suffix)
  }
  
  return(pums)
}

#' Fill marginal contingency with ipf
#' 
#' @param pums dataframe of aligned pums data
#' @param marginals list with marginal totals 
#' @param place_id ID saying which column to extract 
#' the marginals from 
#' @param n_house number of households to sample
#' 
#' @return table a data-frame containing all of the 
#' marginal combinations and their frequencies 
fill_cont_table <- function(pums, marginals, place_id, n_house) {

  # Initialize the seed table 
  marg_col_inds <- grep(pattern = "_marg", names(pums))
  marg_cols <- pums[, marg_col_inds, drop = FALSE]
  seed <- table(marg_cols)
  
  # Extract and scale target_data and target_list 
  targets <- get_targets(marg_cols, marginals, place_id, n_house)
  target_data <- targets$target_data
  target_list <- targets$target_list

  # Fill contingency table with ipf 
  ipf_fit <- mipfp::Ipfp(seed = seed, target.list = target_list, target.data = target_data)
  ipf_tab <- ipf_fit$x.hat

# # Write out the IPF Tables for analysis 
# tab_loc <- file.path("/home/lee/Dropbox/ipf_midas/synthpop_ipf_unif/tables", paste0("table_", place_id))
# saveRDS(object = ipf_tab, file = tab_loc)
    table <- as.data.frame(ipf_tab)
    if(length(dim(ipf_tab)) <= 1){
        colnames(table)[1] <- paste0(names(marginals), "_marg")
    }
  stopifnot(sum(table$Freq) - n_house < 1)

  # Round frequencies to whole numbers, update so
  # it matches the n_house total # of households 
  table$Freq <- round(table$Freq, digits = 0)
  
  # In the rare case where none of the frequncies round up 
  # to one, randomly select n_house of the highest 90th percent
  # quantile...
  if (all(table$Freq == 0)) {
    table <- as.data.frame(ipf_tab)
    high_freq_val <- quantile(table$Freq, probs = .9)
    remaining_inds <- which(table$Freq > high_freq_val)
    inds <- sample(x = remaining_inds, 
                   size = n_house, 
                   replace = TRUE,
                   prob = table$Freq[remaining_inds])
    ind_tab <- table(inds)
    table$Freq[as.numeric(names(ind_tab))] <- as.numeric(ind_tab)
    table$Freq <- round(table$Freq, digits = 0)
  }

  while (sum(table$Freq) != n_house) {
    table$Freq <- update_freqs(table$Freq, n_house)
  }

  return(table)  
}

#' Update frequencies to match # of households 
#' 
#' @param freqs current number of households 
#' @param n_house number of households 
#' 
update_freqs <- function(freqs, n_house) {
  n <- sum(freqs)
  update_ind <- sample(x = which(freqs > 0), size = 1, replace = TRUE)  
  if (n < n_house) {
    freqs[update_ind] <- freqs[update_ind] + 1
  } else if (n > n_house) {
    freqs[update_ind] <- freqs[update_ind] - 1
  }
  
  return(freqs)
}

#' Obtain the target marginals for IPF 
#' 
#' @param marg_cols dataframe of marginal columns 
#' @param marginals list of marginal data 
#' @param place_id id corresponding to the rows of marginals 
#' @param n_house number of households in this sample 
#' 
#' @return list with target_data and target list for this particular
#' place_id 
get_targets <- function(marg_cols, marginals, place_id, n_house) {
  number_vars <- ncol(marg_cols)
  target_data <- vector(mode = "list", length = number_vars)
  target_list <- vector(mode = "list", length = number_vars)
  
  var_names <- names(marg_cols)
  marg_names <- gsub(pattern = "_marg", "", x = var_names)
  for (var in marg_names) {
    # Extract the corresponding marginal row for this variable + place 
    marg_df <- marginals[[var]]$df
    place_row <- which(marg_df$place_id == place_id)
    marg_row <- marg_df[place_row, -1]

    # If all the marginals are 0, set the marginal row 
    # to the average for the state 
    if (all(marg_row == 0)) {
      marg_avg <- unlist(lapply(marg_df, mean)[-1])
      marg_row[1, ] <- marg_avg
    }
    
    # Scale the marginals by the n_house 
    marg_row_props <- as.numeric(marg_row) / sum(as.numeric(marg_row))
    marg_row_scaled <- n_house * marg_row_props
    
    # Update target data and list 
    dim <- which(var == marg_names)
    target_list[[dim]] <- dim
    target_data[[dim]] <- marg_row_scaled
  }
  # Verify that all the marginal data add up to the population total 
  stopifnot(all((unlist(lapply(target_data, sum)) - n_house) < 1))

  return(list(target_list = target_list, target_data = target_data))  
}

#' Sample from pums 
#' @param pums dataframe with marginal columns 
#' @param table dataframe with marginal combinations and frequencies 
#' @param alpha number between 0 and 1, weight of categorical variables 
#' @param k number between 0 and 1, weight of
#' orginal variables 
#' @param marginals list with marginal data 
#' 
#' @return indices of sampled households 
sample_with_cont <- function(pums, table, alpha, k, marginals) {


  # Remove marginal combinations with 0 frequencies 
  table <- table[which(table$Freq != 0), ]
  
  # Create a weight matrix for every contingency 
  # table/pums row combination 
  weights <- assign_weights(pums = pums, table = table, alpha = alpha, 
                            k = k, marginals = marginals)
  
  # Sample the correct frequency for every non-zero marginal 
  # combination, using the weights 
  inds <- list()
  n <- nrow(table)
  
  for (i in 1:n) {  
    m <- table[i, "Freq"]
    m_weights <- weights[, i]
    inds[[i]] <- samp_ipf_inds(m, m_weights) 
  }
  inds <- unlist(inds)
  
  return(inds)  
}

samp_ipf_inds <- function(n, weights) {
  # Replace NA weights with 0's 
  weights[is.na(weights)] <- 0
  
  # Sample n inds using the weights 
  m <- length(weights)
  sample_inds <- sample(1:m, n, prob = weights, replace = TRUE)    
  stopifnot(length(sample_inds) == n)
  
  return(sample_inds)
}

#' Assign weights for ipf-based sampling 
#' 
#' @param pums dataframe with marginal columns 
#' @param table dataframe with marginal combinations and frequencies 
#' @param alpha number between 0 and 1, weight of categorical variables 
#' @param k number between 0 and 1, weight of orginal variables 
#' @param marginals list with marginal data 
#' 
#' @return weight matrix, rows = rows in pums, columns = rows in table 
assign_weights <- function(pums, table, alpha, k, marginals) {
  # Initialize a distance matrix 
  n <- nrow(pums)
  m <- nrow(table)
  dist_mat <- matrix(0, nrow = n, ncol = m)  
  # Calculate the weights for each column of the distance matrix 
    for (i in 1:m) {
        col_weights <- calc_dists(pums = pums, table_row = table[i, ], 
                                  alpha = alpha, k = k, marginals = marginals)
    dist_mat[, i] <- col_weights
  }
  
  # Convert the distance matrix into probability matrix 
  pums_dist_sums <- colSums(dist_mat, na.rm = TRUE)
  sum_df <- matrix(rep(pums_dist_sums, n), ncol = m, byrow = TRUE)
  probs <- dist_mat / sum_df
  
  return(probs)
}


#' Calculate distance b/w cont table row and pums 
#' 
#' @param pums dataframe with marginal columns 
#' @param table_row row of table to calculate distance 
#' @param alpha number between 0 and 1, weight of categorical variables 
#' @param k number between 0 and 1, weight of orginal variables 
#' @param marginals list with marginal data 
#' 
calc_dists <- function(pums, table_row, alpha, k, marginals) {
  # Get the ordinal and categorical variable names
  var_types <- lapply(marginals, function(x) x[["type"]])
  ord_vars <- names(which(var_types == "ord"))
  cat_vars <- names(which(var_types == "cat"))
  ord_df <- 1
  cat_df <- 1
   
  # Get ordinal distances 
  if (length(ord_vars) > 0) {
    ord_df <- sapply(1:length(ord_vars), get_ord_dists, ord_vars, table_row, pums, alpha, k)
  }
 
  # Get categorical distances 
  if (length(cat_vars) > 0) {
    cat_df <- sapply(1:length(cat_vars), get_cat_dists, cat_vars, table_row, pums, alpha, k)
  }

  # Combine ordinal and categorical distances and return   
  df <- cbind(ord_df, cat_df)
  dists <- apply(df, 1, prod)
  
  return(dists)
}

get_ord_dists <- function(ind, ord_vars, table_row, pums, alpha, k) {
  # Extract the orginal variable from pums 
  ord_var_name <- paste0(ord_vars[ind], "_marg")  
  pums_var <- pums[, ord_var_name]
  stopifnot(class(pums_var) == "factor")
  
  # Calculate the range of the ordinal variable 
  min_level <- min(as.integer(pums_var), na.rm = TRUE)
  max_level <- max(as.integer(pums_var), na.rm = TRUE)
  r <- max_level - min_level
  
  dip <- as.integer(pums_var)
  dic <- as.integer(table_row[ord_var_name])
  dist <- 1 - abs((dip - dic) / r)^k
     
  return(dist)
}

get_cat_dists <- function(ind, cat_vars, table_row, pums, alpha, k) {

  # Get PUMS rows of categorical val 
  cat_var_name <- paste0(cat_vars[ind], "_marg")
  dip <- as.character(pums[, cat_var_name])
  
    ## Get value of cat val, calculate the distance and return
    cat_val <- table_row[cat_var_name]
  dic <- lapply(cat_val, as.character)[[1]]
  dist <- ifelse(dip == dic, alpha, 1 - alpha) 
  dist <- 1 - dist
  
  return(dist)
}

Try the spew package in your browser

Any scripts or data that you put into this service are public.

spew documentation built on Nov. 17, 2017, 7:36 a.m.