# R/initialization_functions.R

#' Prepare Data
#' 
#' Prepares and packages the data used for estimation by FactorHet. Called
#' internally by `FactorHet_mbo`; rarely called directly by the user.
#' 
#' @keywords internal
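#' 
#' @details A minimal internal-usage sketch (hypothetical objects `df`, `Y`,
#'   `F1`, `F2`, and `id`; the grouping variable is assumed to be supplied as a
#'   one-sided formula, as in `FactorHet`):
#' \preformatted{
#' ## hypothetical illustration; not run
#' dat <- prepare_regression_data(Y ~ F1 + F2, design = df, group = ~ id)
#' class(dat)  # "FactorHet_data"
#' }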
prepare_regression_data <- function(formula, design, moderator = NULL, 
  group = NULL, task = NULL, 
  choice_order = NULL, weights = NULL,
  single_intercept = NULL,
  forced_randomize = NULL,
  rare_threshold = NULL, weight_dlist = FALSE,
  rare_verbose = NULL){
  
  default_options <- FactorHet_control()
  if (is.null(forced_randomize)){
    forced_randomize <- default_options$forced_randomize
  }
  if (is.null(rare_threshold)){
    rare_threshold <- default_options$rare_threshold
  }
  if (is.null(rare_verbose)){
    rare_verbose <- default_options$rare_verbose
  }
  
  # Prepare the data
  prep_data <- prepare_formula(fmla_main = formula, fmla_moderator = moderator, weights = weights,
                               group = group, task = task, choice_order = choice_order, design = design)
  
  do_interactions <- prep_data$do_interactions
  design <- prep_data$design
  y <- design[[prep_data$outcome]]
  
  weights <- prep_data$weights
  
  if (!is.null(prep_data$name_weights)){
    if (prep_data$name_weights %in% names(design)){
      stop('Name of "weights" duplicated in design factors or outcome. Rename "weights"!')
    }
    
    design[[prep_data$name_weights]] <- weights
  }
  
  group <- data.frame(design[prep_data$name_group])
  task <- data.frame(design[prep_data$name_task])
  choice_order <- data.frame(design[prep_data$name_choice_order])
  
  
  if (ncol(group) == 0){group <- NULL}else{group <- as.character(as.vector(group[,1]))}
  if (ncol(task) == 0){task <- NULL}else{task <- as.character(as.vector(task[,1]))}
  if (ncol(choice_order) == 0){choice_order <- unique_choice <- NULL}else{
    choice_order <- as.character(as.vector(choice_order[,1]))
    unique_choice <- sort(unique(choice_order))
    if (any(is.na(choice_order))){stop('No missingness permitted for "choice_order"')}
    if (length(unique_choice) != 2){stop('choice_order must have exactly two unique values indexing the "left" and "right" profiles.')}
    
  }
  
  conjoint_names <- prep_data[c('name_group', 'name_task', 'name_choice_order')]
  factor_names <- prep_data$factor_names
  formula_recons <- prep_data$formula_recons
  formula_mod <- prep_data$formula_mod
  
  rm(prep_data)
  
  if (is.null(group)){
    group <- 1:nrow(design)
    null_group <- TRUE
  }else{
    null_group <- FALSE
  }
  unique_group <- unique(group)
  if (any(is.na(group))){stop('No missingness permitted for "group"')}
  n_G <- length(unique(group))
  
  #Build the moderator variables
  if (is.null(moderator)){
    moderator <- ~ 1
  }
  
  
  concom_W <- create_moderator(design = design, moderator = moderator, group = group, unique_group = unique_group)
  W <- concom_W$W
  args_W <- concom_W$args_W
  ncol_W <- concom_W$ncol_W
  W_full <- concom_W$W_full
  
  #From the design, get the level of each factor
  # factor_names <- enquo(factor_names)
  # factor_levels <- select(.data = as.data.frame(design), !! factor_names)
  factor_levels <- lapply(as.data.frame(design)[,factor_names], FUN=function(i){
    if (any(class(i) == 'factor')){
      return(levels(i))
    }else{
      return(sort(unique(i)))
    }
  })
  ordered_factors <- sapply(as.data.frame(design)[,factor_names], FUN=function(i){
    is.ordered(i)
  })
  
  #Create the sparsity penalty
  #and the ANOVA sum to zero constraints
  penalty_for_regression <- create_penalty(factor_levels, weight_dlist = weight_dlist,
                                           ordered_factors = ordered_factors,
                                           make_interactions = do_interactions)
  X <- create_data(design, penalty_for_regression)
  
  #Remove rare levels (i.e. below rare_threshold # of observations)
  rare_output <- remove_rare_levels(X = X, penalty_for_regression = penalty_for_regression, 
                             rare_threshold = rare_threshold, verbose = rare_verbose)
  
  X <- rare_output$X
  penalty_for_regression <- rare_output$penalty_for_regression
  rare_col <- rare_output$rare
  rare_fmt_col <- rare_output$fmt_rare
  rare_unstd_col <- rare_output$unstd_rare
  
  rm(rare_output)
  
  if (length(factor_names) > 1){
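    # For each pair of factors, flag level combinations that never co-occur in
    # the design; these are added to the randomization restrictions alongside
    # the rare columns.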
    add_restrictions <- c()
    combo_factors <- combn(factor_names, 2)
    for (i in 1:ncol(combo_factors)){
      combo_i <- combo_factors[,i]
      tab_cols <- table(design[[combo_i[1]]], design[[combo_i[2]]])
      if (any(tab_cols == 0)){
        
        ind_restrict <- which(tab_cols == 0, arr.ind = TRUE)
        ind_restrict <- cbind(rownames(tab_cols)[ind_restrict[,'row']],
              colnames(tab_cols)[ind_restrict[,'col']])
        ind_restrict <- paste(combo_i[1], '(', ind_restrict[,1], ')-', combo_i[2], '(', ind_restrict[,2] ,')', sep = '')
        add_restrictions <- c(add_restrictions, ind_restrict)
      }
    }
    
    # Are there any additional restrictions?
    new_restrictions <- setdiff(add_restrictions, rare_fmt_col)
    if (length(new_restrictions) > 0){
      message('Adding ', length(new_restrictions), ' additional randomization restrictions because there are no observations at the intersection of these levels.')
      message(paste0(new_restrictions, collapse='\n'))
      rare_fmt_col <- c(rare_fmt_col, new_restrictions)
    }
  }

  Fmatrix <- penalty_for_regression[["F"]]
  Dlist <- penalty_for_regression$D
  constraint <- penalty_for_regression$constraint
  basis_M <- penalty_for_regression$constraint_basis
  
  term_position <- penalty_for_regression$term_position
  
  coef_names <- penalty_for_regression$coef
  
  
  #Clip values that are numerically indistinguishable from zero (tolerance of
  #roughly half the decimal digits of machine precision, at least 7 digits)
  clip_tiny <- floor(-1/2*log(.Machine$double.eps)/log(10))
  if (clip_tiny < 7){
    clip_tiny <- 7
  }

  basis_M <- drop0(absolute_zap(basis_M, clip_tiny))
  #If choice order is provided, set up the forced-choice (conjoint) outcome
  if (!is.null(choice_order)){
    
    if (null_group){stop('Group must be provided for forced choice')}
    forced_output <- make_forced_choice(y = y, X = X, group = group, task = task, 
      choice_order = choice_order, unique_choice = unique_choice,
      weights = weights,
      estimate_intercept = TRUE, #require intercept
      randomize = forced_randomize)
    X <- forced_output$X
    y <- forced_output$y
    weights <- forced_output$weights
    group <- forced_output$group
    task <- forced_output$task
    
    rm(forced_output)
    gc()
    use_forced_choice <- TRUE
  }else{
    use_forced_choice <- FALSE
  }
  
  if (is.null(single_intercept)){#If not provided (default),
    #then do a varying intercept if factorial and
    #single intercept if conjoint (forced choice)
    single_intercept <- use_forced_choice
  }
  group_mapping <- sparseMatrix(i = 1:nrow(X), j = match(group, unique_group), x = 1)
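  # Every object remaining in the local environment is collected below into the
  # returned 'FactorHet_data' list.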

  data_output <- mget(ls())
  class(data_output) <- 'FactorHet_data'
  return(data_output)
}

prepare_deterministic_init <- function(data, K, method, iterations = NULL){
  
  if (!inherits(data, 'FactorHet_data')){
    stop('prepare_deterministic_init requires a "FactorHet_data" object from prepare_regression_data().')
  }
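  # Restrict the design matrix to the intercept and main-effect columns (one
  # column per factor level); interaction columns are not used for
  # initialization.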
  
  main_X <- data$X
  levels_X <- c('(Intercept)', unlist(mapply(names(data$factor_levels), data$factor_levels, SIMPLIFY = FALSE, FUN=function(i,j){paste0(i, '(', j, ')')})))
  if (length(setdiff(levels_X, colnames(main_X))) != 0){
    stop('Some expected main-effect columns are missing from X.')
  }
  main_X <- main_X[ , colnames(main_X) %in% levels_X]
  
  if (K == 1){
    out <- matrix(1, nrow = length(data$unique_group))
  }else if (method == 'forest'){
    out <- forest_init(y = data$y, X = main_X, 
        G = data$group_mapping, W = data$W, K = K)
  }else if (method == 'spectral'){
    out <- spectral_init(data$W, K)
  }else if (method == 'mclust'){
    out <- mclust_init(data$W, K)
  }else if (grepl(method, pattern='^mm')){
    
    if (grepl(method, pattern='^mm_spectral')){
      mm_method <- 'spectral'
    }else if (grepl(method, pattern='^mm_mclust')){
      mm_method <- 'mclust'
    }else if (grepl(method, pattern='^mm_forest')){
      mm_method <- 'forest'
    }else{stop('"mm_" initializations must end in "spectral", "mclust", or "forest".')}
    
    out <- murphy_murphy_initialize(y = as.numeric(data$y), X = main_X, W = data$W, K = K,
      group_mapping = data$group_mapping, method = mm_method,
      weights = data$weights, iterations = iterations,
      probabilistic = grepl(method, pattern='prob'))

  }else{stop('Invalid deterministic init method')}
  
  out <- list('group_E.prob' = data.frame(
    group = data$unique_group, out))
  
  return(out)
}
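
# A minimal usage sketch (hypothetical; assumes `dat` was created by
# prepare_regression_data and K = 2 clusters are desired):
#   init <- prepare_deterministic_init(dat, K = 2, method = 'mm_mclust')
#   head(init$group_E.prob)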

#' @importFrom stats sd kmeans
spectral_init <- function(W, K){
  if (ncol(W) == 1 & all(W[,1] == 1)){
    message('spectral initialization requires moderators. Using "random_member" instead.')
    group_E.prob <- sparseMatrix(
      i = 1:nrow(W), j = sample(1:K, nrow(W), replace = TRUE),
      x = 1
    )
    return(group_E.prob)
  }
  if (!requireNamespace('FNN', quietly = TRUE) || !requireNamespace('RSpectra', quietly = TRUE)){
    stop('Spectral initialization requires "FNN" and "RSpectra" installed.')
  }

  scale_W <- apply(W, MARGIN = 2, FUN=function(i){
    if (sd(i) == 0){
      return(rep(0, length(i)))
    }else{
      return(scale(i))
    }
  })
  cov_W <- var(scale_W)
  diag(cov_W)[which(diag(cov_W) == 0)] <- 1
  cov_W <- eigen(cov_W)
  scale_W <- scale_W %*% (cov_W$vectors) %*% diag(x = 1/sqrt(cov_W$values))
  
  check_FNN <- requireNamespace('FNN', quietly = TRUE)
  if (!check_FNN){
    stop('spectral initialization requires FNN')
  }else{
    knn_W <- FNN::get.knn(scale_W, k=10)
  }

  term_1 <- data.frame(expand.grid(Var1 = 1:nrow(knn_W$nn.index), 
    Var2 = 1:ncol(knn_W$nn.index), stringsAsFactors = FALSE))
  term_1$index <- as.vector(knn_W$nn.index)

  term_2 <- data.frame(expand.grid(Var1 = 1:nrow(knn_W$nn.dist), 
    Var2 = 1:ncol(knn_W$nn.dist), stringsAsFactors = FALSE))
  term_2$dist <- as.vector(knn_W$nn.dist)

  knn_W <- merge(term_1, term_2, by = c('Var1', 'Var2'))
  # Old version that uses reshape2
  # aaa <- merge(melt(knn_W$nn.index, value.name = 'index'),
  #                    melt(knn_W$nn.dist, value.name = 'dist'), 
  #                    by = c('Var1', 'Var2'))
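  # Build a mutual kNN adjacency matrix: after symmetrizing, entries equal to 2
  # mark pairs that appear in each other's neighbor lists; all other edges are
  # dropped.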
  graph_W <- sparseMatrix(i = knn_W$Var1, j = knn_W$index, x = 1, dims = c(nrow(scale_W), nrow(W)))
  graph_W <- graph_W + t(graph_W)
  graph_W@x[graph_W@x != 2] <- 0
  # graph_W@x[graph_W@x == 1] <- 2
  graph_W <- drop0(graph_W)
  graph_W <- graph_W/2
  graph_W <- as(graph_W, 'dgTMatrix')
  graph_W <- with(attributes(graph_W), cbind(i, j) + 1)
  knn_W$id <- paste(knn_W$Var1, knn_W$index, sep = '-')
  rotate_K <- knn_W
  rotate_K$id <- paste(knn_W$index, knn_W$Var1, sep = '-')
  knn_W <- rbind(knn_W, rotate_K)
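  # Weight each retained edge with a Gaussian kernel of its kNN distance,
  # exp(-d^2 / 2).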
  weight <- exp(-1/2 * knn_W$dist[match(paste(graph_W[,1], graph_W[,2], sep = '-'), knn_W$id)]^2)
  graph_W <- sparseMatrix(i = graph_W[,1], j = graph_W[,2], x = weight,
                          dims = c(nrow(scale_W), nrow(W)))
  diag(graph_W)[] <- 1
  #Un-normalized graph Laplacian
  graph_L <- Diagonal(x = rowSums(graph_W)) - graph_W
  method <- 'ShiMalik'    
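  # Shi-Malik (random-walk) normalization: L_rw = D^{-1} (D - W); the K
  # eigenvectors with the smallest-magnitude eigenvalues form the spectral
  # embedding.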
  if (method == 'ShiMalik'){
    L_norm <- Diagonal(x= 1/rowSums(graph_W)) %*% graph_L 
    eigen_L <- RSpectra::eigs(A = L_norm, k = K, which = 'SM')$vectors
  }
  #Separate "real" and imaginary component.
  #Shouldn't be much imaginary but...
  eigen_L <- cbind(Re(eigen_L), Im(eigen_L))
  eigen_only <- eigen_L
  
  eigen_L <- cbind(scale_W, eigen_L)
  
  rotate_L <- apply(eigen_L, MARGIN = 2, FUN=function(i){
    if (sd(i) == 0){
      return(rep(0, length(i)))
    }else{
      return(scale(i))
    }
  })
  rotate_L <- rotate_L[, which(apply(rotate_L, MARGIN = 2, FUN=function(i){all(abs(i) < 1e-6)}) != TRUE)]
  
  cov_W <- var(rotate_L)
  diag(cov_W)[which(diag(cov_W) == 0)] <- 1
  cov_W <- eigen(cov_W)
  cov_W$values[cov_W$values < 0] <- 0
  fmt_eigen <- ifelse(cov_W$values <= 0, 0, 1/sqrt(cov_W$values))
  rotate_L <- rotate_L %*% (cov_W$vectors) %*% diag(x = fmt_eigen)
  rotate_L <- rotate_L[, which(apply(rotate_L, MARGIN = 2, FUN=function(i){all(abs(i) < 1e-6)}) != TRUE)]
  
  eigen_kmeans <- kmeans(eigen_only, centers = K, 
   iter.max = 50, 
   nstart = 5000)
  
  group_E.prob <- as.matrix(sparseMatrix(i = 1:nrow(W), j = eigen_kmeans$cluster, x = 1))

  return(group_E.prob)
}

mclust_init <- function(W, K){
  
  if (ncol(W) == 1 & all(W[,1] == 1)){
    message('mclust initialization requires moderators. Using "random_member" instead.')
    group_E.prob <- sparseMatrix(
      i = 1:nrow(W), j = sample(1:K, nrow(W), replace = TRUE),
      x = 1
    )
    return(group_E.prob)
  }
  
  if (requireNamespace('mclust', quietly = TRUE)){
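    # Model-based agglomerative hierarchical clustering with unconstrained
    # ('VVV') covariances on the moderators, cut into K groups.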
    hcVVV <- mclust::hcVVV
    fit_hc <- mclust::hc(data = W, modelName = 'VVV', use = 'SPH')
    classif <- as.vector(mclust::hclass(fit_hc, G = K))
    group_E.prob <- sparseMatrix(i = 1:nrow(W), j = classif, x = 1)
    group_E.prob <- as.matrix(group_E.prob)
  }else{
    stop('"mclust" initialization requires "mclust" installed.')
  }
  
  return(group_E.prob)
}

forest_init <- function(y, X, G, W, K){
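  # Initialization via random forests: for each column of the design matrix,
  # estimate a group-level heterogeneous "effect" of that column (predicted
  # probability at +1 minus at -1, given the moderators), then run k-means on
  # the matrix of estimated effects to form K clusters.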
  use_ranger <- requireNamespace('ranger', quietly = TRUE)
  if (use_ranger){
    big_W <- as.matrix(G %*% W)
    easy_message('Starting Random Forests')
    run_cf <- sapply(1:ncol(X), FUN=function(i){
      est_ranger <- ranger::ranger(factor(y) ~ ., data = data.frame(cbind(big_W, treat = X[,i])), probability = TRUE)
      pred_1 <- predict(est_ranger, data = data.frame(cbind(W, treat = 1)))$prediction[,"1"]
      pred_0 <- predict(est_ranger, data = data.frame(cbind(W, treat = -1)))$prediction[,"1"]
      return(pred_1 - pred_0)
    })
    colnames(run_cf) <- colnames(X)
    message('Performing K-Means on Univariate Heterogeneous Effects')
    fit_km <- kmeans(x = run_cf, centers = K, iter.max = 150, nstart = 2000)
    message('Completed Forest Initialization')
    assign <- as.matrix(sparseMatrix(i = 1:nrow(W), j = fit_km$cluster, x = 1))
    return(assign)
    # return(list(assign = assign, km = fit_km, causal = run_cf))
  }else{stop('"ranger" must be installed for "forest" initialization.')}
}

# Initialization Based on Murphy/Murphy (2020); see the appendix of Goplerud et
# al. for discussion.
#' @importFrom stats median
murphy_murphy_initialize <- function(y, X, W, K, group_mapping, weights,
                                     method = 'mclust', iterations = NULL,
                                     probabilistic = FALSE){
  if (is.null(iterations)){
    if (probabilistic){iterations <- 100}else{iterations <- 50}
  }
  if (K == 1){stop('No initialization needed or permitted if K = 1')}
  # Step 1: Cluster based on moderators 
  if (method == 'forest'){
    classif <- forest_init(y = y, X = X, K = K, G = group_mapping, W = W) %*% 1:K
  }else if (method == 'mclust'){
    if (requireNamespace('mclust', quietly = TRUE)){
      if (ncol(W) == 1 & all(W[,1] == 1)){
        message('mclust initialization requires moderators. Using "random_member" instead.')
        classif <- sample(1:K, nrow(W), replace = TRUE)
      }else{
        classif <- mclust_init(W, K) %*% 1:K
      }
    }else{
      stop('"mclust" initialization requires "mclust" installed.')
    }
  }else if (method == 'spectral'){
    if (ncol(W) == 1 & all(W[,1] == 1)){
      message('spectral initialization requires moderators. Using "random_member" instead.')
      classif <- sample(1:K, nrow(W), replace = TRUE)
    }else{
      classif <- spectral_init(W, K)
      classif <- rowSums(classif %*% Diagonal(x = 1:K))
    }
  }else{stop('Invalid method for Murphy/Murphy initialization')}
  init_classif <- classif
  
  if (probabilistic){
    init_classif <- classif <- as.matrix(sparseMatrix(i = 1:nrow(W), j = classif, x = 1))
  }
  # Step 2a: Loop over logistic regression and reclassify
  counter <- 0
  converge <- FALSE
  while (counter < iterations & converge == FALSE){
    
    if (probabilistic){
      group_E.prob <- classif
    }else{
      group_E.prob <- as.matrix(sparseMatrix(i = 1:nrow(W), j = classif, x = 1))
    }
    
    obs.E.prob <- apply(group_E.prob, MARGIN = 2, FUN=function(i){as.vector(group_mapping %*% i)})
    
    if (probabilistic){
      
      beta <- sapply(list_from_cols(obs.E.prob), FUN=function(k){
        simple_logit(y = y, X = X, obs.E.prob = k, iterations = 15,
                     weights = weights,
                     beta_method = 'cpp', beta_cg_it = 10, prec_ridge = 1/4)
      })
      
    }else{
      beta <- sapply(list_from_cols(obs.E.prob), FUN=function(k){
        simple_logit(y = y[which(k == 1)], X = X[which(k == 1),], iterations = 25, 
                     weights = weights[which(k == 1)],
                     beta_method = 'cpp', beta_cg_it = 10, prec_ridge = 1/4)
      })
    }
    rownames(beta) <- NULL
    pmat <- plogis(as.matrix(X %*% beta))
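    # Per-observation log-likelihood under each cluster's coefficients (the
    # object is named 'brier' but stores log-likelihood contributions), summed
    # within groups below.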
    brier <- apply(pmat, MARGIN = 2, FUN=function(i){ifelse(y == 1, log(i), log(1-i))})
    sum_brier <- t(group_mapping) %*% brier
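    # Reassignment: hard classification to the best-fitting cluster, or a
    # softmax over group-level log-likelihoods in the probabilistic variant.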
    
    if (probabilistic){
      new_classif <- softmax_matrix(as.matrix(sum_brier))
      
      change <- max(abs(classif - new_classif))
      median_change <- median(abs(classif - new_classif))
      if (change < 1e-2){
        converge <- TRUE; break
      }
    }else{
      
      new_classif <- apply(sum_brier, MARGIN = 1, which.max)
      change <- sum(classif != new_classif)
      if (change == 0){converge <- TRUE; break}
      
    }
    counter <- counter + 1
    classif <- new_classif
  }
  
  if (converge == FALSE){
    message('Murphy/Murphy initialization did not fully converge.')
    if (probabilistic){
      message(paste0('The largest change in any cluster probability was ', round(change, 2)))
      message(paste0('The median change in cluster probability was ', round(median_change, 3)))
    }else{
      message(paste0(change, ' observations out of ', length(classif), ' changed cluster membership at the final iteration.'))
    }
  }
  
  if (probabilistic){
    group_E.prob <- as.matrix(new_classif)
    
  }else{
    group_E.prob <- sparseMatrix(i = 1:nrow(W), j = classif, x = 1)
    group_E.prob <- as.matrix(group_E.prob)
    
  }
  return(group_E.prob)
}
