# R/initialization_functions.R

#' Prepare Data
#' 
#' Prepares and packages the data used for estimation by FactorHet. Called
#' internally by `FactorHet_mbo`; rarely called directly by the user.
#' 
#' @keywords internal
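#' 
#' @details A minimal internal-usage sketch (hypothetical objects `df`, `Y`,
#'   `F1`, `F2`, and `id`; the grouping variable is assumed to be supplied as a
#'   one-sided formula, as in `FactorHet`):
#' \preformatted{
#' ## hypothetical illustration; not run
#' dat <- prepare_regression_data(Y ~ F1 + F2, design = df, group = ~ id)
#' class(dat)  # "FactorHet_data"
#' }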
prepare_regression_data <- function(formula, design, moderator = NULL, 
  group = NULL, task = NULL, 
  choice_order = NULL, weights = NULL,
  single_intercept = NULL,
  forced_randomize = NULL,
  rare_threshold = NULL, weight_dlist = FALSE,
  rare_verbose = NULL){
  
  default_options <- FactorHet_control()
  if (is.null(forced_randomize)){
    forced_randomize <- default_options$forced_randomize
  }
  if (is.null(rare_threshold)){
    rare_threshold <- default_options$rare_threshold
  }
  if (is.null(rare_verbose)){
    rare_verbose <- default_options$rare_verbose
  }
  
  # Prepare the data
  prep_data <- prepare_formula(fmla_main = formula, fmla_moderator = moderator, weights = weights,
                               group = group, task = task, choice_order = choice_order, design = design)
  
  do_interactions <- prep_data$do_interactions
  design <- prep_data$design
  y <- design[[prep_data$outcome]]
  
  weights <- prep_data$weights
  
  if (!is.null(prep_data$name_weights)){
    if (prep_data$name_weights %in% names(design)){
      stop('Name of "weights" duplicated in design factors or outcome. Rename "weights"!')
    }
    
    design[[prep_data$name_weights]] <- weights
  }
  
  group <- data.frame(design[prep_data$name_group])
  task <- data.frame(design[prep_data$name_task])
  choice_order <- data.frame(design[prep_data$name_choice_order])
  
  
  if (ncol(group) == 0){group <- NULL}else{group <- as.character(as.vector(group[,1]))}
  if (ncol(task) == 0){task <- NULL}else{task <- as.character(as.vector(task[,1]))}
  if (ncol(choice_order) == 0){choice_order <- unique_choice <- NULL}else{
    choice_order <- as.character(as.vector(choice_order[,1]))
    unique_choice <- sort(unique(choice_order))
    if (any(is.na(choice_order))){stop('No missingness permitted for "choice_order"')}
    if (length(unique_choice) != 2){stop('choice_order must have exactly two unique values indexing the "left" and "right" profiles.')}
    
  }
  
  conjoint_names <- prep_data[c('name_group', 'name_task', 'name_choice_order')]
  factor_names <- prep_data$factor_names
  formula_recons <- prep_data$formula_recons
  formula_mod <- prep_data$formula_mod
  
  rm(prep_data)
  
  if (is.null(group)){
    group <- 1:nrow(design)
    null_group <- TRUE
  }else{
    null_group <- FALSE
  }
  unique_group <- unique(group)
  if (any(is.na(group))){stop('No missingness permitted for "group"')}
  n_G <- length(unique(group))
  
  #Build the moderator variables
  if (is.null(moderator)){
    moderator <- ~ 1
  }
  
  
  concom_W <- create_moderator(design = design, moderator = moderator, group = group, unique_group = unique_group)
  W <- concom_W$W
  args_W <- concom_W$args_W
  ncol_W <- concom_W$ncol_W
  W_full <- concom_W$W_full
  
  #From the design, get the level of each factor
  # factor_names <- enquo(factor_names)
  # factor_levels <- select(.data = as.data.frame(design), !! factor_names)
  factor_levels <- lapply(as.data.frame(design)[,factor_names], FUN=function(i){
    if (any(class(i) == 'factor')){
      return(levels(i))
    }else{
      return(sort(unique(i)))
    }
  })
  ordered_factors <- sapply(as.data.frame(design)[,factor_names], FUN=function(i){
    is.ordered(i)
  })
  
  #Create the sparsity penalty
  #and the ANOVA sum to zero constraints
  penalty_for_regression <- create_penalty(factor_levels, weight_dlist = weight_dlist,
                                           ordered_factors = ordered_factors,
                                           make_interactions = do_interactions)
  X <- create_data(design, penalty_for_regression)
  
  #Remove rare levels (i.e. below rare_threshold # of observations)
  rare_output <- remove_rare_levels(X = X, penalty_for_regression = penalty_for_regression, 
                             rare_threshold = rare_threshold, verbose = rare_verbose)
  
  X <- rare_output$X
  penalty_for_regression <- rare_output$penalty_for_regression
  rare_col <- rare_output$rare
  rare_fmt_col <- rare_output$fmt_rare
  rare_unstd_col <- rare_output$unstd_rare
  
  rm(rare_output)
  
  if (length(factor_names) > 1){
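    # For each pair of factors, flag level combinations that never co-occur in
    # the design; these are added to the randomization restrictions alongside
    # the rare columns.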
    add_restrictions <- c()
    combo_factors <- combn(factor_names, 2)
    for (i in 1:ncol(combo_factors)){
      combo_i <- combo_factors[,i]
      tab_cols <- table(design[[combo_i[1]]], design[[combo_i[2]]])
      if (any(tab_cols == 0)){
        
        ind_restrict <- which(tab_cols == 0, arr.ind = TRUE)
        ind_restrict <- cbind(rownames(tab_cols)[ind_restrict[,'row']],
              colnames(tab_cols)[ind_restrict[,'col']])
        ind_restrict <- paste(combo_i[1], '(', ind_restrict[,1], ')-', combo_i[2], '(', ind_restrict[,2] ,')', sep = '')
        add_restrictions <- c(add_restrictions, ind_restrict)
      }
    }
    
    # Are there any additional restrictions?
    new_restrictions <- setdiff(add_restrictions, rare_fmt_col)
    if (length(new_restrictions) > 0){
      message('Adding ', length(new_restrictions), ' additional randomization restrictions because there are no observations at the intersection of these levels.')
      message(paste0(new_restrictions, collapse='\n'))
      rare_fmt_col <- c(rare_fmt_col, new_restrictions)
    }
  }

  Fmatrix <- penalty_for_regression[["F"]]
  Dlist <- penalty_for_regression$D
  constraint <- penalty_for_regression$constraint
  basis_M <- penalty_for_regression$constraint_basis
  
  term_position <- penalty_for_regression$term_position
  
  coef_names <- penalty_for_regression$coef
  
  
  #Clip values that are numerically indistinguishable from zero (tolerance of
  #roughly half the decimal digits of machine precision, at least 7 digits)
  clip_tiny <- floor(-1/2*log(.Machine$double.eps)/log(10))
  if (clip_tiny < 7){
    clip_tiny <- 7
  }

  basis_M <- drop0(absolute_zap(basis_M, clip_tiny))
  #If choice order is provided, set up the forced-choice (conjoint) outcome
  if (!is.null(choice_order)){
    
    if (null_group){stop('Group must be provided for forced choice')}
    forced_output <- make_forced_choice(y = y, X = X, group = group, task = task, 
      choice_order = choice_order, unique_choice = unique_choice,
      weights = weights,
      estimate_intercept = TRUE, #require intercept
      randomize = forced_randomize)
    X <- forced_output$X
    y <- forced_output$y
    weights <- forced_output$weights
    group <- forced_output$group
    task <- forced_output$task
    
    rm(forced_output)
    gc()
    use_forced_choice <- TRUE
  }else{
    use_forced_choice <- FALSE
  }
  
  if (is.null(single_intercept)){#If not provided (default),
    #then do a varying intercept if factorial and
    #single intercept if conjoint (forced choice)
    single_intercept <- use_forced_choice
  }
  group_mapping <- sparseMatrix(i = 1:nrow(X), j = match(group, unique_group), x = 1)
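  # Every object remaining in the local environment is collected below into the
  # returned 'FactorHet_data' list.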

  data_output <- mget(ls())
  class(data_output) <- 'FactorHet_data'
  return(data_output)
}

prepare_deterministic_init <- function(data, K, method, iterations = NULL){
  
  if (!inherits(data, 'FactorHet_data')){
    stop('prepare_deterministic_init requires a "FactorHet_data" object from prepare_regression_data().')
  }
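  # Restrict the design matrix to the intercept and main-effect columns (one
  # column per factor level); interaction columns are not used for
  # initialization.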
  
  main_X <- data$X
  levels_X <- c('(Intercept)', unlist(mapply(names(data$factor_levels), data$factor_levels, SIMPLIFY = FALSE, FUN=function(i,j){paste0(i, '(', j, ')')})))
  if (length(setdiff(levels_X, colnames(main_X))) != 0){
    stop('Some expected main-effect columns are missing from X.')
  }
  main_X <- main_X[ , colnames(main_X) %in% levels_X]
  
  if (K == 1){
    out <- matrix(1, nrow = length(data$unique_group))
  }else if (method == 'forest'){
    out <- forest_init(y = data$y, X = main_X, 
        G = data$group_mapping, W = data$W, K = K)
  }else if (method == 'spectral'){
    out <- spectral_init(data$W, K)
  }else if (method == 'mclust'){
    out <- mclust_init(data$W, K)
  }else if (grepl(method, pattern='^mm')){
    
    if (grepl(method, pattern='^mm_spectral')){
      mm_method <- 'spectral'
    }else if (grepl(method, pattern='^mm_mclust')){
      mm_method <- 'mclust'
    }else if (grepl(method, pattern='^mm_forest')){
      mm_method <- 'forest'
    }else{stop('"mm_" initializations must end in "spectral", "mclust", or "forest".')}
    
    out <- murphy_murphy_initialize(y = as.numeric(data$y), X = main_X, W = data$W, K = K,
      group_mapping = data$group_mapping, method = mm_method,
      weights = data$weights, iterations = iterations,
      probabilistic = grepl(method, pattern='prob'))

  }else{stop('Invalid deterministic init method')}
  
  out <- list('group_E.prob' = data.frame(
    group = data$unique_group, out))
  
  return(out)
}
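
# A minimal usage sketch (hypothetical; assumes `dat` was created by
# prepare_regression_data and K = 2 clusters are desired):
#   init <- prepare_deterministic_init(dat, K = 2, method = 'mm_mclust')
#   head(init$group_E.prob)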

#' @importFrom stats sd kmeans
spectral_init <- function(W, K){
  if (ncol(W) == 1 & all(W[,1] == 1)){
    message('spectral initialization requires moderators. Using "random_member" instead.')
    group_E.prob <- sparseMatrix(
      i = 1:nrow(W), j = sample(1:K, nrow(W), replace = TRUE),
      x = 1
    )
    return(group_E.prob)
  }
  if (!requireNamespace('FNN', quietly = TRUE) || !requireNamespace('RSpectra', quietly = TRUE)){
    stop('Spectral initialization requires "FNN" and "RSpectra" installed.')
  }

  scale_W <- apply(W, MARGIN = 2, FUN=function(i){
    if (sd(i) == 0){
      return(rep(0, length(i)))
    }else{
      return(scale(i))
    }
  })
  cov_W <- var(scale_W)
  diag(cov_W)[which(diag(cov_W) == 0)] <- 1
  cov_W <- eigen(cov_W)
  scale_W <- scale_W %*% (cov_W$vectors) %*% diag(x = 1/sqrt(cov_W$values))
  
  check_FNN <- requireNamespace('FNN', quietly = TRUE)
  if (!check_FNN){
    stop('spectral initialization requires FNN')
  }else{
    knn_W <- FNN::get.knn(scale_W, k=10)
  }

  term_1 <- data.frame(expand.grid(Var1 = 1:nrow(knn_W$nn.index), 
    Var2 = 1:ncol(knn_W$nn.index), stringsAsFactors = FALSE))
  term_1$index <- as.vector(knn_W$nn.index)

  term_2 <- data.frame(expand.grid(Var1 = 1:nrow(knn_W$nn.dist), 
    Var2 = 1:ncol(knn_W$nn.dist), stringsAsFactors = FALSE))
  term_2$dist <- as.vector(knn_W$nn.dist)

  knn_W <- merge(term_1, term_2, by = c('Var1', 'Var2'))
  # Old version that uses reshape2
  # aaa <- merge(melt(knn_W$nn.index, value.name = 'index'),
  #                    melt(knn_W$nn.dist, value.name = 'dist'), 
  #                    by = c('Var1', 'Var2'))
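  # Build a mutual kNN adjacency matrix: after symmetrizing, entries equal to 2
  # mark pairs that appear in each other's neighbor lists; all other edges are
  # dropped.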
  graph_W <- sparseMatrix(i = knn_W$Var1, j = knn_W$index, x = 1, dims = c(nrow(scale_W), nrow(W)))
  graph_W <- graph_W + t(graph_W)
  graph_W@x[graph_W@x != 2] <- 0
  # graph_W@x[graph_W@x == 1] <- 2
  graph_W <- drop0(graph_W)
  graph_W <- graph_W/2
  graph_W <- as(graph_W, 'dgTMatrix')
  graph_W <- with(attributes(graph_W), cbind(i, j) + 1)
  knn_W$id <- paste(knn_W$Var1, knn_W$index, sep = '-')
  rotate_K <- knn_W
  rotate_K$id <- paste(knn_W$index, knn_W$Var1, sep = '-')
  knn_W <- rbind(knn_W, rotate_K)
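  # Weight each retained edge with a Gaussian kernel of its kNN distance,
  # exp(-d^2 / 2).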
  weight <- exp(-1/2 * knn_W$dist[match(paste(graph_W[,1], graph_W[,2], sep = '-'), knn_W$id)]^2)
  graph_W <- sparseMatrix(i = graph_W[,1], j = graph_W[,2], x = weight,
                          dims = c(nrow(scale_W), nrow(W)))
  diag(graph_W)[] <- 1
  #Un-normalized graph Laplacian
  graph_L <- Diagonal(x = rowSums(graph_W)) - graph_W
  method <- 'ShiMalik'    
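  # Shi-Malik (random-walk) normalization: L_rw = D^{-1} (D - W); the K
  # eigenvectors with the smallest-magnitude eigenvalues form the spectral
  # embedding.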
  if (method == 'ShiMalik'){
    L_norm <- Diagonal(x= 1/rowSums(graph_W)) %*% graph_L 
    eigen_L <- RSpectra::eigs(A = L_norm, k = K, which = 'SM')$vectors
  }
  #Separate "real" and imaginary component.
  #Shouldn't be much imaginary but...
  eigen_L <- cbind(Re(eigen_L), Im(eigen_L))
  eigen_only <- eigen_L
  
  eigen_L <- cbind(scale_W, eigen_L)
  
  rotate_L <- apply(eigen_L, MARGIN = 2, FUN=function(i){
    if (sd(i) == 0){
      return(rep(0, length(i)))
    }else{
      return(scale(i))
    }
  })
  rotate_L <- rotate_L[, which(apply(rotate_L, MARGIN = 2, FUN=function(i){all(abs(i) < 1e-6)}) != TRUE)]
  
  cov_W <- var(rotate_L)
  diag(cov_W)[which(diag(cov_W) == 0)] <- 1
  cov_W <- eigen(cov_W)
  cov_W$values[cov_W$values < 0] <- 0
  fmt_eigen <- ifelse(cov_W$values <= 0, 0, 1/sqrt(cov_W$values))
  rotate_L <- rotate_L %*% (cov_W$vectors) %*% diag(x = fmt_eigen)
  rotate_L <- rotate_L[, which(apply(rotate_L, MARGIN = 2, FUN=function(i){all(abs(i) < 1e-6)}) != TRUE)]
  
  eigen_kmeans <- kmeans(eigen_only, centers = K, 
   iter.max = 50, 
   nstart = 5000)
  
  group_E.prob <- as.matrix(sparseMatrix(i = 1:nrow(W), j = eigen_kmeans$cluster, x = 1))

  return(group_E.prob)
}

mclust_init <- function(W, K){
  
  if (ncol(W) == 1 & all(W[,1] == 1)){
    message('mclust initialization requires moderators. Using "random_member" instead.')
    group_E.prob <- sparseMatrix(
      i = 1:nrow(W), j = sample(1:K, nrow(W), replace = TRUE),
      x = 1
    )
    return(group_E.prob)
  }
  
  if (requireNamespace('mclust', quietly = TRUE)){
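    # Model-based agglomerative hierarchical clustering with unconstrained
    # ('VVV') covariances on the moderators, cut into K groups.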
    hcVVV <- mclust::hcVVV
    fit_hc <- mclust::hc(data = W, modelName = 'VVV', use = 'SPH')
    classif <- as.vector(mclust::hclass(fit_hc, G = K))
    group_E.prob <- sparseMatrix(i = 1:nrow(W), j = classif, x = 1)
    group_E.prob <- as.matrix(group_E.prob)
  }else{
    stop('"mclust" initialization requires "mclust" installed.')
  }
  
  return(group_E.prob)
}

forest_init <- function(y, X, G, W, K){
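  # Initialization via random forests: for each column of the design matrix,
  # estimate a group-level heterogeneous "effect" of that column (predicted
  # probability at +1 minus at -1, given the moderators), then run k-means on
  # the matrix of estimated effects to form K clusters.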
  use_ranger <- requireNamespace('ranger', quietly = TRUE)
  if (use_ranger){
    big_W <- as.matrix(G %*% W)
    easy_message('Starting Random Forests')
    run_cf <- sapply(1:ncol(X), FUN=function(i){
      est_ranger <- ranger::ranger(factor(y) ~ ., data = data.frame(cbind(big_W, treat = X[,i])), probability = TRUE)
      pred_1 <- predict(est_ranger, data = data.frame(cbind(W, treat = 1)))$prediction[,"1"]
      pred_0 <- predict(est_ranger, data = data.frame(cbind(W, treat = -1)))$prediction[,"1"]
      return(pred_1 - pred_0)
    })
    colnames(run_cf) <- colnames(X)
    message('Performing K-Means on Univariate Heterogeneous Effects')
    fit_km <- kmeans(x = run_cf, centers = K, iter.max = 150, nstart = 2000)
    message('Completed Forest Initialization')
    assign <- as.matrix(sparseMatrix(i = 1:nrow(W), j = fit_km$cluster, x = 1))
    return(assign)
    # return(list(assign = assign, km = fit_km, causal = run_cf))
  }else{stop('"ranger" must be installed for "forest" initialization.')}
}

# Initialization Based on Murphy/Murphy (2020); see the appendix of Goplerud et
# al. for discussion.
#' @importFrom stats median
murphy_murphy_initialize <- function(y, X, W, K, group_mapping, weights,
                                     method = 'mclust', iterations = NULL,
                                     probabilistic = FALSE){
  if (is.null(iterations)){
    if (probabilistic){iterations <- 100}else{iterations <- 50}
  }
  if (K == 1){stop('No initialization needed or permitted if K = 1')}
  # Step 1: Cluster based on moderators 
  if (method == 'forest'){
    classif <- forest_init(y = y, X = X, K = K, G = group_mapping, W = W) %*% 1:K
  }else if (method == 'mclust'){
    if (requireNamespace('mclust', quietly = TRUE)){
      if (ncol(W) == 1 & all(W[,1] == 1)){
        message('mclust initialization requires moderators. Using "random_member" instead.')
        classif <- sample(1:K, nrow(W), replace = TRUE)
      }else{
        classif <- mclust_init(W, K) %*% 1:K
      }
    }else{
      stop('"mclust" initialization requires "mclust" installed.')
    }
  }else if (method == 'spectral'){
    if (ncol(W) == 1 & all(W[,1] == 1)){
      message('spectral initialization requires moderators. Using "random_member" instead.')
      classif <- sample(1:K, nrow(W), replace = TRUE)
    }else{
      classif <- spectral_init(W, K)
      classif <- rowSums(classif %*% Diagonal(x = 1:K))
    }
  }else{stop('Invalid method for Murphy/Murphy initialization')}
  init_classif <- classif
  
  if (probabilistic){
    init_classif <- classif <- as.matrix(sparseMatrix(i = 1:nrow(W), j = classif, x = 1))
  }
  # Step 2a: Loop over logistic regression and reclassify
  counter <- 0
  converge <- FALSE
  while (counter < iterations & converge == FALSE){
    
    if (probabilistic){
      group_E.prob <- classif
    }else{
      group_E.prob <- as.matrix(sparseMatrix(i = 1:nrow(W), j = classif, x = 1))
    }
    
    obs.E.prob <- apply(group_E.prob, MARGIN = 2, FUN=function(i){as.vector(group_mapping %*% i)})
    
    if (probabilistic){
      
      beta <- sapply(list_from_cols(obs.E.prob), FUN=function(k){
        simple_logit(y = y, X = X, obs.E.prob = k, iterations = 15,
                     weights = weights,
                     beta_method = 'cpp', beta_cg_it = 10, prec_ridge = 1/4)
      })
      
    }else{
      beta <- sapply(list_from_cols(obs.E.prob), FUN=function(k){
        simple_logit(y = y[which(k == 1)], X = X[which(k == 1),], iterations = 25, 
                     weights = weights[which(k == 1)],
                     beta_method = 'cpp', beta_cg_it = 10, prec_ridge = 1/4)
      })
    }
    rownames(beta) <- NULL
    pmat <- plogis(as.matrix(X %*% beta))
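    # Per-observation log-likelihood under each cluster's coefficients (the
    # object is named 'brier' but stores log-likelihood contributions), summed
    # within groups below.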
    brier <- apply(pmat, MARGIN = 2, FUN=function(i){ifelse(y == 1, log(i), log(1-i))})
    sum_brier <- t(group_mapping) %*% brier
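    # Reassignment: hard classification to the best-fitting cluster, or a
    # softmax over group-level log-likelihoods in the probabilistic variant.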
    
    if (probabilistic){
      new_classif <- softmax_matrix(as.matrix(sum_brier))
      
      change <- max(abs(classif - new_classif))
      median_change <- median(abs(classif - new_classif))
      if (change < 1e-2){
        converge <- TRUE; break
      }
    }else{
      
      new_classif <- apply(sum_brier, MARGIN = 1, which.max)
      change <- sum(classif != new_classif)
      if (change == 0){converge <- TRUE; break}
      
    }
    counter <- counter + 1
    classif <- new_classif
  }
  
  if (converge == FALSE){
    message('Murphy/Murphy initialization did not fully converge.')
    if (probabilistic){
      message(paste0('The largest change in any cluster probability was ', round(change, 2)))
      message(paste0('The median change in cluster probability was ', round(median_change, 3)))
    }else{
      message(paste0(change, ' observations out of ', length(classif), ' changed cluster membership at the final iteration.'))
    }
  }
  
  if (probabilistic){
    group_E.prob <- as.matrix(new_classif)
    
  }else{
    group_E.prob <- sparseMatrix(i = 1:nrow(W), j = classif, x = 1)
    group_E.prob <- as.matrix(group_E.prob)
    
  }
  return(group_E.prob)
}
