R/helper_funcs.R

Defines functions get_fatigueeffect get_carryovereffect get_profileordereffect get_CRT_pval CO_stat PO_stat hiernet_group

# Obtaining main HierNet test statistic
hiernet_group = function(hiernet_object, idx, X, analysis = 0, group) {
  main = hiernet_object$bp[idx] - hiernet_object$bn[idx]
  main_contribution = vector()
  for (i in 1:length(group)) {
    main_contribution[i] = sum((main[group[[i]]] - mean(main[group[[i]]]))^2)
  }

  I_int = list()
  for (i in 1:length(idx)) {
    I_int[[i]] = (hiernet_object$th[idx[i], ] + hiernet_object$th[, idx[i]])/2
  }


  int_cont_bygroup = vector()
  tracking_all_ints = list()
  for (i in 1:length(group)) {
    in_int = I_int[group[[i]]]
    int_means = Reduce("+", in_int)/length(in_int)
    getting_int_cont = vector()
    for (j in 1:length(in_int[[1]])) {
      rel_ints = sapply(in_int, "[[", j)
      getting_int_cont[j] = sum((rel_ints - int_means[j])^2)
    }
    tracking_all_ints[[i]] = getting_int_cont
    int_cont_bygroup[i] = sum(getting_int_cont)
  }

  # divide by 2 to account for counting both left and right profile
  ts = sum(main_contribution)/2 + sum(int_cont_bygroup)/2

  if (analysis > 0) {
    main_contribution = sum(main_contribution)/2
    int_contribution = sum(int_cont_bygroup)/2
    largest_ints = largest_ints_idx = list()
    for (i in 1:length(group)) {
      largest_ints[[i]] = sort(tracking_all_ints[[i]], decreasing = TRUE)[c(1:(2*analysis))]
      rel_int_name = paste0(colnames(X)[idx][group[[i]]], collapse = ", ")
      largest_ints_idx[[i]] = paste0("interactions with ",colnames(X)[order(tracking_all_ints[[i]], decreasing = TRUE)[c(1:(2*analysis))]])

    }

    largest_ints = unlist(largest_ints)
    largest_ints_idx = unlist(largest_ints_idx)

    largest_ints_final = sort(largest_ints, decreasing = TRUE)[seq(1, (2*analysis), by = 2)]

    largest_int_contributer = largest_ints_idx[order(largest_ints, decreasing = TRUE)[seq(1, (2*analysis), by = 2)]]

    analysis_list = list()
    analysis_list$observed_TS = ts
    analysis_list$main_contirbution = main_contribution
    analysis_list$int_contribution = int_contribution
    analysis_list$largest_int_size = largest_ints_final
    analysis_list$largest_int_contributer = largest_int_contributer


    return(analysis_list)

  } else {
    return(ts)
  }
}

# Obtaining profile order effect test statistic
PO_stat = function(hiernet_object, in_idx_left, in_idx_right, in_idx_respondent) {
  main_1 = hiernet_object$bp[in_idx_left] - hiernet_object$bn[in_idx_left]
  main_2 = hiernet_object$bp[in_idx_right] - hiernet_object$bn[in_idx_right]

  # interaction effects
  within_int_list_left = list()
  within_int_list_right = list()

  for (i in 1:length(in_idx_left)) {
    within_int_list_left[[i]] = (hiernet_object$th[in_idx_left[i], in_idx_left] + hiernet_object$th[in_idx_left, in_idx_left[i]])/2
    within_int_list_right[[i]] = (hiernet_object$th[in_idx_right[i], in_idx_right] + hiernet_object$th[in_idx_right, in_idx_right[i]])/2
  }

  within_diff = unlist(Map("+", within_int_list_left, within_int_list_right))

  #between profiles
  between_int_list_left = list()
  between_int_list_right = list()

  for (i in 1:length(in_idx_left)) {
    between_int_list_left[[i]] = (hiernet_object$th[in_idx_left[i], in_idx_right] + hiernet_object$th[in_idx_right, in_idx_left[i]])/2
    between_int_list_right[[i]] = (hiernet_object$th[in_idx_right[i], in_idx_left] + hiernet_object$th[in_idx_left, in_idx_right[i]])/2
  }

  between_diff = unlist(Map("+", between_int_list_left, between_int_list_right))


  #respondent
  R_int_list_left = list()
  R_int_list_right = list()

  for (i in 1:length(in_idx_left)) {
    R_int_list_left[[i]] = (hiernet_object$th[in_idx_left[i], in_idx_respondent] + hiernet_object$th[in_idx_respondent, in_idx_left[i]])/2
    R_int_list_right[[i]] = (hiernet_object$th[in_idx_right[i], in_idx_respondent] + hiernet_object$th[in_idx_respondent, in_idx_right[i]])/2
  }

  respondent_effects = unlist(Map("+", R_int_list_left, R_int_list_right))

  #division of two in the interactions because of overcounting
  stat= sum((main_1 + main_2)^2) + sum(within_diff^2) + sum(between_diff^2) + sum((respondent_effects)^2)
  return(stat)
}

# Obtaining carryover effect test statistic
CO_stat = function(hiernet_object, idx) {
  I_list = list()
  for (i in 1:length(idx)) {
    I_list[[i]] = (hiernet_object$th[idx[i], -idx] + hiernet_object$th[-idx, idx[i]])/2
  }
  ts = sum(unlist(I_list)^2)
  return(ts)
}

# Main helper function that performs CRT to test for Y independent of X given Z
get_CRT_pval = function(x, y, xcols, left_idx, right_idx, design, B, num_cores, profileorder_constraint, lambda, non_factor_idx, in_levs, analysis, p, resample_func_1, resample_func_2, tol, resample_X, full_X, restricted_X, left_allowed, right_allowed, forced, speedup, seed, supplyown_resamples, parallel, nfolds, verbose) {
  num_x_levs = levels(x[, xcols[1]])

  # checks
  if (!(all(sapply(x[, !(1:ncol(x)) %in% non_factor_idx], function(x) is(x, "factor"))))) stop("factors provided in formula are not all factors please supply non_factor_idx")

  if (!all(in_levs %in% num_x_levs)) stop("in_levs supplied are not levels in X")
  if (length(left_idx) != length(right_idx)) stop("length left idx does not match right idx")

  if (length(xcols) != 2) stop("supply left and right column for factor X")
  if (!is.null(p)) {
    if (!is.null(in_levs)) {
      if (length(p) != length(xcols)) stop("length of supplied probability should match length of interested levels in_levs")

    } else {
      if (length(num_x_levs) != p) stop("length of supplied probability should match length of number of levels in X")

    }
    if (length(p) != length(xcols)) stop("length of supplied probability should match length of interested levels in_levs")
  }


  # forcing no profile order effect constraint
  if (profileorder_constraint) {
    x_df = x
    n = nrow(x_df)
    empty_df = x_df
    y_new = 1 - y
    for (i in 1:length(left_idx)) {
      empty_df[, left_idx[i]] = x_df[, right_idx[i]]
      empty_df[, right_idx[i]] = x_df[, left_idx[i]]
    }

    final_df = rbind(x_df, empty_df)
    final_df$Y = c(y, y_new)

  } else {
    final_df = x
    final_df$Y = y
  }




  if (is.null(in_levs)) {
    X_names = list()
    for (j in 1:length(xcols)) {
      start_name = colnames(x)[xcols[j]]
      resulting_X_name = vector()
      for (i in 1:length(num_x_levs)) {
        resulting_X_name[i] = paste0(start_name, num_x_levs[i])
      }
      X_names[[j]] = resulting_X_name
    }
    all_X_names = unlist(X_names)


  } else {
    #### focus only on the relevant levels given
    X_names = list()
    for (j in 1:length(xcols)) {
      start_name = colnames(x)[xcols[j]]
      resulting_X_name = vector()
      for (i in 1:length(in_levs)) {
        resulting_X_name[i] = paste0(start_name, in_levs[i])
      }
      X_names[[j]] = resulting_X_name
    }
    all_X_names = unlist(X_names)
  }

  # create X matrix and track index for the relevant effects
  if (is.null(forced)) {

    form = formula(paste0("Y ~ . "))
    X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]
    idx_in = (1:ncol(X))[colnames(X) %in% all_X_names]
    all_X_names = X_names
  } else {
    x_names = colnames(x)[xcols]
    forced_names = colnames(x)[forced]
    force_syntax = ""
    for (i in 1:length(forced_names)) {
      force_syntax = paste0(force_syntax, " + ", x_names[1], "*", forced_names[i], " + ", x_names[2], "*", forced_names[i])

    }
    force_syntax = substr(force_syntax, 4, nchar(force_syntax))
    form = formula(paste0("Y ~ . + ", force_syntax))
    X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

    X_forced_names = vector()
    forced_levs = unique(x[, forced[1]])

    all_forced_names = vector()
    for (j in 1:length(forced)) {
      start_name = colnames(x)[forced[j]]
      resulting_forced_name = vector()
      for (i in 1:length(forced_levs)) {
        resulting_forced_name[i] = paste0(start_name, forced_levs[i])
      }
      all_forced_names = c(all_forced_names, resulting_forced_name)
    }

    #repeat this for X:Z and Z:X since model.matrix arbitarily assigns them
    all_possible_ints_1 = list()
    for (j in 1:length(all_forced_names)) {
      temp_vec = list()
      for (i in 1:length(X_names)) {
        temp_vec[[i]] = paste0(all_forced_names[j], ":", X_names[[i]])
      }
      all_possible_ints_1 = c(all_possible_ints_1, temp_vec)
    }

    all_possible_ints_2 = list()
    for (j in 1:length(all_forced_names)) {
      temp_vec = list()
      for (i in 1:length(X_names)) {
        temp_vec[[i]] = paste0(X_names[[i]], ":", all_forced_names[j])
      }
      all_possible_ints_2 = c(all_possible_ints_2, temp_vec)
    }

    all_possible_ints = unlist(c(all_possible_ints_1, all_possible_ints_2))

    idx_in = (1:ncol(X))[colnames(X) %in% c(all_X_names, all_possible_ints)]
    all_X_names = c(X_names, all_possible_ints_1, all_possible_ints_2)


  }

  # this is a quick harmless fix to avoid columns that have all zero
  check_cols = sapply(data.frame(X), function(x) length(unique(x)))
  trouble_cols = as.vector(which(check_cols == 1))
  if (length(trouble_cols) > 0) {
    X[, trouble_cols][1, ] = X[, trouble_cols][1, ] + 1e-5
  }

  in_col_names = colnames(X)[idx_in]

  group_idx_track = vector()
  for (i in 1:length(in_col_names)) {
    group_idx_track[i] = which((sapply(sapply(all_X_names, function(x) which(in_col_names[i] == x)), length)) >0)
  }

  group = list()

  unique_group_idx = unique(group_idx_track)
  for (i in 1:length(unique_group_idx)) {
    group[[i]] = which(group_idx_track == unique_group_idx[i])
  }

  # if in_levs is non-null find idx to resample
  if (is.null(in_levs)) {
    idx_1 = idx_2 = 1:nrow(x)
  } else {
    idx_1 = (1:nrow(x))[x[, xcols[1]] %in% c(in_levs)]
    idx_2 = (1:nrow(x))[x[, xcols[2]] %in% c(in_levs)]
  }
  num_sample_1 = length(idx_1)
  num_sample_2 = length(idx_2)

  set.seed(seed)

  # Creates only Z matrix to perform CV on Z
  if (speedup) {
    Z_df = final_df[, -xcols]
    if (is.null(non_factor_idx)) {
      Z_long = model.matrix(Y ~ . , Z_df, contrasts.arg = lapply(Z_df[, 1:(ncol(Z_df) - 1)], contrasts, contrasts = FALSE))[, -1]
    } else {
      non_factor_names = colnames(x)[non_factor_idx]
      Z_nonfactor_idx = (1:ncol(Z_df))[(colnames(Z_df) %in% non_factor_names)]
      Z_long = model.matrix(Y ~ . , Z_df, contrasts.arg = lapply(Z_df[, -c(Z_nonfactor_idx, ncol(Z_df))], contrasts, contrasts = FALSE))[, -1]

    }

    # obtains CV lambda
    best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = Z_long, y_var = Z_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
    if (verbose) {
      cat("Initial step completed: finished computing cross validated lambda")
    }

  } else {
    best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)

  }

  if (speedup) {
    chosen_initial = sample(c(0:B), size = 1)
    if (chosen_initial != 0) {
      x_df = x
      if (is.null(supplyown_resamples)) {
        if (design == "Uniform") {
          newx_1 = factor(resample_func_1(resample_X, nrow(x)))
          newx_2 = factor(resample_func_2(resample_X, nrow(x)))
        }
        if (design == "Constrained Uniform") {
          newx_1 = factor(resample_func_1(full_X, restricted_X, nrow(x), left_allowed))
          newx_2 = factor(resample_func_2(full_X, restricted_X, nrow(x), left_allowed))
        }
        if (design == "Nonuniform") {
          newx_1 = factor(resample_func_1(resample_X, nrow(x), p))
          newx_2 = factor(resample_func_2(resample_X, nrow(x), p))
        }

      } else {
        newx_1 = factor(supplyown_resamples[[chosen_initial]][, 1])
        newx_2 = factor(supplyown_resamples[[chosen_initial]][, 2])
      }

      x_df[, xcols[1]][idx_1] = newx_1[idx_1]
      x_df[, xcols[2]][idx_2] = newx_2[idx_2]
      # forcing no profile order effect constraint
      if (profileorder_constraint) {
        n = nrow(x_df)
        empty_df = x_df
        y_new = 1 - y
        for (i in 1:length(left_idx)) {
          empty_df[, left_idx[i]] = x_df[, right_idx[i]]
          empty_df[, right_idx[i]] = x_df[, left_idx[i]]
        }

        final_df = rbind(x_df, empty_df)
        final_df$Y = c(y, y_new)

      } else {
        final_df = x_df
        final_df$Y = y
      }

      X_initial = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]
      # this is a quick harmless fix to avoid columns that have all zero
      check_cols = sapply(data.frame(X_initial), function(x) length(unique(x)))
      trouble_cols = as.vector(which(check_cols == 1))
      if (length(trouble_cols) > 0) {
        X_initial[, trouble_cols][1, ] = X_initial[, trouble_cols][1, ] + 1e-5
      }

      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol)))
    } else {
      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol)))
    }
    aa = initial
  } else {
    aa = NULL
  }


  invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

  obs_test_stat = hiernet_group(fit, idx = idx_in, X = X, analysis = analysis, group = group)

  ## parallel computing setup
  if (parallel) {
    cl <- snow::makeCluster(num_cores)
    doSNOW::registerDoSNOW(cl)
    if (verbose) {
      iterations <- B
      pb <- utils::txtProgressBar(max = iterations, style = 3)
      progress <- function(n) utils::setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
    } else {
      opts = NULL
    }
    ##

    e <- foreach::foreach(j = 1:iterations, .combine = rbind,
                 .options.snow = opts) %dopar%
      {
        set.seed(j + B)
        library(CRTConjoint)

        x_df = x

        if (is.null(supplyown_resamples)) {
          if (design == "Uniform") {
            newx_1 = factor(resample_func_1(resample_X, nrow(x)))
            newx_2 = factor(resample_func_2(resample_X, nrow(x)))
          }
          if (design == "Constrained Uniform") {
            newx_1 = factor(resample_func_1(full_X, restricted_X, nrow(x), left_allowed))
            newx_2 = factor(resample_func_2(full_X, restricted_X, nrow(x), right_allowed))
          }
          if (design == "Nonuniform") {
            newx_1 = factor(resample_func_1(resample_X, nrow(x), p))
            newx_2 = factor(resample_func_2(resample_X, nrow(x), p))
          }

        } else {
          newx_1 = factor(supplyown_resamples[[j]][, 1])
          newx_2 = factor(supplyown_resamples[[j]][, 2])
        }

        x_df[, xcols[1]][idx_1] = newx_1[idx_1]
        x_df[, xcols[2]][idx_2] = newx_2[idx_2]
        # forcing no profile order effect constraint
        if (profileorder_constraint) {
          n = nrow(x_df)
          empty_df = x_df
          y_new = 1 - y
          for (i in 1:length(left_idx)) {
            empty_df[, left_idx[i]] = x_df[, right_idx[i]]
            empty_df[, right_idx[i]] = x_df[, left_idx[i]]
          }

          final_df = rbind(x_df, empty_df)
          final_df$Y = c(y, y_new)

        } else {
          final_df = x_df
          final_df$Y = y
        }

        X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

        # this is a quick harmless fix to avoid columns that have all zero
        check_cols = sapply(data.frame(X), function(x) length(unique(x)))
        trouble_cols = as.vector(which(check_cols == 1))
        if (length(trouble_cols) > 0) {
          X[, trouble_cols][1, ] = X[, trouble_cols][1, ] + 1e-5
        }

        if (!speedup) {
          best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
        }

        invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

        resamp_TS = hiernet_group(fit, idx = idx_in, X = X, analysis = 0, group = group)

        return(resamp_TS)
      }

    snow::stopCluster(cl)

  } else {
    e = vector()
    for (j in 1:B) {
      set.seed(j + B)

      x_df = x

      if (is.null(supplyown_resamples)) {
        if (design == "Uniform") {
          newx_1 = factor(resample_func_1(resample_X, nrow(x)))
          newx_2 = factor(resample_func_2(resample_X, nrow(x)))
        }
        if (design == "Constrained Uniform") {
          newx_1 = factor(resample_func_1(full_X, restricted_X, nrow(x), left_allowed))
          newx_2 = factor(resample_func_2(full_X, restricted_X, nrow(x), right_allowed))
        }
        if (design == "Nonuniform") {
          newx_1 = factor(resample_func_1(resample_X, nrow(x), p))
          newx_2 = factor(resample_func_2(resample_X, nrow(x), p))
        }

      } else {
        newx_1 = factor(supplyown_resamples[[j]][, 1])
        newx_2 = factor(supplyown_resamples[[j]][, 2])
      }

      x_df[, xcols[1]][idx_1] = newx_1[idx_1]
      x_df[, xcols[2]][idx_2] = newx_2[idx_2]
      # forcing no profile order effect constraint
      if (profileorder_constraint) {
        n = nrow(x_df)
        empty_df = x_df
        y_new = 1 - y
        for (i in 1:length(left_idx)) {
          empty_df[, left_idx[i]] = x_df[, right_idx[i]]
          empty_df[, right_idx[i]] = x_df[, left_idx[i]]
        }

        final_df = rbind(x_df, empty_df)
        final_df$Y = c(y, y_new)

      } else {
        final_df = x_df
        final_df$Y = y
      }

      X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]
      # this is a quick harmless fix to avoid columns that have all zero
      check_cols = sapply(data.frame(X), function(x) length(unique(x)))
      trouble_cols = as.vector(which(check_cols == 1))
      if (length(trouble_cols) > 0) {
        X[, trouble_cols][1, ] = X[, trouble_cols][1, ] + 1e-5
      }

      if (!speedup) {
        best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
      }

      invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

      e[j] = hiernet_group(fit, idx = idx_in, X = X, analysis = 0, group = group)
      if (verbose) {
        cat(paste0("Done with task: ",j, " out of ", B, " resamples"))
      }
    }


  }
  p_val = (length(which(e >= as.numeric(obs_test_stat[1]))) + 1)/(B + 1)

  out = list()
  out$p_val = p_val
  out$obs_test_stat = obs_test_stat
  out$resampled_test_stat = e
  out$tol = tol
  if (speedup) {
    out$lam = best_lam
  }
  out$hiernet_fit = fit
  out$seed = seed

  return(out)
}

# Main helper function that performs CRT to test for no profile order effect
get_profileordereffect = function(x, y, left_idx, right_idx, B, num_cores, lambda, non_factor_idx, tol, speedup, seed, parallel, nfolds, verbose) {

  # checks
  if (!(all(sapply(x[, !(1:ncol(x)) %in% non_factor_idx], function(x) is(x, "factor"))))) stop("factors provided in formula are not all factors please supply non_factor_idx")

  if (length(left_idx) != length(right_idx)) stop("length left idx does not match right idx")


  final_df = x
  final_df$Y = y

  form = formula(paste0("Y ~ . "))
  X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

  set.seed(seed)
  if (speedup) {
    chosen_initial = sample(c(0:B), size = 1)
    if (chosen_initial != 0) {
      x_df = x

      resampling_df = final_df
      a = sample(c(0,1), size =nrow(x), replace = TRUE)
      sample_idx = which(a == 1)

      kept = resampling_df[-sample_idx, ]

      b = resampling_df[sample_idx, ]
      new_y = 1 - b$Y
      new_first = b[, left_idx]
      new_second = b[, right_idx]
      name_1 = colnames(new_first)
      name_2 = colnames(new_second)
      colnames(new_first) = name_2
      colnames(new_second) = name_1
      new_df = cbind(new_second, new_first)
      new_df = cbind(new_df, b[, -c(left_idx, right_idx, ncol(b))])

      new_df$Y = new_y
      final_df_initial = rbind(kept, new_df)




      X_initial = model.matrix(form, final_df_initial, contrasts.arg = lapply(final_df_initial[, -c(non_factor_idx, ncol(final_df_initial))], contrasts, contrasts = FALSE))[, -1]

      best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = X_initial, y_var = final_df_initial$Y, tol = tol, constraint = FALSE, seed = seed)

      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol)))

      if (verbose) {
        cat("Initial step completed: finished computing cross validated lambda")
      }

    } else {

      best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = X, y_var = final_df$Y, tol = tol, constraint = FALSE, seed = seed)

      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol)))
    }
    aa = initial
  } else {

    best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = X, y_var = final_df$Y, tol = tol, constraint = FALSE, seed = seed)

    aa = NULL
  }


  invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

  # get relevant index
  left_X = final_df[, c(left_idx, ncol(final_df))]
  X_left = model.matrix(form, left_X, contrasts.arg = lapply(left_X[, -c(non_factor_idx, ncol(left_X))], contrasts, contrasts = FALSE))[, -1]
  in_idx_left =  (1:ncol(X))[colnames(X) %in% colnames(X_left)]

  right_X = final_df[, c(right_idx, ncol(final_df))]
  X_right = model.matrix(form, right_X, contrasts.arg = lapply(right_X[, -c(non_factor_idx, ncol(right_X))], contrasts, contrasts = FALSE))[, -1]
  in_idx_right = (1:ncol(X))[colnames(X) %in% colnames(X_right)]

  in_idx_respondent = (1:ncol(X))[-c(in_idx_left, in_idx_right)]
  #


  obs_test_stat = PO_stat(fit, in_idx_left, in_idx_right, in_idx_respondent)

  ## parallel computing setup
  if (parallel) {
    cl <- snow::makeCluster(num_cores)
    doSNOW::registerDoSNOW(cl)
    if (verbose) {
      iterations <- B
      pb <- utils::txtProgressBar(max = iterations, style = 3)
      progress <- function(n) utils::setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
    } else {
      opts = NULL
    }
    ##
    e <- foreach::foreach(j = 1:iterations, .combine = rbind,
                          .options.snow = opts) %dopar%
      {
        set.seed(j + B)
        library(CRTConjoint)
        x_df = x

        resampling_df = final_df
        a = sample(c(0,1), size =nrow(x), replace = TRUE)
        sample_idx = which(a == 1)

        kept = resampling_df[-sample_idx, ]

        b = resampling_df[sample_idx, ]
        new_y = 1 - b$Y
        new_first = b[, left_idx]
        new_second = b[, right_idx]
        name_1 = colnames(new_first)
        name_2 = colnames(new_second)
        colnames(new_first) = name_2
        colnames(new_second) = name_1
        new_df = cbind(new_second, new_first)
        new_df = cbind(new_df, b[, -c(left_idx, right_idx, ncol(b))])

        new_df$Y = new_y
        final_df_initial = rbind(kept, new_df)




        X_initial = model.matrix(form, final_df_initial, contrasts.arg = lapply(final_df_initial[, -c(non_factor_idx, ncol(final_df_initial))], contrasts, contrasts = FALSE))[, -1]

        if (!speedup) {
          best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = X_initial, y_var = final_df_initial$Y, tol = tol, constraint = FALSE, seed = seed)
        }

        invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df_initial$Y, lam= best_lam, tol = tol, aa = aa)))

        resamp_TS = PO_stat(initial, in_idx_left, in_idx_right, in_idx_respondent)
        return(resamp_TS)
      }

    snow::stopCluster(cl)

  } else{
    e = vector()
    for (j in 1:B) {
      set.seed(j + B)
      x_df = x

      resampling_df = final_df
      a = sample(c(0,1), size =nrow(x), replace = TRUE)
      sample_idx = which(a == 1)

      kept = resampling_df[-sample_idx, ]

      b = resampling_df[sample_idx, ]
      new_y = 1 - b$Y
      new_first = b[, left_idx]
      new_second = b[, right_idx]
      name_1 = colnames(new_first)
      name_2 = colnames(new_second)
      colnames(new_first) = name_2
      colnames(new_second) = name_1
      new_df = cbind(new_second, new_first)
      new_df = cbind(new_df, b[, -c(left_idx, right_idx, ncol(b))])

      new_df$Y = new_y
      final_df_initial = rbind(kept, new_df)




      X_initial = model.matrix(form, final_df_initial, contrasts.arg = lapply(final_df_initial[, -c(non_factor_idx, ncol(final_df_initial))], contrasts, contrasts = FALSE))[, -1]

      if (!speedup) {
        best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = X_initial, y_var = final_df_initial$Y, tol = tol, constraint = FALSE, seed = seed)
      }
      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df_initial$Y, lam= best_lam, tol = tol, aa = aa)))

      e[j] = PO_stat(initial, in_idx_left, in_idx_right, in_idx_respondent)
      if (verbose) {
        cat(paste0("Done with task: ",j, " out of ", B, " resamples"))
      }
    }


  }
  p_val = (length(which(e >= as.numeric(obs_test_stat[1]))) + 1)/(B + 1)

  out = list()
  out$p_val = p_val
  out$obs_test_stat = obs_test_stat
  out$resampled_test_stat = e
  out$tol = tol
  if (speedup) {
    out$lam = best_lam
  }
  out$hiernet_fit = fit
  out$seed = seed
  return(out)

}

# Main helper function that performs CRT to test for no carryover effect
get_carryovereffect = function(x, y, left_idx, right_idx, B, num_cores, lambda, non_factor_idx, tol, seed, parallel, profileorder_constraint, task_var, resample_func, supplyown_resamples, nfolds, verbose) {

  # checks
  if (!(all(sapply(x[, left_idx[!left_idx %in% non_factor_idx]], function(x) is(x, "factor"))))) stop("left factors are not all factors please supply non_factor_idx")

  if (!(all(sapply(x[, right_idx[!right_idx %in% non_factor_idx]], function(x) is(x, "factor"))))) stop("right factors are not all factors please supply non_factor_idx")

  if (length(left_idx) != length(right_idx)) stop("length left idx does not match right idx")

  if((length(unique(table(x[, task_var])))) != 1) {stop("Some tasks are missing")}

  total_tasks = max(x[, task_var])
  Z_tasks = seq(2, total_tasks, by = 2)
  X_tasks = seq(1, max(Z_tasks) -1 , by = 2)

  Z_df = x[(x[, task_var] %in% Z_tasks), ][, c(left_idx, right_idx)]
  Z_left = (1:ncol(Z_df))[colnames(Z_df) %in% colnames(x)[left_idx]]
  Z_right = (1:ncol(Z_df))[!(colnames(Z_df) %in% colnames(x)[left_idx])]
  colnames(Z_df) = paste0("Z_", colnames(Z_df))
  X_df = x[(x[, task_var] %in% X_tasks), ][, c(left_idx, right_idx)]
  X_left = (1:ncol(X_df))[colnames(X_df) %in% colnames(x)[left_idx]]
  X_right = (1:ncol(X_df))[!(colnames(X_df) %in% colnames(x)[left_idx])]
  colnames(X_df) = paste0("X_", colnames(X_df))
  Y = y[(x[, task_var] %in% Z_tasks)]

  original_Xdf = X_df
  original_Zdf = Z_df

  final_df = cbind(X_df, Z_df)
  final_df$Y = Y
  just_Z = cbind(original_Zdf, Y = Y)

  if (profileorder_constraint) {
    y_new = 1 - final_df$Y
    x_df = X_df
    first = X_left
    second = X_right
    n = nrow(x_df)
    empty_df = x_df
    for (i in 1:length(first)) {
      empty_df[, first[i]] = x_df[, second[i]]
      empty_df[, second[i]] = x_df[, first[i]]
    }
    X_df = rbind(x_df, empty_df, x_df, empty_df)

    z_df = Z_df
    first = Z_left
    second = Z_right
    n = nrow(z_df)
    empty_df = z_df
    for (i in 1:length(first)) {
      empty_df[, first[i]] = z_df[, second[i]]
      empty_df[, second[i]] = z_df[, first[i]]
    }
    Z_df = rbind(Z_df, empty_df, empty_df, Z_df)

    full_Y = c(final_df$Y, y_new, y_new, final_df$Y)

    final_df = cbind(X_df, Z_df, Y = full_Y)
    just_Z = cbind(Z_df, Y = full_Y)
  }


  X = model.matrix(Y ~ ., data = final_df, contrasts.arg = lapply(final_df[, -ncol(final_df)], contrasts, contrasts = FALSE))[, -1]

  # this is because X and Z are symmetric so we can WLOG just take the first half which is always the X indexes
  idx = 1:(ncol(X)/2)

  Z_mat = model.matrix(Y ~ ., data = just_Z, contrasts.arg = lapply(just_Z[, -ncol(just_Z)], contrasts, contrasts = FALSE))[, -1]

  set.seed(seed)
  half = (nrow(Z_mat)/4)
  random_idx = suppressWarnings(split(sample(half, half, replace = FALSE), as.factor(1:nfolds)))

  if (profileorder_constraint) {
    for (i in 1:nfolds) {
      random_idx[[i]] = c(random_idx[[i]], random_idx[[i]] + half, random_idx[[i]] + 2*half, random_idx[[i]] + 3*half)
    }
  }


  best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = Z_mat, y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed, fold_idx = random_idx)
  if (verbose) {
   cat("Initial step completed: finished computing cross validated lambda")
  }

  if (is.null(resample_func)) {
    resample_func = function(x, seed = sample(c(0, 1000), size = 1), left_idx, right_idx) {
      set.seed(seed)
      df = x[, c(left_idx, right_idx)]
      variable = colnames(x)[c(left_idx, right_idx)]
      len = length(variable)
      resampled = list()
      n = nrow(df)
      for (i in 1:len) {
        var = df[, variable[i]]
        lev = levels(var)
        resampled[[i]] = factor(sample(lev, size = n, replace = TRUE))
      }

      resampled_df = data.frame(resampled[[1]])
      for (i in 2:len) {
        resampled_df = cbind(resampled_df, resampled[[i]])
      }
      colnames(resampled_df) = colnames(df)
      return(resampled_df)
    }
  }

  chosen_initial = sample(c(0:B), size = 1)
  if (chosen_initial != 0) {
    if (is.null(supplyown_resamples)) {
      resampled_x = resample_func(x = x, left_idx = left_idx, right_idx = right_idx, seed = chosen_initial)
      resampled_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]

    } else {
      resampled_x = supplyown_resamples[[chosen_initial]]
      resampled_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]
    }


    colnames(resampled_x_df) = paste0("X_", colnames(resampled_x_df))

    final_df_initial = cbind(resampled_x_df, original_Zdf)
    final_df_initial$Y = Y

    if (profileorder_constraint) {
      x_df = resampled_x_df
      first = X_left
      second = X_right
      n = nrow(x_df)
      empty_df = x_df
      for (i in 1:length(first)) {
        empty_df[, first[i]] = x_df[, second[i]]
        empty_df[, second[i]] = x_df[, first[i]]
      }
      X_df = rbind(x_df, empty_df, x_df, empty_df)

      final_df_initial = cbind(X_df, Z_df, Y = full_Y)
    }


    X_initial = model.matrix(Y ~ ., data = final_df_initial, contrasts.arg = lapply(final_df_initial[, -ncol(final_df)], contrasts, contrasts = FALSE))[, -1]


    invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol)))

  } else {
    invisible(capture.output(initial <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol)))
  }
  aa = initial

  invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

  obs_test_stat = CO_stat(fit, idx)

  ## parallel computing setup
  if (parallel) {
    cl <- snow::makeCluster(num_cores)
    doSNOW::registerDoSNOW(cl)
    if (verbose) {
      iterations <- B
      pb <- utils::txtProgressBar(max = iterations, style = 3)
      progress <- function(n) utils::setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
    } else {
      opts = NULL
    }
    ##
    e <- foreach::foreach(j = 1:iterations, .combine = rbind,
                          .options.snow = opts) %dopar%
      {
        set.seed(j + B)
        library(CRTConjoint)

        if (is.null(supplyown_resamples)) {
          resampled_x = resample_func(x = x, left_idx = left_idx, right_idx = right_idx, seed = chosen_initial)
          resampled_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]

        } else {
          resampled_x = supplyown_resamples[[j]]
          resampled_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]
        }



        colnames(resampled_x_df) = paste0("X_", colnames(resampled_x_df))

        final_df_initial = cbind(resampled_x_df, original_Zdf)
        final_df_initial$Y = Y

        if (profileorder_constraint) {
          x_df = resampled_x_df
          first = X_left
          second = X_right
          n = nrow(x_df)
          empty_df = x_df
          for (i in 1:length(first)) {
            empty_df[, first[i]] = x_df[, second[i]]
            empty_df[, second[i]] = x_df[, first[i]]
          }
          X_df = rbind(x_df, empty_df, x_df, empty_df)

          final_df_initial = cbind(X_df, Z_df, Y = full_Y)
        }


        X_initial = model.matrix(Y ~ ., data = final_df_initial, contrasts.arg = lapply(final_df_initial[, -ncol(final_df_initial)], contrasts, contrasts = FALSE))[, -1]


        invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df_initial$Y, lam= best_lam, tol = tol, aa = aa)))

        resamp_TS = CO_stat(initial, idx)

        return(resamp_TS)
      }

    snow::stopCluster(cl)

  } else {
    e = vector()
    for (j in 1:B) {

      if (is.null(supplyown_resamples)) {
        resampled_x = resample_func(x = x, left_idx = left_idx, right_idx = right_idx, seed = chosen_initial)
        resampled_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]

      } else {
        resample_x = supplyown_resamples[[j]]
        resample_x_df = resampled_x[(x[, task_var] %in% X_tasks), ]
      }



      colnames(resampled_x_df) = paste0("X_", colnames(resampled_x_df))

      final_df_initial = cbind(resampled_x_df, original_Zdf)
      final_df_initial$Y = Y

      if (profileorder_constraint) {
        x_df = resampled_x_df
        first = X_left
        second = X_right
        n = nrow(x_df)
        empty_df = x_df
        for (i in 1:length(first)) {
          empty_df[, first[i]] = x_df[, second[i]]
          empty_df[, second[i]] = x_df[, first[i]]
        }
        X_df = rbind(x_df, empty_df, x_df, empty_df)

        final_df_initial = cbind(X_df, Z_df, Y = full_Y)
      }


      X_initial = model.matrix(Y ~ ., data = final_df_initial, contrasts.arg = lapply(final_df_initial[, -ncol(final_df)], contrasts, contrasts = FALSE))[, -1]


      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df_initial$Y, lam= best_lam, tol = tol, aa = aa)))

      e[j] = CO_stat(initial, idx)
      if (verbose) {
        cat(paste0("Done with task: ",j, " out of ", B, " resamples"))
      }
    }


  }
  p_val = (length(which(e >= as.numeric(obs_test_stat[1]))) + 1)/(B + 1)

  out = list()
  out$p_val = p_val
  out$obs_test_stat = obs_test_stat
  out$resampled_test_stat = e
  out$tol = tol
  out$hiernet_fit = fit
  out$seed = seed
  out$lam = best_lam
  return(out)
}

# Main helper function that performs CRT to test for no fatigue effect
get_fatigueeffect = function(x, y, left_idx, right_idx, B, num_cores, lambda, non_factor_idx, tol, speedup, seed, parallel, profileorder_constraint, task_var, respondent_var, nfolds, verbose) {
  # checks
  if (!(all(sapply(x[, left_idx[!left_idx %in% non_factor_idx]], function(x) is(x, "factor"))))) stop("left factors are not all factors please supply non_factor_idx")

  if (!(all(sapply(x[, right_idx[!right_idx %in% non_factor_idx]], function(x) is(x, "factor"))))) stop("right factors are not all factors please supply non_factor_idx")

  if (length(left_idx) != length(right_idx)) stop("length left idx does not match right idx")

  if((length(unique(table(x[, task_var])))) != 1) {stop("Some tasks are missing")}

  # forcing no profile order effect constraint
  if (profileorder_constraint) {
    x_df = x
    n = nrow(x_df)
    empty_df = x_df
    y_new = 1 - y
    for (i in 1:length(left_idx)) {
      empty_df[, left_idx[i]] = x_df[, right_idx[i]]
      empty_df[, right_idx[i]] = x_df[, left_idx[i]]
    }

    final_df = rbind(x_df, empty_df)
    final_df$Y = c(y, y_new)

  } else {
    final_df = x
    final_df$Y = y
  }

  final_df = final_df[, c(left_idx, right_idx, task_var, ncol(final_df))]
  task_var_name = colnames(x)[task_var]
  final_df_var_idx = which(colnames(final_df) == task_var_name)
  set.seed(seed)

  # Creates only Z matrix to perform CV on Z
  if (speedup) {
    Z_df = final_df[, !(colnames(final_df) %in% task_var_name)]
    if (is.null(non_factor_idx)) {
      Z_long = model.matrix(Y ~ . , Z_df, contrasts.arg = lapply(Z_df[, 1:(ncol(Z_df) - 1)], contrasts, contrasts = FALSE))[, -1]
    } else {
      non_factor_names = colnames(x)[non_factor_idx]
      Z_nonfactor_idx = (1:ncol(Z_df))[(colnames(Z_df) %in% non_factor_names)]
      Z_long = model.matrix(Y ~ . , Z_df, contrasts.arg = lapply(Z_df[, -c(Z_nonfactor_idx, ncol(Z_df))], contrasts, contrasts = FALSE))[, -1]

    }

    # obtains CV lambda
    best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = Z_long, y_var = Z_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
    if (verbose) {
      cat("Initial step completed: finished computing cross validated lambda")
    }
  } else {
    best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)

  }

  form = formula(paste0("Y ~ ."))
  X = model.matrix(form, final_df, contrasts.arg = lapply(final_df[, -c(non_factor_idx, final_df_var_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

  num_task = length(unique(x[, task_var]))
  respondent_idx = unique(x[, respondent_var])

  if (speedup) {
    chosen_initial = sample(c(0:B), size = 1)
    if (chosen_initial != 0) {

      # shuffle task number
      new_x = x
      for (i in 1:(length(respondent_idx))) {
        tasks = x[, task_var][(x[, respondent_var] == respondent_idx[i])]
        new_tasks = sample(tasks, replace = FALSE)
        new_x[, task_var][(x[, respondent_var] == respondent_idx[i])] = new_tasks
      }
      a = new_x[, task_var]
      resampled = final_df

      if (profileorder_constraint) {
        resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
      } else {
        resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
      }

      X_initial = model.matrix(form, resampled, contrasts.arg = lapply(resampled[, -c(non_factor_idx, final_df_var_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol)))
    } else {
      invisible(capture.output(initial <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol)))
    }
    aa = initial
  } else {
    aa = NULL
  }


  invisible(capture.output(fit <- hierNet_logistic(as.matrix(X), final_df$Y, lam= best_lam, tol = tol, aa = aa)))

  I_1 = as.vector(fit$th[final_df_var_idx, ])

  I_2 = as.vector(t(fit$th[, final_df_var_idx]))

  I = (I_1 + I_2)/2

  obs_test_stat = sum(I^2)/2

  ## parallel computing setup
  if (parallel) {
    cl <- snow::makeCluster(num_cores)
    doSNOW::registerDoSNOW(cl)
    if (verbose) {
      iterations <- B
      pb <- utils::txtProgressBar(max = iterations, style = 3)
      progress <- function(n) utils::setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
    } else {
      opts = NULL
    }
    ##
    e <- foreach::foreach(j = 1:iterations, .combine = rbind,
                          .options.snow = opts) %dopar%
      {
        set.seed(j + B)
        library(CRTConjoint)
        # shuffle task number
        new_x = x
        for (i in 1:(length(respondent_idx))) {
          tasks = x[, task_var][(x[, respondent_var] == respondent_idx[i])]
          new_tasks = sample(tasks, replace = FALSE)
          new_x[, task_var][(x[, respondent_var] == respondent_idx[i])] = new_tasks
        }
        a = new_x[, task_var]
        resampled = final_df

        if (profileorder_constraint) {
          resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
        } else {
          resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
        }

        X_initial = model.matrix(form, resampled, contrasts.arg = lapply(resampled[, -c(non_factor_idx, final_df_var_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

        if (!speedup) {
          best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X_initial), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
        }

        invisible(capture.output(fit <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol, aa = aa)))


        I_1 = as.vector(fit$th[final_df_var_idx, ])

        I_2 = as.vector(t(fit$th[, final_df_var_idx]))

        I = (I_1 + I_2)/2

        resamp_TS = sum(I^2)/2

        return(resamp_TS)
      }

    snow::stopCluster(cl)

  } else {
    e = vector()
    for (j in 1:B) {
      set.seed(j + B)
      # shuffle task number
      new_x = x
      for (i in 1:(length(respondent_idx))) {
        tasks = x[, task_var][(x[, respondent_var] == respondent_idx[i])]
        new_tasks = sample(tasks, replace = FALSE)
        new_x[, task_var][(x[, respondent_var] == respondent_idx[i])] = new_tasks
      }
      a = new_x[, task_var]
      resampled = final_df

      if (profileorder_constraint) {
        resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
      } else {
        resampled[, (colnames(final_df) %in% task_var_name)] = c(a,a)
      }
      X_initial = model.matrix(form, resampled, contrasts.arg = lapply(resampled[, -c(non_factor_idx, final_df_var_idx, ncol(final_df))], contrasts, contrasts = FALSE))[, -1]

      if (!speedup) {
        best_lam = hierNet_logistic_CV(lambda, nfolds = nfolds, X = as.matrix(X_initial), y_var = final_df$Y, tol = tol, constraint = profileorder_constraint, seed = seed)
      }

      invisible(capture.output(fit <- hierNet_logistic(as.matrix(X_initial), final_df$Y, lam= best_lam, tol = tol, aa = aa)))


      I_1 = as.vector(fit$th[final_df_var_idx, ])

      I_2 = as.vector(t(fit$th[, final_df_var_idx]))

      I = (I_1 + I_2)/2

      e[j] = sum(I^2)/2
      if (verbose) {
        cat(paste0("Done with task: ",j, " out of ", B, " resamples"))
      }
    }


  }
  p_val = (length(which(e >= as.numeric(obs_test_stat[1]))) + 1)/(B + 1)

  out = list()
  out$p_val = p_val
  out$obs_test_stat = obs_test_stat
  out$resampled_test_stat = e
  out$tol = tol
  if (speedup) {
    out$lam = best_lam
  }
  out$hiernet_fit = fit
  out$seed = seed
  return(out)
}

Try the CRTConjoint package in your browser

Any scripts or data that you put into this service are public.

CRTConjoint documentation built on June 9, 2022, 9:06 a.m.