R/calculate_convex_hull_moment_matrix.R

Defines functions gen_momentsmatrix_constrained interpolate_convex_hull convhull_halfspace

Documented in convhull_halfspace gen_momentsmatrix_constrained interpolate_convex_hull

#' @title Compute convex hull in half-space form
#' @param points A matrix or data frame of numeric coordinates, size n x d
#' @keywords internal
#'
#' @return A list with elements:
#'   - A: a matrix of size m x d (m = number of facets)
#'   - b: a length-m vector
#'   so that x is inside the hull if and only if A %*% x + b >= 0 (for all rows).
#'   - volume: Total volume of the convex hull
convhull_halfspace = function(points) {
  pts_mat = as.matrix(points)

  ch = geometry::convhulln(pts_mat, options = "Fa", output.options = "n")
  chvol = geometry::convhulln(pts_mat, options = "Fa", output.options = "FA")

  hull_facets = ch$hull
  facet_norms = ch$normals

  m = nrow(facet_norms) # number of facets
  d = ncol(facet_norms) # dimension

  A = facet_norms[, seq_len(d - 1), drop = FALSE]
  b = facet_norms[, d]

  list(
    A = A,
    b = b,
    volume = chvol$vol
  )
}

#' @title Interpolate points within a convex hull
#'
#' @param points A numeric matrix (n x d).
#' @param ch_halfspace Convex hull. Output of `convhull_halfspace`
#' @param n_samples_per_dimension Number of samples per dimension (pre filtering) to take
#'
#' @keywords internal
#'
#' @return A list with:
#'   - data: The interpolated points
#'   - on_edge: A logical vector indicating whether the points
#'
interpolate_convex_hull = function(
  points,
  ch_halfspace,
  n_samples_per_dimension = 11
) {
  stopifnot(is.matrix(points), ncol(points) >= 2)

  facet_normals = ch_halfspace$A # same number of rows as hull_facets
  facet_offsets = ch_halfspace$b # same number of rows as hull_facets

  ranges = apply(points, 2, range)

  eg_list = list()
  for (i in seq_len(ncol(ranges))) {
    eg_list[[i]] = seq(
      ranges[1, i],
      ranges[2, i],
      length.out = n_samples_per_dimension
    )
  }
  numeric_eg = do.call(expand.grid, args = eg_list)
  filter_pts_edge = rep(FALSE, nrow(numeric_eg))
  filter_pts = rep(TRUE, nrow(numeric_eg))

  for (i in seq_len(nrow(facet_normals))) {
    vec_single = facet_normals[i, ]
    proj_pts = apply(numeric_eg, 1, \(x) sum(x * vec_single) + facet_offsets[i])
    filter_pts = filter_pts & proj_pts < 1e-8
    filter_pts_edge = filter_pts_edge | abs(proj_pts) < 1e-8
  }
  return(list(
    data = numeric_eg[which(filter_pts), , drop = FALSE],
    on_edge = filter_pts_edge[filter_pts]
  ))
}

#' @title Approximate continuous moment matrix over a numeric bounding box
#'
#' @description Treats the min–max range of each numeric column as a bounding box
#' and samples points uniformly to approximate the integral of the outer product of
#' the model terms, weighted by whether the point
#' is on the edge.
#'
#' @param candidate_set Candidate set
#' @param formula Default `~ .`. Model formula specifying the terms.
#' @param n_samples_per_dimension Default `10`. Number of samples to take per dimension when interpolating inside
#' the convex hull.
#'
#' @keywords internal
#' @return A matrix of size `p x p` (where `p` is the number of columns in the model matrix),
#'   approximating the continuous moment matrix.
#'
gen_momentsmatrix_constrained = function(
  candidate_set,
  formula = ~.,
  n_samples_per_dimension = 10,
  user_provided_high_res_candidateset = FALSE
) {
  # Identify numeric columns (continuous factors)
  is_numeric_col = vapply(candidate_set, is.numeric, logical(1))
  numeric_cols = names(candidate_set)[is_numeric_col]
  factor_cols = names(candidate_set)[!is_numeric_col]

  if (length(numeric_cols) == 0) {
    stop(
      "No numeric columns found in data. Cannot compute continuous bounding box."
    )
  }
  M_acc = NA
  total_weight = 0

  # Simple if all numeric: just integrate over the region.
  if (length(factor_cols) == 0) {
    sub_candidate_set = as.matrix(candidate_set)
    if (user_provided_high_res_candidateset) {
      mm_candidate_set = sub_candidate_set
      vol = nrow(mm_candidate_set)
    } else {
      if (ncol(sub_candidate_set) == 1) {
        mm_candidate_set = matrix(
          seq(
            min(sub_candidate_set),
            max(sub_candidate_set),
            length.out = n_samples_per_dimension
          ),
          ncol = 1
        )
        interp_ch = list()
        interp_ch$on_edge = rep(FALSE, nrow(mm_candidate_set))
        vol = max(sub_candidate_set) - min(sub_candidate_set)
      } else {
        ch = convhull_halfspace(sub_candidate_set)
        vol = ch$volume
        interp_ch = interpolate_convex_hull(
          as.matrix(sub_candidate_set),
          ch,
          n_samples_per_dimension = n_samples_per_dimension
        )
        mm_candidate_set = interp_ch$data
      }
    }

    colnames(mm_candidate_set) = numeric_cols
    interp_df = as.data.frame(mm_candidate_set)

    # Now build model matrix
    Xsub = model.matrix(formula, data = interp_df)

    w = rep(1, nrow(Xsub))
    if(!user_provided_high_res_candidateset) {
      w[interp_ch$on_edge] = 0.5
    }
    # average subregion moment
    Xsub_w = apply(Xsub, 2, \(x) x * sqrt(w))

    M = crossprod(Xsub_w) / sum(w)

    #Scale by the intercept
    if (colnames(M)[1] == "(Intercept)") {
      M = M / M[1, 1]
    }
    return(M)
  } else {
    M_acc = NA
    # For categorical factors with disallowed combinations, we need to account for the
    # reduced domain of the integral. We'll calculate a moment matrix as above for each
    # factor level combination, weigh it by the total number of points, and sum it. That
    # will give us the average prediction variance for the constrained region.
    unique_combos = unique(candidate_set[, factor_cols, drop = FALSE])

    named_contrast = function() {
      skpr::contr.simplex
    }
    skpr::contr.simplex

    get_contrasts_from_candset = function(candset) {
      csn = colnames(candset)[!unlist(lapply(candset, is.numeric))]
      contrasts_return = vector(mode = "list", length = length(csn))
      for(i in seq_along(csn)) {
        single_contrast = contr.simplex(unique(candset[[ csn[i] ]]))
        colnames(single_contrast) = paste0("_skpr_FACTOR_",seq_len(ncol(single_contrast)))
        contrasts_return[[i]] = single_contrast
      }
      names(contrasts_return) = csn
      return(contrasts_return)
    }

    # We'll accumulate a weighted sum of sub-matrices
    total_weight = 0

    for (r in seq_len(nrow(unique_combos))) {
      combo_row = unique_combos[r, , drop = FALSE]

      # subset of 'candidate_set' that matches this combo
      is_match = TRUE
      for (fc in factor_cols) {
        is_match = is_match & (candidate_set[[fc]] == combo_row[[fc]])
      }
      sub_candidate_set = candidate_set[is_match, , drop = FALSE]
      sub_candidate_set = sub_candidate_set[, is_numeric_col, drop = FALSE]
      # If no rows => disallowed or doesn't appear => skip
      if (nrow(sub_candidate_set) == 0) {
        next
      }
      if (user_provided_high_res_candidateset) {
        mm_candidate_set = sub_candidate_set
        vol = nrow(mm_candidate_set)
      } else {
        if (ncol(sub_candidate_set) == 1) {
          mm_candidate_set = matrix(
            seq(
              min(sub_candidate_set),
              max(sub_candidate_set),
              length.out = n_samples_per_dimension
            ),
            ncol = 1
          )
          interp_ch = list()
          interp_ch$on_edge = rep(FALSE, nrow(mm_candidate_set))
          vol = max(sub_candidate_set) - min(sub_candidate_set)
        } else {
          ch = convhull_halfspace(sub_candidate_set)
          if (ch$volume <= 0) {
            next
          }
          vol = ch$volume
          interp_ch = interpolate_convex_hull(
            as.matrix(sub_candidate_set),
            ch,
            n_samples_per_dimension = n_samples_per_dimension
          )
          mm_candidate_set = interp_ch$data
        }
      }

      colnames(mm_candidate_set) = numeric_cols
      interp_df = as.data.frame(mm_candidate_set)

      # Pin the factor columns at this combo
      for (fc in factor_cols) {
        interp_df[[fc]] = combo_row[[fc]]
      }

      # Now build model matrix
      Xsub = model.matrix(
        formula,
        data = interp_df,
        contrasts.arg = get_contrasts_from_candset(candidate_set)
      )

      w = rep(1, nrow(Xsub))
      if(!user_provided_high_res_candidateset) {
        w[interp_ch$on_edge] = 0.5
      }
      # average subregion moment
      Xsub_w = apply(Xsub, 2, \(x) x * sqrt(w))

      M_sub = crossprod(Xsub_w) / sum(w)

      # Weighted accumulation
      if (all(is.na(M_acc))) {
        M_acc = vol * M_sub
      } else {
        M_acc = M_acc + vol * M_sub
      }
      total_weight = total_weight + vol
    }

    M = M_acc / total_weight
    #Scale by the intercept
    if (colnames(M)[1] == "(Intercept)") {
      M = M / M[1, 1]
    }
    #Now scan for categorical factors and set off-diagonals to zero
    is_factor_term = grepl("_skpr_FACTOR_", colnames(model.matrix(
      formula,
      data = candidate_set,
      contrasts.arg = get_contrasts_from_candset(candidate_set)
    )))
    colnames(M) = gsub("_skpr_FACTOR_",replacement = "",x = colnames(M))
    rownames(M) = gsub("_skpr_FACTOR_",replacement = "",x = rownames(M))

    for(col in seq_len(ncol(M))) {
      for(row in seq_len(nrow(M))) {
        if(col != row) {
          if(is_factor_term[row] || is_factor_term[col]) {
            M[row,col] = 0
          }
        }
      }
    }
    return(M)
  }
}
tylermorganwall/skpr documentation built on April 13, 2025, 5:35 p.m.