# R/scale_onehot.R

#' One-Hot Binary Matrix Scaling
#'
#' Divides every column of a one-hot (0/1) matrix by a column-specific
#' normalization factor.
#'
#' @param data A one-hot binary matrix.
#' @param n_categories Number of distinct categories of the raw variable that
#'   each column was derived from. Either a single number applied to every
#'   column, or a numeric vector with one entry per column of \code{data}.
#'   The default, \code{NULL}, uses \code{ncol(data)} for every column, which
#'   matches the definition in Details when \code{data} encodes exactly one
#'   categorical variable.
#' @return A scaled numerical matrix with the same dimensions as \code{data}.
#' @details The scaling technique is taken from Outlier Analysis (Aggarwal,
#' 2017), section 8.3. For each column j in the binary transformed matrix, a
#' normalization factor is defined as \code{sqrt(ni * pj * (1 - pj))}, where
#' ni is the number of distinct categories in the reference variable from the
#' raw data set and pj is the proportion of records taking the value of 1 for
#' the jth variable. Each column is divided by its normalization factor.
#'
#' A column that is constant (all 0 or all 1) has a normalization factor of 0,
#' so its scaled values are \code{NaN} or \code{Inf}.
#' @examples
#' # one raw variable with three levels
#' x <- cbind(a = c(1, 0, 0, 1), b = c(0, 1, 0, 0), c = c(0, 0, 1, 0))
#' scale_onehot(data = x)
#'
#' # two raw variables with two levels each
#' y <- cbind(u1 = c(1, 0, 1, 0), u2 = c(0, 1, 0, 1),
#'            v1 = c(1, 1, 0, 0), v2 = c(0, 0, 1, 1))
#' scale_onehot(data = y, n_categories = c(2, 2, 2, 2))
#'
#' \dontrun{
#' scale_onehot(data = one_hot(my_data))
#' onehot(data = my_data, scale = TRUE) # alternative when working with raw data
#' }
#'
#' @export
scale_onehot <- function(data, n_categories = NULL) {
  if (is.null(n_categories)) {
    # Backward-compatible default: treat all columns as one raw variable.
    n_categories <- ncol(data)
  } else {
    valid_length <- length(n_categories) %in% c(1L, ncol(data))
    if (!is.numeric(n_categories) || !valid_length) {
      stop(
        "`n_categories` must be numeric with length 1 or ncol(data).",
        call. = FALSE
      )
    }
  }

  # pj: proportion of ones in each column.
  col_means <- colMeans(data)
  norm_factors <- sqrt(n_categories * col_means * (1 - col_means))

  # Divide column j by norm_factors[j].
  sweep(data, 2, norm_factors, "/")
}
# dannymorris/onehotter documentation built on May 15, 2019, 9:08 p.m.