R/robust_ggower.R

Defines functions robust_ggower

Documented in robust_ggower

#' Compute Robust Generalized Gower Distance
#'
#' Computes a weighted, robust version of the Gower distance for mixed-type data
#' (continuous, binary, categorical). Continuous variables are handled via a
#' robust Mahalanobis distance using a supplied robust covariance matrix.
#' Binary and categorical variables are transformed into distances via similarity coefficients.
#'
#' @param data Numeric matrix or data frame with all variables combined.
#' @param w Numeric vector of weights for each observation. Will be normalized internally.
#' @param p Integer vector of length 3: \code{c(#cont, #binary, #categorical)}.
#' @param robust_cov Robust covariance matrix for continuous variables.
#'
#' @details
#' The function computes distances separately for continuous, binary, and categorical variables,
#' then scales each by its geometric variability and combines them. The output is a normalized
#' squared distance matrix suitable for robust clustering or aggregation procedures.
#'
#' Continuous distances are Mahalanobis distances: \eqn{(x-y)^T (S)^-1 (x-y)}.
#' Categorical distances use a matching coefficient.
#' Binary distances are modified to account for positive/negative matches.
#'
#' @return A numeric matrix of squared robust Gower distances, normalized by geometric variability.
#' @keywords internal
robust_ggower<-function(data,w,p,robust_cov){ # data: data matrix, w: weights, p: 3-vector, robust_cov: robust covariance matrix
  pc<-p[1] # number of continuous variables
  pb<-p[2] # number of binary variables
  pq<-p[3] # number of categorical variables
  n<-dim(data)[1] # number of individuals
  w<-w/sum(w) # weights
  # ------------------------------------------------------------------
  #  Quantitative variables
  # ------------------------------------------------------------------
  numvar<-function(x,y,robust_cov){ # x,y: row vectors
    inv_robust_cov<-solve(robust_cov) # inverse of robust covariance matrix
    md<-t(x-y)%*%inv_robust_cov%*%(x-y) # Mahalanobis distance
  }
  # ------------------------------------------------------------------
  #  Categorical variables
  # ------------------------------------------------------------------
  categvar<-function(x,y){ # x,y: row vectors
    nmatch<-0 # number of matches
    nmatch<-sum(x==y) # number of matches
    s<-nmatch/length(x) # similarity coefficient
    d<-2*(1-s) # distance
  }
  # ------------------------------------------------------------------
  #  Binary variables
  # ------------------------------------------------------------------
  binaryvar<-function(x,y){ # x,y: row vectors
    nposmatch<-0 # number of positive matches
    nnegmatch<-0 # number of negative matches
    nposmatch<-sum(x*y) # number of positive matches
    nnegmatch<-sum((x-y)==0)-sum(x*y) # number of negative matches
    if (nnegmatch==length(x)){ # if all binary variables are different
      s<-0 # similarity coefficient
    }
    else{ # if there are some binary variables that are the same
      s<-nposmatch/(length(x)-nnegmatch) # similarity coefficient
    }
    d<-2*(1-s) # distance
  }
  # ------------------------------------------------------------------
  if (pc>0){ # if there are continuous variables
    D_num<-proxy::dist(x=data[,1:pc],y=NULL,method=numvar,robust_cov) # distance matrix
    D_num <- dbstats::as.D2(D_num) # D2 matrix
    gv_num <- 1/2*t(w)%*%D_num%*%w # geometric variability
    gv_num <- as.numeric(gv_num) # numeric
  }
  else{ # if there are no continuous variables
    D_num<-0 # distance matrix
    gv_num<-1 # geometric variability
  }
  if (pb>0){ # if there are binary variables
    D_bin<-proxy::dist(x=data[,(pc+1):(pc+pb)],y=NULL,method=binaryvar) # distance matrix
    D_bin<-dbstats::as.D2(D_bin) # D2 matrix
    gv_bin<-1/2*t(w)%*%D_bin%*%w  # geometric variability
    gv_bin<-as.numeric(gv_bin) # numeric
  }
  else{ # if there are no binary variables
    D_bin<-0 # distance matrix
    gv_bin<-1 # geometric variability
  }
  if (pq>0){ # if there are categorical variables
    D_cat<-proxy::dist(x=data[,(pc+pb+1):(pc+pb+pq)],y=NULL,method=categvar) # distance matrix
    D_cat<-dbstats::as.D2(D_cat) # D2 matrix
    gv_cat<-1/2*t(w)%*%D_cat%*%w # geometric variability
    gv_cat<-as.numeric(gv_cat) # numeric
  }
  else{ # if there are no categorical variables
    D_cat<-0 # distance matrix
    gv_cat<-1 # geometric variability
  }
  # ------------------------------------------------------------------
  D<-(D_num/gv_num)+(D_bin/gv_bin)+(D_cat/gv_cat) # Gower distance
  gv_D<-1/2*t(w)%*%D%*%w # geometric variability
  gv_D<-as.numeric(gv_D ) # numeric
  robust_gower_dist<-D/gv_D # robust Gower distance
  return(robust_gower_dist)
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.