R/robust_RelMS.R

Defines functions robust_RelMS

Documented in robust_RelMS

#' Robust RelMS Distance
#'
#' Computes a robust version of the Gower distance using the RelMS method
#' for mixed-type data (continuous, binary, categorical).
#' Continuous variables are handled via a robust Mahalanobis distance using a supplied robust covariance matrix.
#' Binary and categorical variables are transformed into distances via similarity coefficients and
#' combined using the RelMS approach.
#'
#' @param data Numeric matrix or data frame with all variables combined.
#' @param w Numeric vector of weights for each observation. Will be normalized internally.
#' @param p Integer vector of length 3: \code{c(#cont, #binary, #categorical)}.
#' @param robust_cov Robust covariance matrix for continuous variables.
#'
#'
#' @details
#' The function computes distances separately for continuous, binary, and categorical variables,
#' then applies the RelMS combination procedure. Continuous distances are Mahalanobis distances,
#' categorical distances use a matching coefficient, and binary distances use a modified similarity coefficient.
#' Eigen decomposition is used to compute the square root matrices needed in the RelMS combination.
#'
#' @return A numeric matrix of squared robust distances normalized by geometric variability.
#' @keywords internal
# Robust Gower distance function====================================
robust_RelMS<-function(data,w,p,robust_cov){ # data: data matrix, w: weights, p: 3-vector, robust_cov: robust covariance matrix
  pc<-p[1] # number of continuous variables
  pb<-p[2] # number of binary variables
  pq<-p[3] # number of categorical variables
  n<-dim(data)[1] # number of individuals
  w<-w/sum(w) # weights
  m<-sum(p!=0) # number of variables
  # ------------------------------------------------------------------
  #  Quantitative variables
  # ------------------------------------------------------------------
  numvar<-function(x,y,robust_cov){ # x,y: row vectors
    inv_robust_cov<-solve(robust_cov) # inverse of robust covariance matrix
    md<-t(x-y)%*%inv_robust_cov%*%(x-y) # Mahalanobis distance
  }
  # ------------------------------------------------------------------
  #  Categorical variables
  # ------------------------------------------------------------------
  categvar<-function(x,y){ # x,y: row vectors
    nmatch<-0 # number of matches
    nmatch<-sum(x==y) # number of matches
    s<-nmatch/length(x) # similarity coefficient
    d<-2*(1-s) # distance
  }
  # ------------------------------------------------------------------
  #  Binary variables
  # ------------------------------------------------------------------
  binaryvar<-function(x,y){ # x,y: row vectors
    nposmatch<-0 # number of positive matches
    nnegmatch<-0 # number of negative matches
    nposmatch<-sum(x*y) # number of positive matches
    nnegmatch<-sum((x-y)==0)-sum(x*y) # number of negative matches
    if (nnegmatch==length(x)){ # if all binary variables are different
      s<-0 # similarity coefficient
    }
    else{ # if there are some binary variables that are the same
      s<-nposmatch/(length(x)-nnegmatch) # similarity coefficient
    }
    d<-2*(1-s) # distance
  }
  # ------------------------------------------------------------------
  if (pc>0){ # if there are continuous variables
    D_num<-proxy::dist(x=data[,1:pc],y=NULL,method=numvar,robust_cov) # distance matrix
    D_num<-dbstats::as.D2(D_num) # D2 matrix
    gv_num<-1/2*t(w)%*%D_num%*%w # geometric variability
    gv_num<-as.numeric(gv_num) # numeric
  }
  else{ # if there are no continuous variables
    D_num<-0 # distance matrix
    gv_num<-1 # geometric variability
  }
  if (pb>0){ # if there are binary variables
    D_bin<-proxy::dist(x=data[,(pc+1):(pc+pb)],y=NULL,method=binaryvar) # distance matrix
    D_bin<-dbstats::as.D2(D_bin) # D2 matrix
    gv_bin<-1/2*t(w)%*%D_bin%*%w # geometric variability
    gv_bin<-as.numeric(gv_bin) # numeric
  }
  else{
    D_bin<-0 # distance matrix
    gv_bin<-1 # geometric variability
  }
  if (pq>0){ # if there are categorical variables
    D_cat<-proxy::dist(x=data[,(pc+pb+1):(pc+pb+pq)],y=NULL,method=categvar) # distance matrix
    D_cat<-dbstats::as.D2(D_cat) # D2 matrix
    gv_cat<-1/2*t(w)%*%D_cat%*%w # geometric variability
    gv_cat<-as.numeric(gv_cat) # numeric
  }
  else{ # if there are no categorical variables
    D_cat<-0 # distance matrix
    gv_cat<-1 # geometric variability
  }
  # ------------------------------------------------------------------
  Dw<-diag(w) # diagonal matrix of weights
  Jw<-diag(n)-rep(1,n)%*%t(w) # Jw matrix
  Jw<-Dw^(1/2)%*%Jw  # We start to work with F's
  epsilon=1.0e-50 # epsilon
  if (pc>0){ # if there are continuous variables
    DA<-D_num/gv_num # Gower distance
    GA<--(0.5)* Jw %*% DA %*% t(Jw) # G matrix
    EIA<-eigen(GA) # eigenvalues and eigenvectors of G
    fA<-Re(EIA$values)[Re(EIA$values)>epsilon] # eigenvalues of G
    ZA<-Re(EIA$vectors)[,Re(EIA$values)>epsilon] # eigenvectors of G
    sqrtGA<-ZA%*%diag(sqrt(fA))%*%t(ZA) # square root of G
  }
  else { # if there are no continuous variables
    GA<-0 # G matrix
    sqrtGA<-matrix(0,nrow=n,ncol=n) # square root of G
  }
  if (pb>0){ # if there are binary variables
    DB<-D_bin/gv_bin # Gower distance
    GB<--(0.5)*Jw %*%DB %*%t(Jw) # G matrix
    EIB<-eigen(GB) # eigenvalues and eigenvectors of G
    fB<-Re(EIB$values)[Re(EIB$values)>epsilon] # eigenvalues of G
    ZB<-Re(EIB$vectors)[,Re(EIB$values)>epsilon] # eigenvectors of G
    sqrtGB<-ZB%*%diag(sqrt(fB))%*%t(ZB) # square root of G
  }
  else { # if there are no binary variables
    GB<-0 # G matrix
    sqrtGB<-matrix(0,nrow=n,ncol=n) # square root of G
  }
  if (pq>0){ # if there are categorical variables
    DC<-D_cat/gv_cat # Gower distance
    GC<--(0.5)*Jw %*% DC %*% t(Jw) # G matrix
    EIC<-eigen(GC) # eigenvalues and eigenvectors of G
    fC<-Re(EIC$values)[Re(EIC$values)>epsilon] # eigenvalues of G
    ZC<-Re(EIC$vectors)[,Re(EIC$values)>epsilon] # eigenvectors of G
    sqrtGC<-ZC%*%diag(sqrt(fC))%*%t(ZC) # square root of G
  }
  else { # if there are no categorical variables
    GC<-0 # G matrix
    sqrtGC<-matrix(0,nrow=n,ncol=n) # square root of G
  }
  # ------------------------------------------------------------------
  GABC2<-GA+GB+GC-1/m*(sqrtGA%*%sqrtGB+sqrtGB%*%sqrtGA+sqrtGA%*%sqrtGC+sqrtGC%*%sqrtGA+sqrtGB%*%sqrtGC+sqrtGC%*%sqrtGB) # G matrix
  GABC2<-diag(w^(-1/2))%*%GABC2%*%diag(w^(-1/2)) # We go to G's
  gABC2<-diag(GABC2) # diagonal of G
  DABC2<-gABC2%*%t(rep(1,n))+(rep(1,n))%*%t(gABC2)-2*GABC2 # Gower distance
  gv_DABC2<-1/2*t(w)%*%DABC2%*%w # geometric variability
  gv_DABC2<-as.numeric(gv_DABC2) # numeric
  DABC2<-DABC2/gv_DABC2 # Are squared distances DABC2 <- as.D2(DABC2)
  return(DABC2) # return squared distances
}

Try the dbrobust package in your browser

Any scripts or data that you put into this service are public.

dbrobust documentation built on Nov. 5, 2025, 6:24 p.m.