R/dist_functions.R

Defines functions mahalanobize weightit_distances is.cov_like check_X get.covs.matrix.for.dist eucdist_internal transform_covariates

#Functions to compute distance matrices
#Copied from MatchIt with edits

#Transforms covariates so that Euclidean distance computed on transforms covariates is equivalent to
#requested distance. When discarded is not NULL, statistics relevant to transformation are computed
#using retained units, but full covariate matrix is returned.
transform_covariates <- function(formula = NULL, data = NULL, method = "mahalanobis", s.weights = NULL, var = NULL,
                                 discarded = NULL) {
  X <- get.covs.matrix.for.dist(formula, data)

  X <- check_X(X)

  #If allvariables have no variance, use Euclidean to avoid errors
  #If some have no variance, removes those to avoid messing up distances
  no_variance <- which(apply(X, 2, function(x) abs(max(x) - min(x)) < sqrt(.Machine$double.eps)))
  if (length(no_variance) == ncol(X)) {
    method <- "euclidean"
    X <- X[,1, drop = FALSE]
  }
  else if (length(no_variance) > 0) {
    X <- X[,-no_variance, drop = FALSE]
  }

  method <- match_arg(method, weightit_distances())

  if (is.null(discarded)) discarded <- rep(FALSE, nrow(X))

  if (method == "mahalanobis") {
    # X <- sweep(X, 2, colMeans(X))

    if (is.null(var)) {
      X <- scale(X)
      #NOTE: optmatch and Rubin (1980) use pooled within-group covariance matrix
      if (is.null(s.weights)) {
        var <- cov(X[!discarded,, drop = FALSE])
      }
      else {
        var <- cov.wt(X[!discarded,, drop = FALSE], s.weights[!discarded])$cov
      }
    }
    else if (!is.cov_like(var)) {
      stop("If 'var' is not NULL, it must be a covariance matrix with as many entries as supplied variables.", call. = FALSE)
    }

    inv_var <- try(solve(var), silent = TRUE)
    if (inherits(inv_var, "try-error")) {
      inv_var <- generalized_inverse(var)
    }

    X <- mahalanobize(X, inv_var)
  }
  else if (method == "euclidean") {
    #Do nothing
  }
  else if (method == "scaled_euclidean") {
    if (is.null(var)) {
      sds <- sqrt(col.w.v(X[!discarded,, drop = FALSE], w = s.weights[!discarded]))
    }
    else if (is.cov_like(var, X)) {
      sds <- sqrt(diag(var))
    }
    else if (is.numeric(var) && is.cov_like(diag(var), X)) {
      sds <- sqrt(var)
    }
    else {
      stop("If 'var' is not NULL, it must be a covariance matrix or a vector of variances with as many entries as supplied variables.", call. = FALSE)
    }

    for (i in seq_len(ncol(X))) {
      X[,i] <- X[,i]/sds[i]
    }
  }

  X
}

#Internal function for fast(ish) Euclidean distance
eucdist_internal <- function(X, treat = NULL) {

  # if (is.null(dim(X))) X <- as.matrix(X)

  if (is.null(treat)) {
    if (is.null(dim(X))) {
      d <- abs(outer(X, X, "-"))
      dimnames(d) <- list(names(X), names(X))
    }
    else {
      if (ncol(X) == 1) {
        d <- abs(outer(drop(X), drop(X), "-"))
      }
      else {
        d <- as.matrix(dist(X))
      }
      dimnames(d) <- list(rownames(X), rownames(X))
    }
  }
  else {
    treat_l <- as.logical(treat)
    if (is.null(dim(X))) {
      d <- abs(outer(X[treat_l], X[!treat_l], "-"))
      dimnames(d) <- list(names(X)[treat_l], names(X)[!treat_l])
    }
    else {
      if (ncol(X) == 1) {
        d <- abs(outer(X[treat_l,], X[!treat_l,], "-"))
      }
      else {
        d <- as.matrix(dist(X))[treat_l, !treat_l, drop = FALSE]
      }
      dimnames(d) <- list(rownames(X)[treat_l], rownames(X)[!treat_l])
    }
  }

  d
}

#Get covariates (RHS) vars from formula; factor variable contrasts divided by sqrt(2)
#to ensure same result as when non-factor binary variable supplied (see optmatch:::contr.match_on)
get.covs.matrix.for.dist <- function(formula = NULL, data = NULL) {

  if (is.null(formula)) {
    fnames <- colnames(data)
    fnames[!startsWith(fnames, "`")] <- paste0("`", fnames[!startsWith(fnames, "`")], "`")
    data <- as.data.frame(data)
    formula <- terms(reformulate(fnames), data = data)
  }
  else {
    data <- as.data.frame(data)
    formula <- terms(formula, data = data)
  }

  mf <- model.frame(formula, data, na.action = na.pass)

  chars.in.mf <- vapply(mf, is.character, logical(1L))
  mf[chars.in.mf] <- lapply(mf[chars.in.mf], factor)

  formula <- update(formula, NULL ~ . + 1)

  X <- model.matrix(formula, data = mf,
                    contrasts.arg = lapply(Filter(is.factor, mf),
                                           function(x) contrasts(x, contrasts = FALSE)/sqrt(2)))

  if (ncol(X) > 1) {
    assign <- attr(X, "assign")[-1]
    X <- X[,-1, drop = FALSE]
  }
  attr(X, "assign") <- assign

  attr(X, "treat") <-  model.response(mf)

  X
}
check_X <- function(X) {
  if (isTRUE(attr(X, "checked"))) return(X)

  treat <- attr(X, "treat")

  if (is.data.frame(X)) X <- as.matrix(X)
  else if (is.numeric(X) && is.null(dim(X))) {
    X <- matrix(X, nrow = length(X),
                dimnames = list(names(X), NULL))
  }

  if (anyNA(X)) stop("Missing values are not allowed in the covariates.", call. = FALSE)
  else if (any(!is.finite(X))) stop("Non-finite values are not allowed in the covariates.", call. = FALSE)

  if (!is.numeric(X) || length(dim(X)) != 2) {
    stop("bad X")
  }
  attr(X, "checked") <- TRUE
  attr(X, "treat") <- treat
  X
}

is.cov_like <- function(var, X) {
  is.numeric(var) &&
    length(dim(var)) == 2 &&
    (missing(X) || all(dim(var) == ncol(X))) &&
    isSymmetric(var) &&
    all(diag(var) >= 0)
}
weightit_distances <- function() {
  c("mahalanobis", "euclidean", "scaled_euclidean")
}
mahalanobize <- function(X, inv_var) {
  ## Mahalanobize covariates by computing cholesky decomp,
  ## allowing for NPD cov matrix by pivoting
  ch <- suppressWarnings(chol(inv_var, pivot = TRUE))
  p <- order(attr(ch, "pivot"))
  # r <- seq_len(attr(ch, "rank"))

  tcrossprod(X, ch[, p, drop = FALSE])
}

Try the WeightIt package in your browser

Any scripts or data that you put into this service are public.

WeightIt documentation built on May 31, 2023, 9:25 p.m.