#' Calculate a covariance matrix
#'
#' Calculate a covariance matrix of the design matrix \code{x} using one of the methods
#' \code{complete_observations}, \code{pairwise_covariance}, \code{loh_wainwright_bias_correction} or \code{average_imputation}.
#'
#' @param x An n x p design matrix.
#' @param NA_method The method that should be applied to estimate the covariance structure.
#' @param min_points Minimum number of available observations a variable must have for its
#' variance estimate to be considered reliable. An integer not smaller than 2.
#' @return A list with \code{mat}, a p x p estimate of the covariance structure of \code{x};
#' \code{inds}, a logical vector indicating for which variables the variance could be estimated;
#' and \code{n_eff_obs}, the effective number of observations. If fewer than \code{min_points}
#' observations are available for some variable, the corresponding rows and columns of \code{mat} are NA.
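#' @examples
#' # Minimal illustrative sketch on simulated data (hypothetical, not from the original
#' # package documentation): estimate the covariance structure of a design matrix with
#' # missing entries via pairwise covariances.
#' set.seed(1)
#' x <- matrix(rnorm(100), nrow = 20, ncol = 5)
#' x[sample(length(x), 15)] <- NA
#' fit <- get_cov_mat(x, NA_method = "pairwise_covariance", min_points = 5)
#' fit$mat
#' fit$inds
#' fit$n_eff_obs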
#' @export
get_cov_mat <- function(x, NA_method = c('complete_observations', 'pairwise_covariance',
'loh_wainwright_bias_correction', 'average_imputation'), min_points = 2){
stopifnot(min_points >= 2)
NA_mth <- match.arg(NA_method)
p <- ncol(x)
cov_mat <- matrix(NA, ncol = p, nrow = p)
if(NA_mth == "complete_observations"){
n_obs <- sum(complete.cases(x))
if(n_obs < 2){
return(list(mat = cov_mat, inds = rep(F, p), n_eff_obs = 0))
} else {
return(list(mat = (n_obs - 1) / n_obs * cov(x, use = 'na.or.complete'), inds = rep(T, ncol(x)), n_eff_obs = n_obs))
}
} else {
# calculate available entries for each column
available_obs <- apply(!is.na(x), 2, sum)
# we can only estimate variance for columns with at least 2 available entries
n <- nrow(x)
inds <- (available_obs >= max(2, min_points))
# if no variable has enough observations, return NA
if(!any(inds)){
return(list(mat = cov_mat, inds = rep(F, p), n_eff_obs = 0))
}
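# effective number of observations: total number of observed entries divided by the number of variables kept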
n_eff_obs <- sum(available_obs) / sum(inds)
# pairwise covariance method. Calculates pairwise covariances and imputes covariances that could not be estimated with 0.
if (NA_mth == "pairwise_covariance"){
temp <- cov(x[, inds, drop = F], use = 'pairwise')
temp[is.na(temp)] <- 0
# As the result might not be positive semi-definite, which is required for e.g. glasso, project
# onto closest (wrt Frobenius) positive definite matrix
cov_mat[inds, inds] <- as.matrix(Matrix::nearPD(temp)$mat)
# Loh-Wainwright bias correction method. Tries to remove bias of (co-)variance estimation if missing
# values are present.
} else if (NA_mth == "loh_wainwright_bias_correction"){
# first center the observations and impute the missing values with the mean of the corresponding column
z <- scale(x[, inds, drop = F], T, F)
z[is.na(z)] <- 0
# estimate local missingness probability per predictor
miss_frac <- 1 - available_obs[inds, drop = F] / nrow(x)
stopifnot(length(miss_frac) == ncol(z))
# Initiate helper matrix for LW bias correction
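# M has entries (1 - p_i) * (1 - p_j) off the diagonal and (1 - p_i) on the diagonal, where p_i is the
# missing fraction of variable i; dividing the zero-imputed covariance entrywise by M removes the
# downward bias introduced by the zero-imputation (assuming entries are missing completely at random)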
M <- matrix(rep((1 - miss_frac), ncol(z)), ncol = ncol(z))
M <- t(M) * M
diag(M) <- 1 - miss_frac
cov_mat[inds, inds] <- as.matrix(Matrix::nearPD(cov(z) * (n - 1) / n / M)$mat)
# to avoid double debiasing, use n_eff_obs = n_obs = nrow(x)
#n_eff_obs <- nrow(z)
} else if (NA_mth == "average_imputation"){
# center the observations and impute the missing values with the mean of the corresponding column
z <- scale(x[, inds, drop = F], T, F)
z[is.na(z)] <- 0
cov_mat[inds, inds] <- cov(z) * (n_eff_obs - 1) / n_eff_obs
}
list(mat = cov_mat, inds = inds, n_eff_obs = n_eff_obs)
}
}
#' Negative loglikelihood of a multivariate normal
#'
#' @param x A design matrix
#' @param mu Mean of the multivariate Gaussian distribution
#' @param cov_mat Covariance matrix of the multivariate Gaussian distribution. Defaults to \code{solve(cov_mat_inv)}.
#' @param cov_mat_inv Precision matrix of the multivariate Gaussian distribution.
#' @return An array with the negative log-likelihoods of the observations in \code{x} under the specified
#' Gaussian model. Observations with missing entries contribute only through their observed coordinates.
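#' @examples
#' # Minimal illustrative sketch on simulated data (hypothetical): negative log-likelihoods
#' # under a standard Gaussian model, with one entry missing.
#' set.seed(1)
#' x <- matrix(rnorm(20), nrow = 5, ncol = 4)
#' x[1, 2] <- NA
#' loglikelihood(x, mu = rep(0, 4), cov_mat_inv = diag(4))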
#' @export
loglikelihood <- function(x, mu, cov_mat_inv, cov_mat = solve(cov_mat_inv)){
if(!is.matrix(x)){
x <- as.matrix(x)
warning('non-matrix input to loglikelihood; coercing with as.matrix()')
}
n <- nrow(x)
p <- ncol(x)
loss <- rep(NA, n)
# to avoid multiple inversions of cov_mat[inds_cur, inds_cur], order observations by missingness
# structure and handle them in groups with the same missingness structure
ranks <- data.table::frank(data.table::data.table(is.na(x)))
for (j in unique(ranks)){
# x_cur is a group of observations with the same missingness structure
x_cur <- x[ranks == j, , drop = F]
inds_cur <- !is.na(x_cur[1, ])
# are there any missing values and at least one observed value?
if(any(!inds_cur) & any(inds_cur)){
log_det <- - determinant(cov_mat_inv[inds_cur, inds_cur, drop = F], logarithm = TRUE)$modulus
v <- t(x_cur[, inds_cur, drop = F]) - mu[inds_cur, drop = F]
#distance <- diag(t(v) %*% solve(cov_mat[inds_cur, inds_cur, drop = F], v))
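# using the corresponding submatrix of the precision matrix (below) instead of inverting
# cov_mat[inds_cur, inds_cur] (above, commented out) avoids a solve per missingness pattern,
# at the cost of approximating the exact marginal distance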
distance <- diag(t(v) %*% cov_mat_inv[inds_cur, inds_cur, drop = F] %*% v)
loss[ranks == j] <- distance + log_det + sum(inds_cur) * log(2 * pi)
# are there no missing values?
} else if (!any(!inds_cur)){
log_det <- determinant(cov_mat, logarithm = TRUE)$modulus
v <- t(x_cur) - mu
distance <- diag(t(v) %*% cov_mat_inv %*% v)
loss[ranks == j] <- distance + log_det + sum(inds_cur) * log(2 * pi)
# are there no observed values?
} else if (all(!inds_cur)){
loss[ranks == j] <- 0
}
}
# return loss vector
loss/2
}
#' Get a glasso fit
#'
#' @param x A design matrix
#' @param lambda Regularisation parameter for the glasso function.
#' @param control An object of class \code{hdcd_control} generated by \link{hdcd_control}.
#' @return A list with \code{cov_mat}, the initial estimate of the covariance structure;
#' \code{inds}, a logical vector indicating the variables for which a variance could be estimated;
#' \code{wi}, the precision matrix estimated by the glasso, and \code{w}, the corresponding covariance matrix;
#' \code{mu}, the column means of \code{x}; and \code{loglik}, a rescaled version of the log-likelihood returned by the glasso.
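#' @examples
#' # Minimal illustrative sketch on simulated data (hypothetical). A plain list carrying the
#' # fields read by this function stands in for the control object returned by hdcd_control();
#' # the values chosen below are illustrative only.
#' set.seed(1)
#' x <- matrix(rnorm(200), nrow = 40, ncol = 5)
#' ctrl <- list(glasso_penalize_diagonal = FALSE, glasso_standardize = TRUE,
#'              glasso_threshold = 1e-4, segment_loss_min_points = 2,
#'              glasso_NA_method = "complete_observations", n_obs = nrow(x))
#' fit <- get_glasso_fit(x, lambda = 0.1, control = ctrl)
#' round(fit$wi, 2)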
#' @importFrom glasso glasso
#' @export
get_glasso_fit <- function(x, lambda, control){
penalize_diagonal <- control$glasso_penalize_diagonal
standardize <- control$glasso_standardize
threshold <- control$glasso_threshold
min_points <- control$segment_loss_min_points
NA_method <- control$glasso_NA_method
n_obs <- control$n_obs
stopifnot(!any(is.null(penalize_diagonal),
is.null(standardize),
is.null(threshold),
is.null(min_points),
is.null(NA_method),
is.null(n_obs)))
# Get (estimate of) covariance matrix
cov_mat_output <- get_cov_mat(x, NA_method, min_points = min_points)
# Prepare list to save results in
out <- list()
out$cov_mat <- cov_mat_output$mat
out$inds <- cov_mat_output$inds
out$w <- matrix(NA, nrow = ncol(x), ncol = ncol(x))
out$wi <- matrix(NA, nrow = ncol(x), ncol = ncol(x))
p_cur <- sum(out$inds)
if (p_cur == 0){
return(out)
}
# rescale lambda by the square root of the segment's share of observations, as suggested by asymptotic theory
obs_share <- cov_mat_output$n_eff_obs / n_obs
if (standardize){
rho <- lambda / sqrt(obs_share) * diag(out$cov_mat[out$inds, out$inds, drop = F])
} else {
rho <- lambda / sqrt(obs_share)
}
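# note: glasso interprets a vector rho as a penalty of sqrt(rho[j] * rho[k]) on entry (j, k), so
# standardizing corresponds (up to rescaling) to applying the penalty on the correlation scale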
glasso_output <- glasso::glasso(
out$cov_mat[out$inds, out$inds, drop = F],
rho = rho,
penalize.diagonal = penalize_diagonal,
thr = threshold
)
out$w[out$inds, out$inds] <- glasso_output$w
out$wi[out$inds, out$inds] <- glasso_output$wi
out$mu <- colMeans(x, na.rm = T)
# rescaling of loglikelihood returned from glasso
if (!standardize){
penalty <- sum(abs(lambda / sqrt(obs_share) * glasso_output$wi))
} else {
vars <- diag(out$cov_mat[out$inds, out$inds, drop = F])
penalty <- sum(abs(lambda / sqrt(obs_share) * sqrt(vars %*% t(vars)) * glasso_output$wi))
}
out$loglik <- ((-2 / p_cur) * glasso_output$loglik - penalty) * obs_share
out
}