#' Shrinkage Estimation of the Covariance Matrix
#'
#' @description This implements the covariance estimator of Schaefer & Strimmer (2005) which in turn builds
#' upon Ledoit & Wolf's (2004) paper. Shrinkage targets A, B, and D of Schaefer & Strimmer's (2005)
#' paper are offered here, details of which are given below: \cr
#' \cr
#' Target A: Identity (diagonal, unit variance). Off-diagonal entries are shrunk towards zero, while
#' diagonal entries are shrunk towards 1. \cr \cr
#'
#' Target B: Pooled (diagonal, common variance). Off-diagonal entries are shrunk as in Target A, but diagonal
#' entries are shrunk towards a common value. In this package, the geometric mean is used as that common value. Schaefer & Strimmer
#' use the median, but the geometric mean is a good choice because it is preferable to the arithmetic mean
#' when data are > 0 (which variances are) and it is resistant to outlying observations while still accounting for them
#' in its estimate. If you have reason to believe the variances of your covariates should be more similar than
#' different, this is a recommended choice. \cr \cr
#'
#' Target D: Unequal (diagonal, empirical variances). This works just as Target B, but the mixing parameter
#' for the diagonal is set to zero, so the empirical variances are retained along the diagonal. If you have
#' no strong reason to believe the variances of your covariates should be similar to one another, this is a recommended
#' choice, and it is the default shrinkage target. \cr \cr
#'
#' Targets C, E, and F are not offered because they do not guarantee a positive-definite
#' covariance matrix.
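#'
#' For Targets A, B, and D the estimate takes the usual linear shrinkage form: with \eqn{R} the empirical
#' correlation matrix, \eqn{v_i} the empirical variances, \eqn{t_i} the target variances (1 for Target A, the
#' geometric mean of the \eqn{v_i} for Target B, and \eqn{v_i} itself for Target D), and \eqn{\alpha} and \eqn{\alpha_{var}}
#' the two mixing parameters, the returned covariance matrix is assembled from
#' \deqn{\hat{R} = (1-\alpha) R + \alpha I, \qquad \hat{v}_i = (1-\alpha_{var}) v_i + \alpha_{var} t_i}{R.hat = (1 - alpha)*R + alpha*I,  v.hat_i = (1 - alpha.var)*v_i + alpha.var*t_i}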
#'
#' Also available here is an adaptive non-linear shrinkage procedure. The estimator was adapted from the nlshrink
#' package, with a few minor adjustments to simplify usage and speed up the estimation. Note that
#' the penalization method for this is entirely different and is based on a non-linear optimization problem. For details,
#' see Ledoit & Wolf (2015).
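#'
#' Schematically, the non-linear estimator keeps the eigenvectors \eqn{\Gamma} of the sample covariance matrix and
#' replaces its eigenvalues with non-linearly shrunk values \eqn{d^*} obtained from that optimization, returning
#' \deqn{\hat{\Sigma} = \Gamma \, diag(d^*) \, \Gamma'}{Sigma.hat = Gamma diag(d*) Gamma'}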
#'
#' @param x a data frame or matrix of numeric covariates
#' @param w an optional vector of observation weights. if left as NULL, equal weights are used; otherwise the weights are normalized to sum to one.
#' @param alpha a custom value for the off-diagonal mixing parameter. if left as NULL the optimal value will automatically be selected.
#' @param alpha.var a custom value for the diagonal mixing parameter. if left as NULL the optimal value will automatically be selected.
#' @param target one of "unequal" (the default), "identity", "pooled", or "adaptive".
#' @param ... other arguments passed to the non-linear shrinkage routine when target = "adaptive".
#'
#' @return a covariance matrix
#' @export
#'
#' @examples
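#' # A minimal illustrative sketch; mtcars is used only as a stand-in numeric data set.
#' \donttest{
#' S <- covShrink(mtcars, target = "pooled")
#' attr(S, "alpha")     # estimated off-diagonal mixing parameter
#' attr(S, "alpha.var") # estimated diagonal mixing parameter
#' }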
#'
#' @references
#' Schaefer, J. & Strimmer, K. (2005) A shrinkage approach to large-scale covariance estimation and implications for functional genomics. Statist. Appl. Genet. Mol. Biol. 4:32. \cr
#' \cr
#' Ledoit, O. & Wolf, M. (2004) Honey, I shrunk the sample covariance matrix. J. Portfolio Management, 30(4), 110-119. doi: 10.3905/jpm.2004.110 \cr
#' \cr
#' Ledoit, O. & Wolf, M. (2015) Spectrum estimation: a unified framework for covariance matrix estimation and PCA in large dimensions. Journal of Multivariate Analysis, 139(2)
#' \cr
#'
#'
covShrink = function(x, w = NULL, alpha = NULL, alpha.var = NULL, target = c("unequal", "identity", "pooled", "adaptive"), ...){
target <- match.arg(target)
x <- as.matrix(x)
gmean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
if (target != "adaptive"){
if (is.null(w)){w = rep(1/nrow(x), nrow(x))}else{w = w/sum(w)}
if (is.null(alpha)) alpha = .estimateAlpha(x, w)
if (is.null(alpha.var)) alpha.var = .estimateAlphaVar(x, w, target)
rho <- cov2cor(t(x)%*%(diag(w)%*%x))
}
if (target == "identity"){
rho.shrink <- ((1-alpha)*rho) + (alpha*diag(rep(1,ncol(x))))
gv = gmean(colVars(x))
vars = sapply(colVars(x), function(i) ((1-alpha.var)*i)+(alpha.var))
Sigma = cor2cov(rho.shrink, vars)
attr(Sigma, "alpha")<-alpha;attr(Sigma,"alpha.var")<-alpha.var
}
else if (target == "unequal"){
rho.shrink = ((1-alpha)*rho) + (alpha*diag(rep(1,ncol(x))))
vars = colVars(x)
Sigma = cor2cov(rho.shrink, vars)
attr(Sigma, "alpha")<-alpha;attr(Sigma,"alpha.var")<-alpha.var
}
else if (target == "pooled"){
rho.shrink <- ((1-alpha)*rho) + (alpha*diag(rep(1,ncol(x))))
gv = gmean(colVars(x))
vars = sapply(colVars(x), function(i) ((1-alpha.var)*i)+(alpha.var*gv))
Sigma = cor2cov(rho.shrink, vars)
attr(Sigma, "alpha")<-alpha;attr(Sigma,"alpha.var")<-alpha.var
}
else if (target == "adaptive"){
if (!is.null(w)){
x <- diag((w * nrow(x))^0.5) %*% x
Sigma = cov.nlshrink(x, ...)
}
else{
Sigma = cov.nlshrink(x, ...)
}
attr(Sigma, "alpha") <- NA;attr(Sigma, "alpha.var") <- NA
}
return(Sigma)
}
cov.nlshrink <- function(X, k=0) {
n <- nrow(X); p <- ncol(X)
if (k == 0){X <- X - tcrossprod(rep(1,n), colMeans(X));k = 1}
if (n > k){effn <- n - k}else{k <- n - 1;effn <- 1}
Sigma <- crossprod(X)/effn;eig<-eigen(Sigma);Gamma<-eig$vectors;lambda<-as.vector(eig$values)
lambdasort<-sort(lambda);lambdaord<-order(lambda);tau_est<-.estTau(X = X, k = k)
nlshrink_tau <- .nlshrink(.EstQ(tau_est, effn))
mleshrink <- Gamma %*% tcrossprod(diag(nlshrink_tau[lambdaord]), Gamma)
return(mleshrink)
}
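## .estimateAlpha: analytic estimate of the optimal off-diagonal mixing (shrinkage) parameter,
## following the formula of Schaefer & Strimmer (2005)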
.estimateAlpha <- function (x, w = NULL){
n = nrow(x)
p = ncol(x)
if (p == 1)
return(1)
if (is.null(w)){w = rep(1/nrow(x), nrow(x))}else{w = w/sum(w)}
xs = Scale(x, w, center = TRUE, scale = TRUE)
w2 = sum(w * w)
h1w2 = w2/(1 - w2)
sw = sqrt(w)
xsw = sweep(xs, MARGIN = 1, STATS = sw, FUN = "*")
xswsvd = svd(xsw)
sE2R = sum(xsw * (tcrossprod(sweep(xswsvd$u, 2, xswsvd$d^3, "*"), xswsvd$v))) - sum(colSums(xsw^2)^2)
remove(xsw, xswsvd)
xs2w = sweep(xs^2, MARGIN = 1, STATS = sw, FUN = "*")
sER2 = 2 * sum(xs2w[, (p - 1):1] * t(apply(xs2w[, p:2, drop = FALSE], 1, cumsum)))
remove(xs2w)
denominator = sE2R
numerator = sER2 - sE2R
if (denominator == 0)
alpha = 1
else alpha = max(0, min(1, numerator/denominator * h1w2))
return(alpha)
}
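## .estimateAlphaVar: analytic estimate of the optimal diagonal (variance) mixing parameter
## for the selected shrinkage target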
.estimateAlphaVar = function (x, w = NULL, target = c("unequal", "identity", "pooled")) {
target <- match.arg(target)
n = nrow(x)
p = ncol(x)
if (is.null(w)){w = rep(1/nrow(x), nrow(x))}else{w = w/sum(w)}
w2 = sum(w^2)
h1 = 1/(1 - w2)
h1w2 = w2/(1 - w2)
xc = scale(x, center = TRUE, scale = FALSE)
v = h1 * (colSums(w * xc^2))
gmean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
if (target == "pooled"){
target = gmean(v)
}
else if (target == "identity"){
target = 1
}
else if (target == "unequal"){
return(0)
}
Z = xc^2
q1 = colSums(sweep(Z, MARGIN = 1, STATS = w, FUN = "*"))
q2 = colSums(sweep(Z^2, MARGIN = 1, STATS = w, FUN = "*")) - q1^2
numerator = sum(q2)
denominator = sum((q1 - target/h1)^2)
if (denominator == 0) alpha.var = 1
else alpha.var = max(0, min(1, numerator/denominator * h1w2))
return(alpha.var)
}
.EstQ <- function(tau, n){
# first check inputs
if (as.integer(n) <= 0) stop("n must be positive integer")
# these parameters can be changed if needed
left_tol <- 1e-4
tol <- 1e-4
if (is.unsorted(tau)) tau_ <- sort(tau) else tau_ <- tau
tau_[tau_ < 0 & abs(tau_) < left_tol] = 0
tau_[tau_ > 0 & abs(tau_) < tol] = 0
if (any (tau_ < 0)) stop("all eigenvalues must be non-negative")
if (all(tau_ == 0)) stop("at least one eigenvalue must be non-zero")
# set up environment
Q <- new.env()
Q$n <- as.integer(n)
if (Q$n != n) print ("Note: n coerced to integer")
Q$tau <- tau_
Q$p <- length(Q$tau)
Q$c <- Q$p / Q$n
tau_nonzero <- Q$tau[Q$tau > 0]
Q$t <- unique(tau_nonzero)
Q$K <- length(Q$t)
Q$pw <- vapply(Q$t, function(x) length(which(tau_nonzero == x)), integer(1))
Q$pzw <- Q$p - sum(Q$pw)
Q$pwt2 <- Q$pw * Q$t^2
sup_fn <- function(Q){
x_fn <- function(k){(Q$t[k]*Q$t[k+1])^(2/3) * ( (Q$pw[k]*Q$t[k+1])^(1/3) + (Q$pw[k+1]*Q$t[k])^(1/3) ) / ( Q$pwt2[k]^(1/3) + Q$pwt2[k+1]^(1/3) )}
p_theta_fn <- function(u, k) Q$pwt2[k]/(Q$t[k] - u)^2 + Q$pwt2[k+1]/(Q$t[k+1] - u)^2
p_phi_L_fn <- function(u, k) if (k == 1) 0 else {sum(Q$pwt2[1:(k-1)]/(Q$t[1:(k-1)] - u)^2)}
p_phi_R_fn <- function(u, k) if (k == (Q$K-1)) 0 else {sum(Q$pwt2[(k+2):Q$K]/(Q$t[(k+2):Q$K] - u)^2 )}
spec_sep <- function() {
if (Q$K == 1) return (NULL)
vapply(1:(Q$K-1), function(k)
p_theta_fn(x_fn(k),k) + p_phi_L_fn(Q$t[k+1], k) + p_phi_R_fn(Q$t[k], k) < Q$n, logical(1) )
}
p_phi_diff_fn <- function(u) 2*sum(Q$pwt2/(Q$t - u)^3)
p_phi_fn <- function(u) sum(Q$pwt2/(Q$t - u)^2)
spec_sep2 <- function(){
if (Q$K == 1) return (NULL)
spec_sep_out <- spec_sep();out <- c()
for (k in which(spec_sep_out)){
x_k <- x_fn(k);p_phi_diff_x_k <- p_phi_diff_fn(x_k)
if (p_phi_diff_x_k == 0) out <- c(out, setNames(x_k,k) )
else if (p_phi_diff_x_k > 0) {
x_root <- optimize(f = function(x) (p_phi_diff_fn(x))^2, interval=c(Q$t[k], x_k), tol=1e-4)$minimum
if (p_phi_fn(x_root) < Q$n) out <- c(out, setNames(x_root,k))
}
else {
x_root <- optimize(f = function(x) (p_phi_diff_fn(x))^2, interval=c(x_k, Q$t[k+1]), tol=1e-4)$minimum
if(p_phi_fn(x_root) < Q$n) out <- c(out, setNames(x_root,k))
}
}
return ( out )
}
spec_boundint <- function(){
spec_sep_out <- spec_sep2()
if (is.null(spec_sep_out)) return (NULL)
K_subset <- as.integer(names(spec_sep_out))
out <- do.call(rbind, lapply(K_subset, function(k){
x_mid <- spec_sep_out[as.character(k)]
spec_k_lb <- optimize(f = function(x) (p_phi_fn(x) - Q$n)^2, interval=c(Q$t[k],x_mid), tol=1e-4)$minimum
spec_k_ub <- optimize(f = function(x) (p_phi_fn(x) - Q$n)^2, interval=c(x_mid,Q$t[k+1]), tol=1e-4)$minimum
pw1 <- Q$pzw + sum(Q$pw[1:k]);pw2 <- Q$p - pw1
return(c(spec_k_lb, spec_k_ub, pw1, pw2))
}))
rownames(out) <- K_subset
return (out)
}
boundint <- spec_boundint()
spec_supp <- function(){
t1 <- min(Q$t)
x_lb1 <- t1 - sqrt(1/Q$n * sum(Q$pwt2) ) - 1
x_t1 <- optimize(f = function(x) (p_phi_fn(x) - Q$n)^2, interval=c(x_lb1, t1), tol=1e-4)$minimum
tK <- max(Q$t)
x_ubK <- tK + sqrt(1/Q$n * sum(Q$pwt2) ) + 1
x_tK <- optimize(f = function(x) (p_phi_fn(x) - Q$n)^2, interval=c(tK, x_ubK), tol=1e-4)$minimum
if (is.null(boundint)) {
supp_intervals <- matrix(c(x_t1,x_tK), ncol=2)
omega <- Q$p
}
else {
supp_intervals <- matrix(c(x_t1, t(boundint[,1:2]), x_tK), byrow=TRUE, ncol=2)
omega <- vapply(1:nrow(supp_intervals), function(i)
sum(Q$pw[Q$t > supp_intervals[i,1] & Q$t < supp_intervals[i,2]] ), integer(1) )
}
rownames(supp_intervals) <- omega
return(supp_intervals)
}
Q$u_Fbar <- spec_supp()
Q$numint <- nrow(Q$u_Fbar)
m_LF_u_Fbar_fn <- function(u) 1/Q$p * sum(Q$pw*Q$t/(Q$t - u) )
Q$m_LF_u_Fbar <- apply(Q$u_Fbar, c(1,2), m_LF_u_Fbar_fn)
endpoint_fn <- function(u, m_LF_u) max(0, u - Q$c*u*m_LF_u)
Q$endpoints <- matrix(mapply(endpoint_fn, Q$u_Fbar, Q$m_LF_u_Fbar), nrow=Q$numint)
Q$endpoints[1,1] <- ifelse(Q$p==Q$n, 0, Q$endpoints[1,1] )
numeig_fn <- function(){
if (Q$numint == 1) out <- Q$p - max(Q$pzw, Q$p - Q$n)
else {
out <- rep(0, Q$numint)
out[1] <- boundint[1,3] - max(Q$pzw, Q$p - Q$n)
if (Q$numint > 2) out[2:(Q$numint-1)] <- diff(boundint[1:(Q$numint - 1),3])
out[Q$numint] <- boundint[(Q$numint-1),4]
}
return (as.integer(out) )
}
Q$numeig <- numeig_fn()
}
grid_fn <- function(Q){
minnxi = 100
nxi0 = max(minnxi,min(sum(Q$pw),Q$n))
Q$mult = ceiling(nxi0 / min(sum(Q$pw),Q$n) )
Q$nxi = Q$mult*min(sum(Q$pw),Q$n)
Q$xi_list <- lapply(1:Q$numint, function(i) {
lb <- Q$u_Fbar[i,1]; ub <- Q$u_Fbar[i,2]
temp <- Q$numeig[i]*Q$mult
return ( lb + (ub-lb)*(sin( pi/(2*(temp + 1)) * (1:temp) ) )^2 )
})
}
sol_fn <- function(Q){
Gamma_fn <- function(y, xi)
sum(Q$tau^2/( (Q$tau - xi)^2 + y^2 ) ) - Q$n
Q$zxi_list <- lapply(1:length(Q$xi_list), function(i) {
vapply(1:(length(Q$xi_list[[i]]) ), function(j) {
xi <- Q$xi_list[[i]][j]
delta <- min( (Q$t - xi)^2 )
y_ub <- sqrt(max(1/Q$n * sum(Q$pwt2) - delta, 0) ) + 1
root <- optimize(f = function(y) (Gamma_fn(y, xi))^2, interval=c(0,y_ub), tol=1e-4)$minimum
return (complex(real = xi, imaginary = root))}, complex(1))
})
}
den_fn <- function(Q){
Q$a = 4
m_LF_fn <- Vectorize ( function(z) 1/Q$p * sum(Q$tau/(Q$tau - z) ), vectorize.args = "z" )
Q$m_LF_zxi_list <- lapply(1:length(Q$zxi_list), function(i) m_LF_fn(Q$zxi_list[[i]]) )
Q$x_list <- lapply(1:length(Q$zxi_list), function(i) Re(Q$zxi_list[[i]] * (1 - Q$c*Q$m_LF_zxi_list[[i]]) ) )
Q$f_list <- lapply(1:length(Q$zxi_list), function(i) 1/(pi*Q$c) * Im(Q$zxi_list[[i]]) / (abs(Q$zxi_list[[i]])^2) )
Q$zeta_list <- lapply(1:length(Q$x_list), function(i) Q$x_list[[i]]^(1/Q$a) )
Q$g_list <- lapply(1:length(Q$zeta_list), function(i) Q$a*Q$zeta_list[[i]]^(Q$a-1) * Q$f_list[[i]])
}
dis_fn <- function(Q){
Q$dis_x_list <- lapply(1:Q$numint, function(i) c(Q$endpoints[i,1], Q$x_list[[i]], Q$endpoints[i,2]))
Q$dis_zeta_list <- lapply(1:Q$numint, function(i) c(Q$endpoints[i,1]^(1/Q$a), Q$zeta_list[[i]], Q$endpoints[i,2]^(1/Q$a) ) )
Q$dis_m_LF_list <- lapply(1:Q$numint, function(i) c(Q$m_LF_u_Fbar[i,1], Q$m_LF_zxi_list[[i]], Q$m_LF_u_Fbar[i,2]))
Q$dis_g_list <- lapply(1:Q$numint, function(i) c(0, Q$g_list[[i]], 0) )
Q$F0 <- 1/Q$p * max(Q$pzw, (Q$p - Q$n) )
Q$Fstart <- Q$F0 + c(0, cumsum(Q$numeig[-Q$numint]) ) / Q$p
Q$Fend <- Q$F0 + cumsum(Q$numeig) / Q$p
Q$intlength <- sapply(1:Q$numint, function(i) length(Q$dis_x_list[[i]]))
Q$dis_G_raw <- lapply(1:Q$numint, function(i)
c(0, cumsum(diff(Q$dis_zeta_list[[i]]) * (Q$dis_g_list[[i]][-1] + Q$dis_g_list[[i]][-Q$intlength[i]]) * 0.5 ) ) )
Q$dis_G_list <- lapply(1:Q$numint, function(i) Q$Fstart[i] +(Q$Fend[i] - Q$Fstart[i]) * Q$dis_G_raw[[i]] / Q$dis_G_raw[[i]][Q$intlength[i]] )
}
lambda_fn <- function(Q){
Q$F <- lapply(1:Q$numint, function(i) unique(Q$dis_G_list[[i]]))
Q$F_idx <- lapply(1:Q$numint, function(i) which(!duplicated(Q$dis_G_list[[i]] ) ) )
Q$nidx <- sapply(1:Q$numint, function(i) length(Q$F_idx[[i]]))
Q$x_F <- lapply(1:Q$numint, function(i) (Q$dis_zeta_list[[i]][Q$F_idx[[i]]])^(Q$a) )
Q$x_F_mean <- lapply(1:Q$numint, function(i) 0.5*(Q$x_F[[i]][-1] + Q$x_F[[i]][-length(Q$x_F[[i]])] ) )
Q$x_F_diff <- lapply(1:Q$numint, function(i) diff(Q$x_F[[i]]))
Q$F_diff <- lapply(1:Q$numint, function(i) diff(Q$F[[i]]))
Q$nquant <- Q$numeig + 1
Q$quant <- lapply(1:Q$numint, function(i) seq(Q$F[[i]][1], Q$F[[i]][Q$nidx[i]],length.out = Q$nquant[i]))
Q$bins <- lapply(1:Q$numint, function(i) findInterval(Q$quant[[i]], Q$F[[i]], rightmost.closed=TRUE))
if (any(sapply(1:length(Q$bins), function(i) c(Q$bins[[i]][1], Q$bins[[i]][Q$nquant[i]] ) ) != rbind(rep(1,Q$numint), Q$nidx-1) ) ) stop("Unexpected bin allocation")
Q$integral_indic <- lapply(1:Q$numint, function(i)1*( tcrossprod(rep(1,Q$nquant[[i]]), Q$F_idx[[i]][-1]) <= tcrossprod(Q$bins[[i]], rep(1, (Q$nidx[i]-1) ) ) ) )
Q$lambda <- rep(0, Q$p)
X_integral <- lapply(1:Q$numint, function(i) Q$F_diff[[i]] * Q$x_F_mean[[i]] )
integral_j_kappa <- lapply(1:Q$numint, function(i) (Q$quant[[i]] - Q$F[[i]][Q$bins[[i]]])* ( Q$x_F[[i]][Q$bins[[i]]] +
0.5* (Q$quant[[i]] - Q$F[[i]][Q$bins[[i]]]) * Q$x_F_diff[[i]][Q$bins[[i]]] / Q$F_diff[[i]][Q$bins[[i]]]) )
X_kappa_integral <- lapply(1:Q$numint, function(i)
rowSums ( tcrossprod(rep(1,Q$nquant[[i]]), X_integral[[i]]) * Q$integral_indic[[i]] ) + integral_j_kappa[[i]] )
for (i in 1:Q$numint)
Q$lambda[round(Q$F[[i]][1] * Q$p + 1):round(Q$F[[i]][Q$nidx[i]] * Q$p)] <- diff(X_kappa_integral[[i]]) * Q$p
}
sup_fn(Q)
grid_fn(Q)
sol_fn(Q)
den_fn(Q)
dis_fn(Q)
lambda_fn(Q)
return(Q)
}
.nlshrink <- function(Q) {
delta <- rep(0,Q$p)
if ((Q$p - Q$n) > Q$pzw) {
lb <- min(Q$tau) - 1/ Q$n * sum(Q$tau) - 1
ub <- 1/Q$p * (Q$p - Q$pw[1])*Q$t[1]
optim_fn <- function(u) (sum(Q$pw*Q$t / (Q$t - u)) - Q$n)^2
u_Fbar0 <- optimize(f = optim_fn, interval = c(lb,ub))$minimum
null_adj <- u_Fbar0 / (1 - Q$c)
delta[1:(Q$p - Q$n)] <- null_adj
}
lim0 <- ifelse(Q$p == Q$n & all(Q$tau > 0), Q$n / sum(Q$pw / Q$t), 0)
out <- lapply(1:Q$numint, function(i) {
y <- rep(0, Q$nidx[i])
Dr <- abs(1 - Q$c*Q$dis_m_LF_list[[i]][Q$F_idx[[i]]])^2
if (Q$p == Q$n) {
j = (Q$x_F[[i]] == 0 & Dr == 0)
y[j] <- lim0
y[!j] <- Q$x_F[[i]][!j] / Dr[!j]
} else {
y <- Q$x_F[[i]] / Dr
}
F_temp <- Q$F[[i]]
b <- Q$bins[[i]]
F_temp_b <- F_temp[b]
integral_ydF <- diff(F_temp) * 0.5 * (y[-1] + y [-Q$nidx[i]])
integral_ydF2 <- (Q$quant[[i]] - F_temp_b) * (y[b] + 0.5 * (Q$quant[[i]] - F_temp_b) * (y[b + 1] - y[b]) / (F_temp[b+1] - F_temp_b) )
integral_ydF3 <- rowSums(tcrossprod(rep(1,Q$nquant[i]), integral_ydF) * Q$integral_indic[[i]]) + integral_ydF2
return (diff(integral_ydF3) * Q$p)
})
for (i in 1:Q$numint)
delta[round(Q$F[[i]][1]*Q$p + 1):round(Q$F[[i]][Q$nidx[i]] * Q$p)] <- out[[i]]
if (Q$p == Q$n)
delta[delta < lim0] <- lim0
return (delta)
}
.getLambdaJ <- function(Q){
rep1p <- rep(1,Q$p)
sup_J_fn <- function(Q){
u_vec <- c(t(Q$u_Fbar))
Dr <- apply(Q$u_Fbar, c(1,2), function(u) sum(Q$tau^2 / (Q$tau-u)^3 ) )
temp <- (array(1,dim=dim(Q$u_Fbar)) %o% Q$tau) - (Q$u_Fbar %o% rep1p)
Q$sup_J <- (Q$u_Fbar %o% Q$tau) / (temp^3 * (Dr %o% rep1p) )
Q$endpoints_J <- 1/Q$n * (Q$u_Fbar^2 %o% rep(1,Q$p)) / temp^2
if (Q$p == Q$n) Q$endpoints_J[1,1,] <- 0
}
sup_J_fn(Q)
Q$xi_J_list <- lapply(1:Q$numint, function(i) {
length_temp <- length(Q$xi_list[[i]])
temp <- (sin( pi/(2*(length_temp + 1)) * (1:length_temp) ) )^2
return( tcrossprod((1 - temp), Q$sup_J[i,1,]) + tcrossprod(temp, Q$sup_J[i,2,]) )
})
den_J_fn <- function(Q){
TAU_list <- lapply(1:Q$numint, function(i) tcrossprod(rep(1,length(Q$xi_list[[i]])), Q$tau) )
TAU2_list <- lapply(1:Q$numint, function(i) tcrossprod(rep(1,length(Q$xi_list[[i]])), Q$tau^2) )
ZXI_list <- lapply(1:Q$numint, function(i) tcrossprod(Q$zxi_list[[i]], rep1p ) )
XI_list <- lapply(1:Q$numint, function(i) tcrossprod(Re(Q$zxi_list[[i]]), rep1p ) )
YXI_list <- lapply(1:Q$numint, function(i) tcrossprod(Im(Q$zxi_list[[i]]), rep1p ) )
YXI2_list <- lapply(1:Q$numint, function(i) tcrossprod(Im(Q$zxi_list[[i]])^2, rep1p ) )
TAU_XI_list <- lapply(1:Q$numint, function(i) TAU_list[[i]] - XI_list[[i]])
#Jacobian of yxi := imag(zxi)
Q$yxi_J_list <- lapply(1:Q$numint, function(k) {
NORM <- TAU_XI_list[[k]]^2 + YXI2_list[[k]]; NORM2 <- NORM*NORM
Nr <- rowSums(TAU2_list[[k]] * TAU_XI_list[[k]] / NORM2)
Dr <- rowSums(TAU2_list[[k]] * YXI_list[[k]] / NORM2)
return ( (TAU_list[[k]] / NORM - TAU2_list[[k]] * TAU_XI_list[[k]] / NORM2 + tcrossprod(Nr, rep1p) * Q$xi_J_list[[k]]) / tcrossprod(Dr, rep1p) )
})
Q$zxi_J_list <- lapply(1:Q$numint, function(k) Q$xi_J_list[[k]] + 1i*Q$yxi_J_list[[k]])
Q$f_J_list <- lapply(1:Q$numint, function(k)
1/(pi*Q$c) * Im(Q$zxi_J_list[[k]] * tcrossprod(1/(Q$zxi_list[[k]]*Q$zxi_list[[k]]), rep1p) ) )
Q$m_LF_J_list <- lapply(1:Q$numint, function(k) {
TAU_ZXI2 <- (TAU_list[[k]] - ZXI_list[[k]])^2
1/Q$p * (-ZXI_list[[k]] / TAU_ZXI2 + Q$zxi_J_list[[k]] * tcrossprod(rowSums(TAU_list[[k]] / TAU_ZXI2 ),rep1p) )
})
Q$x_J_list <- lapply(1:Q$numint, function(k)
Re ( Q$zxi_J_list[[k]] * (1 - Q$c*tcrossprod(Q$m_LF_zxi_list[[k]], rep1p) ) - Q$c * ZXI_list[[k]] * Q$m_LF_J_list[[k]] ) )
Q$zeta_J_list <- lapply(1:Q$numint, function(k)
1/Q$a * Q$x_J_list[[k]] * tcrossprod(Q$x_list[[k]]^(1/Q$a - 1), rep1p) )
Q$g_J_list <- lapply(1:Q$numint, function(k)
Q$a * ( (Q$a - 1) * Q$zeta_J_list[[k]] * tcrossprod(Q$f_list[[k]]*Q$zeta_list[[k]]^(Q$a - 2), rep1p) +
tcrossprod(Q$zeta_list[[k]]^(Q$a - 1), rep1p) * Q$f_J_list[[k]] ) )
}
den_J_fn(Q)
dis_J_fn <- function(Q){
Q$dis_g_J_list <- lapply(1:Q$numint, function(k) rbind(rep(0,Q$p), Q$g_J_list[[k]], rep(0,Q$p)))
Q$dis_zeta_J_list <- lapply(1:Q$numint, function(k)
rbind(1/Q$a * Q$endpoints[k,1]^(1/Q$a - 1) * Q$endpoints_J[k,1,],
Q$zeta_J_list[[k]],
1/Q$a * Q$endpoints[k,2]^(1/Q$a - 1) * Q$endpoints_J[k,2,]) )
if (Q$p == Q$n) Q$dis_zeta_J_list[[1]][1,] <- 0
dis_zeta_J_diff <- lapply(1:Q$numint, function(k) Q$dis_zeta_J_list[[k]][-1,] - Q$dis_zeta_J_list[[k]][-Q$intlength[k],])
dis_zeta_diff <- lapply(1:Q$numint, function(k) diff(Q$dis_zeta_list[[k]]) )
dis_g_mean <- lapply(1:Q$numint, function(k) 0.5*(Q$dis_g_list[[k]][-1] + Q$dis_g_list[[k]][-Q$intlength[k]]) )
dis_g_J_mean <- lapply(1:Q$numint, function(k) 0.5*(Q$dis_g_J_list[[k]][-1,] + Q$dis_g_J_list[[k]][-Q$intlength[k],] ) )
G_J_raw_list <- lapply(1:Q$numint, function(k) rbind(rep(0,Q$p),
apply(dis_zeta_J_diff[[k]] * tcrossprod(dis_g_mean[[k]], rep1p), 2, cumsum) +
apply(tcrossprod(dis_zeta_diff[[k]], rep1p) * dis_g_J_mean[[k]], 2, cumsum) ) )
Fends_diff <- Q$Fend - Q$Fstart
Q$dis_G_J <- lapply(1:Q$numint, function(k)
Fends_diff[k] / Q$dis_G_raw[[k]][Q$intlength[k]] * (G_J_raw_list[[k]] -
tcrossprod(Q$dis_G_raw[[k]], rep1p) * tcrossprod(rep(1,Q$intlength[k]), G_J_raw_list[[k]][Q$intlength[k],]) ) )
}
dis_J_fn(Q)
lambda_J_fn <- function(Q){
Q$lambda_J <- matrix(0, nrow=Q$p, ncol = Q$p)
if ((Q$p - Q$n) <= Q$pzw & Q$pzw > 0)
Q$lambda_J[max(1,Q$p - Q$n + 1):Q$pzw, 1:Q$pzw] <- 1 - Q$c
Q$F_J <- lapply(1:Q$numint, function(k) Q$dis_G_J[[k]][Q$F_idx[[k]],])
Q$x_J_F <- lapply(1:Q$numint, function(k)
tcrossprod((Q$a*Q$dis_zeta_list[[k]][Q$F_idx[[k]]]^(Q$a-1)), rep1p) * Q$dis_zeta_J_list[[k]][Q$F_idx[[k]],] )
F_J_diff <- lapply(1:Q$numint, function(k) Q$F_J[[k]][-1,] - Q$F_J[[k]][-Q$nidx[k],])
x_J_F_mean <- lapply(1:Q$numint, function(k) 0.5*(Q$x_J_F[[k]][-1,] + Q$x_J_F[[k]][-Q$nidx[k],]) )
x_J_F_diff <- lapply(1:Q$numint, function(k) Q$x_J_F[[k]][-1,] - Q$x_J_F[[k]][-Q$nidx[k],])
X_J_integral <- lapply(1:Q$numint, function(k)
F_J_diff[[k]] * tcrossprod(Q$x_F_mean[[k]], rep1p) + tcrossprod(Q$F_diff[[k]], rep1p) * x_J_F_mean[[k]] )
integral_j_kappa <- lapply(1:Q$numint, function(k) {
q <- Q$quant[[k]]
b <- Q$bins[[k]]
tcrossprod(q,rep1p) * Q$x_J_F[[k]][b,] - Q$x_J_F[[k]][b,] * tcrossprod(Q$F[[k]][b], rep1p) -
Q$F_J[[k]][b,] * tcrossprod(Q$x_F[[k]][b], rep1p) -
Q$F_J[[k]][b,] * tcrossprod(((q - Q$F[[k]][b])*Q$x_F_diff[[k]][b] / Q$F_diff[[k]][b]), rep1p) +
x_J_F_diff[[k]][b,] * tcrossprod( (0.5 * (q - Q$F[[k]][b])^2 / Q$F_diff[[k]][b]), rep1p) -
F_J_diff[[k]][b,] * tcrossprod( (0.5 * (q - Q$F[[k]][b])^2 * Q$x_F_diff[[k]][b] / Q$F_diff[[k]][b]^2), rep1p) })
X_J_kappa_integral <- lapply(1:Q$numint, function(i) Q$integral_indic[[i]] %*% X_J_integral[[i]] + integral_j_kappa[[i]])
for (i in 1:Q$numint)
Q$lambda_J[round(Q$F[[i]][1] * Q$p + 1):round(Q$F[[i]][Q$nidx[i]] * Q$p), ] <- apply(X_J_kappa_integral[[i]], 2, diff) * Q$p
}
lambda_J_fn(Q)
return (Q$lambda_J)
}
.ESD <- function(tau, n) {
Q <- .EstQ(tau,n)
return( setNames(Reduce(c, Q$F), Reduce(c,Q$dis_x_list)) )
}
.estTau <- function(X, k = 0, control = list(rel.tol = 1e-4)){
rho <- new.env()
rho$n <- nrow(X);rho$p <- ncol(X);rho$k <- k
if (rho$k == 0) {rho$X <- X - tcrossprod(rep(1,rho$n), colMeans(X));rho$k <- 1}
else rho$X <- X
if (rho$n > rho$k){rho$effn <- rho$n - rho$k}
else{rho$k<-rho$n-1;rho$effn<-1}
rho$S <- crossprod(rho$X)/rho$effn
rho$lambda <- sort(eigen(rho$S, only.values = TRUE)$values)
rho$lambda[rho$lambda < 0] = 0
if (rho$p > rho$effn){rho$lambda[1:(rho$p - rho$effn)] <- 0}
rho$lambda_ls <- .eigenshrink(X = rho$X, k = rho$k)
lambda_obj <- function(tau){
if(is.unsorted(tau)) {tausort <- sort(tau);tauorder <- order(tau)}
else{tausort <- tau;tauorder <- 1:rho$p}
Q <- .EstQ(tausort,rho$effn)
lambda_est <- Q$lambda
lambda_est_order <- lambda_est[tauorder]
return(mean((lambda_est_order - rho$lambda)^2))
}
lambda_gr <- function(tau){
if(is.unsorted(tau)){tausort<-sort(tau);tauorder<-order(tau)}
else{tausort<-tau;tauorder<-1:rho$p}
Q <- .EstQ(tausort,rho$effn)
lambda_est <- Q$lambda
lambda_est_order <- lambda_est[tauorder]
lambda_est_J <- .getLambdaJ(Q)
lambda_est_J_order <- lambda_est_J[tauorder,]
return (as.numeric( 2/Q$p * crossprod((lambda_est_J_order), (lambda_est_order - rho$lambda) ) ) )
}
optim_out <- nlminb(start=rho$lambda_ls,objective=lambda_obj,gradient=lambda_gr,lower=min(rho$lambda),upper=max(rho$lambda),control=control)
return(optim_out$par)
}
.eigenshrink <- function(X, k = 0){
n <- nrow(X);p <- ncol(X)
if (k == 0){X <- X - tcrossprod(rep(1,n), colMeans(X));k = 1}
if (n > k) {effn <- n-k} else stop("k must be strictly less than nrow(X)")
S <- crossprod(X)/effn
lambda <- sort(eigen(S,only.values = TRUE)$values)
lambda[lambda < 0] <- 0
if (p > effn) lambda [1:(p-effn)] <- 0
Z <- X*X
phi <- sum(crossprod(Z) / effn - 2*crossprod(X)* S / effn + S*S)
gamma <- sum((S-mean(diag(S))*diag(p))^2)
alpha<-max(0, min(1,phi/gamma/effn))
gmean=function(x,na.rm=TRUE){exp(sum(log(x[x>0]),na.rm=na.rm)/length(x))}
lambda_mean<-gmean(lambda)
return(lambda_mean+sqrt(1-alpha)*(lambda-lambda_mean))
}
#' print method for covRobust objects
#'
#' @param fit the object
#' @param digits the number of decimal places to display. defaults to 5. changes to 3 if the number of variables is >= 10.
#' @method print covRobust
#' @export
#'
#'
print.covRobust <- function(fit, digits = 5){
if (ncol(fit$cov)>=10) digits <-3
center <- format(round(fit$center, digits), nsmall = digits)
cov <- format(round(fit$cov, digits), nsmall = digits)
brown <- crayon::make_style(rgb(0.766, 0.204, 0.237))
green2 <- crayon::make_style(rgb(0.435, 0.653, 0.218))
cat(brown("Covariance:\n"))
print(noquote(cov), right=T)
cat(green2("\nLocation:\n"))
print(noquote(center), right=T)
if (!is.null(fit$com)){
brown2 <- crayon::make_style(rgb(0.8166, 0.104, 0.237))
green3 <- crayon::make_style(rgb(0.502, 0.62, 0.278))
cov2 <- format(round(fit$com, digits), nsmall = digits)
center2 <- format(round(fit$medians, digits), nsmall = digits)
cat(brown2("\nL1 Scatter:\n"))
print(noquote(cov2), right=T)
cat(green3("\nL1 Location:\n"))
print(noquote(center2), right=T)
}
}
#' plot method for covRobust objects
#'
#' @param fit the object
#' @method plot covRobust
#' @export
#'
#'
plot.covRobust <- function(fit){
dist <- fit$dist
index <- 1:length(dist)
outs <- fit$outliers
plot(1, type = "n", xlab="Index",
ylab= "Mahalanobis Distance",
main="Outlier Plot",
xlim=c(1,max(index)), ylim=c(max(0,min(dist)-2), max(dist))
)
if (length(outs) > 0){
thresh <- min(dist[outs])
points(index[-outs], dist[-outs], col="#0644ccCC", bg = "#486bb82F", pch = 21)
abline(a = thresh, b = 0, col = "#a30000BF", lwd = 2)
points(index[outs], dist[outs], col="#f3054eCC", bg = "#f22c852F", pch = 21)
} else {
points(index, dist, col="#0644ccCC", bg = "#486bb82F", pch = 21)
}
}
#' Co-Median Robust Covariance Matrix
#'
#' @description The co-median matrix is an alternative to the covariance matrix. To understand
#' how it works, first consider the definition of the median absolute deviation, \eqn{MAD(x) = med(|x - med(x)|)}.
#' The MAD is usually scaled by a factor of 1.4826 to make it usable as a consistent robust estimator of the
#' standard deviation. As an option, the standard estimate of the median may be replaced with
#' the Harrell-Davis estimator of the median, which can improve accuracy in smaller samples (Harrell & Davis, 1982).
#' \cr
#' \cr
#' The co-median is defined by \eqn{com(x,y) = med((x-med(x)) * (y-med(y)))}, and the standardized form analogous
#' to the correlation coefficient is \eqn{ \delta = com(x,y)/(MAD(x) * MAD(y))}. Unlike the correlation coefficient, \eqn{\delta}
#' is not guaranteed to lie within the interval [-1, 1]; however, it
#' typically only deviates from this interval for non-normally distributed random variables and is a
#' smooth function of the correlation coefficient (Falk, 1997; Falk, 1998). \cr
#' \cr
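#' A minimal illustrative sketch of these two quantities for a pair of numeric vectors, using base R only
#' (\code{constant = 1} gives the raw, unscaled MAD):
#' \preformatted{
#' comedian <- function(x, y) median((x - median(x)) * (y - median(y)))
#' delta <- function(x, y) comedian(x, y) / (mad(x, constant = 1) * mad(y, constant = 1))
#' }
#' \cr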
#' A disadvantage of the median absolute deviation is that it can collapse to zero when more than half of the values
#' in a vector are identical. When a column with a MAD of zero is detected, the function returns a warning
#' suggesting the use of method="aad" instead.
#' Another disadvantage of the co-median matrix is that it is not guaranteed to be positive-semidefinite,
#' even when n > p. To get around this problem this function implements an iterative algorithm proposed by
#' Sajesh and Srinivasan (2012), described below.
#'
#' 1. Let \eqn{\delta}(X) be the co-median correlation matrix of X. Compute the eigenvalues and eigenvectors
#' of \eqn{\delta}(X), and let \strong{E} denote the eigenvectors, and \strong{\eqn{\Lambda}} the diagonal matrix of
#' eigenvalues. \cr
#'
#' 2. Let \strong{Q} = \strong{D}\strong{E}, where \strong{D} is a diagonal matrix of MADs, and let
#' \strong{invQ} be the transpose of the inverse of \strong{Q} (i.e., \eqn{D^{-1}E}). Scores are then obtained as \strong{Z} = \strong{XinvQ},
#' whose squared-MADs are stored in a diagonal matrix, \strong{\eqn{\Gamma}}. Furthermore, denote the
#' vector of column medians of \strong{Z} as \eqn{\gamma}. \cr
#'
#' 3. The resulting robust estimates for location and scatter are then respectively defined as
#' \strong{\eqn{\Omega}} = \strong{Q}\strong{\eqn{\Gamma}}\strong{Q'} and \eqn{\mu} = \strong{Q}\eqn{\gamma}. \cr \cr
#'
#' 4. Optional Step: Reiterate the above steps one or two times, but substituting \strong{\eqn{\Omega}} for
#' \eqn{\delta} and \strong{\eqn{\Gamma}} for the sample MADs in \strong{D} in the re-iterated steps.
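#'
#' A minimal sketch of steps 1 through 3, in base R, is given below. Here \code{mad} and \code{median} stand in
#' for whichever location/scale estimators are selected via the \code{method} argument, and \code{comedian} is the
#' pairwise helper sketched above (the internal implementation additionally guards against non-positive-definite
#' intermediate matrices and supports the alternative estimators):
#' \preformatted{
#' comstep <- function(X) {
#'   p <- ncol(X)
#'   madx <- apply(X, 2, mad)
#'   com <- outer(1:p, 1:p, Vectorize(function(i, j) comedian(X[, i], X[, j])))
#'   delta <- com / outer(madx, madx)               # step 1: co-median correlation matrix
#'   E <- eigen(delta, symmetric = TRUE)$vectors
#'   Q <- madx * E                                  # step 2: Q = D E
#'   Z <- sweep(X, 2, madx, "/") \%*\% E             # scores Z = X (Q^-1)'
#'   Gamma <- diag(apply(Z, 2, mad)^2)
#'   list(Omega = Q \%*\% Gamma \%*\% t(Q),           # step 3: robust scatter
#'        mu = drop(Q \%*\% apply(Z, 2, median)))    # step 3: robust location
#' }
#' }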
#'
#' @param x a data frame or matrix containing numeric variables
#' @param method one of "med", "hd", or "aad". "med" uses the typical median and MAD. "hd" uses the Harrell-Davis
#' estimate of the median in place of the median, and "aad" uses the average absolute deviation in lieu of the
#' median absolute deviation. if option "aad" is used the appropriate consistency constant, sqrt(pi/2), is used
#' instead of 1.4826. the only time "aad" is preferable is when there are columns in the data with a median absolute
#' deviation of zero.
#' @param iter number of refinement iterations
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after discarding outliers identified by the mahalanobis distances of the co-median matrix.
#' \item \strong{cov}: covariance matrix of cleaned data set after discarding outliers identified by the mahalanobis distances of the co-median matrix.
#' \item \strong{medians}: estimated multivariate median
#' \item \strong{com}: estimated co-median matrix
#' \item \strong{delta}: the initial raw comedian correlation matrix
#' \item \strong{dist}: the mahalanobis distances based on the co-median location and scatter estimates; these are the distances used to flag outliers.
#' \item \strong{outliers}: the indices of the outliers identified by the co-median based mahalanobis distances; these are the points removed to obtain the cleaned covariance matrix.
#' \item \strong{weights}: the weights for downweighting outliers. here they are binary, with 0 marking an outlier and 1 otherwise.
#' }
#' @export
#' @examples
#' # illustrative call on a small numeric data set
#' \donttest{fit <- cov.comed(mtcars[, c("mpg", "disp", "hp", "drat", "wt")])}
#'
#' @references
#' Falk, M. (1997) On MAD and comedians. Annals of the Institute of Statistical Mathematics 49, 615-644. \cr
#' \cr
#' Falk, M. (1998). A Note on the Comedian for Elliptical Distributions. Journal of Multivariate Analysis, 67(2), 306-317. doi:10.1006/jmva.1998.1775 \cr
#' \cr
#' Harrell, F. E. & Davis, C. E. (1982). A new distribution-free quantile estimator. Biometrika, 69, 635–640 \cr
#' \cr
#' Sajesh, T. A., & Srinivasan, M. R. (2012). Outlier detection for high dimensional data using the Comedian approach. Journal of Statistical Computation and Simulation, 82(5), 745-757. doi:10.1080/00949655.2011.552504
#'
cov.comed <- function(x, method = c("med", "hd", "aad"), iter = 1){
method <- match.arg(method)
if(method=="med"){med<-T;hd<-F;aad<-F}
else if(method=="hd"){med<-F;hd<-T;aad<-F}
else if(method=="aad"){med<-F;hd<-F;aad<-T}
coMedfun <- function (omega,z,x){
p <- ncol(x);U<-wsvd(omega)$u;madx<-colMads(x);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-colMads(z);Omega=mpd(tcrossprod(Q*rep(madz,each=p)))
return(list(Omega=Omega, z=z, Q=Q))
}
coMedHDfun <- function (omega,z,x){
hdmad <- function(i) cvreg:::HDMAD(i) * 1.4826
p <- ncol(x);U<-wsvd(omega)$u;madx<-apply(x,2,hdmad);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-apply(z,2,hdmad);Omega=tcrossprod(Q*rep(madz,each=p))
return(list(Omega=Omega, z=z, Q=Q))
}
coAADfun <- function (omega,z,x){
p <- ncol(x);U<-wsvd(omega)$u;madx<-apply(x,2,aad);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-apply(z,2,aad);Omega=tcrossprod(Q*rep(madz,each=p))
return(list(Omega=Omega, z=z, Q=Q))
}
if (med){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-colMads(x)
if (any(colmads==0)){return(cat("Warning: At least one variable has a MAD of 0. Try using method='aad' instead."))}
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coMedianMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coMedfun(omega,z,x);for (i in 1:iter){Sigma<-coMedfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% colMedians(Sigma$z)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
wtfun = function(d,p){q<-1.4826*(qchisq(alpha,p)/qchisq(0.5,p));as.numeric(d<median(d)*q)}
dist2<-mahalanobis_dist(x,medians,Omega);wt=wtfun(dist2,p);idx=which(wt==1);h<-sum(wt);
## Calculate New Covariance Matrix
center<-matrixStats::colWeightedMeans(x, wt);
newx<-sweep(x[idx,],2,center);
Sigma<-mpd(crossprod(newx)/h);
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = seq(1,nrow(x))[which(wt!=1)], weights = wt,
com = Omega, medians = medians, delta = delta),
class = "covRobust"))
}
if (hd){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-apply(x,2,cvreg:::HDMAD)*1.4826
if (any(colmads==0)){return(cat("Warning: At least one variable has a MAD of 0. Try using method='aad' instead."))}
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coHDMedianMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coMedHDfun(omega,z,x);for (i in 1:iter){Sigma<-coMedHDfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% apply(Sigma$z,2,hdmedian)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
wtfun = function(d,p){q<-1.4826*(qchisq(alpha,p)/qchisq(0.5,p));as.numeric(d<hdmedian(d)*q)}
dist2<-mahalanobis_dist(x,medians,Omega);wt=wtfun(dist2,p);idx=which(wt==1);h<-sum(wt);
## Calculate New Covariance Matrix
center<-matrixStats::colWeightedMeans(x, wt);
newx<-sweep(x[idx,],2,center);
Sigma<-mpd(crossprod(newx)/h);
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = seq(1,nrow(x))[which(wt!=1)], weights = wt,
com = Omega, medians = medians, delta = delta),
class = "covRobust"))
}
if (aad){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-apply(x,2,aad)
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coAADMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coAADfun(omega,z,x);for (i in 1:iter){Sigma<-coAADfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% apply(Sigma$z,2,hdmedian)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
wtfun = function(d,p){q<-sqrt(pi/2)*(qchisq(alpha,p)/qchisq(0.5,p));as.numeric(d<hdmedian(d)*q)}
dist2<-mahalanobis_dist(x,medians,Omega);wt=wtfun(dist2,p);idx=which(wt==1);h<-sum(wt);
## Calculate New Covariance Matrix
center<-matrixStats::colWeightedMeans(x, wt);
newx<-sweep(x[idx,],2,center);
Sigma<-mpd(crossprod(newx)/h);
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = seq(1,nrow(x))[which(wt!=1)], weights = wt,
com = Omega, medians = medians, delta = delta),
class = "covRobust"))
}
}
#' Weighted Co-Median Robust Covariance Matrix
#'
#' @description See \code{\link{cov.comed}} for details on how the co-median matrix is calculated. The difference
#' in this function is that the final covariance matrix is not based on dropping identified outliers, but rather,
#' smoothly downweighting them.
#'
#' @param x a data frame or matrix containing numeric variables
#' @param method one of "med", "hd", or "aad". "med" uses the typical median and MAD. "hd" uses the Harrell-Davis
#' estimate of the median in place of the median, and "aad" uses the average absolute deviation in lieu of the
#' median absolute deviation. if option "aad" is used the appropriate consistency constant, sqrt(pi/2), is used
#' instead of 1.4826. the only time "aad" is preferable is when there are columns in the data with a median absolute
#' deviation of zero.
#' @param iter number of refinement iterations
#' @param k a tuning constant passed to the smooth weighting function used to downweight outlying observations. defaults to 1.5.
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: weighted multivariate mean after smoothly downweighting outliers identified by the mahalanobis distances of the co-median matrix.
#' \item \strong{cov}: weighted covariance matrix after smoothly downweighting outliers identified by the mahalanobis distances of the co-median matrix.
#' \item \strong{medians}: estimated columnwise medians
#' \item \strong{com}: estimated co-median matrix
#' \item \strong{delta}: the initial raw comedian correlation matrix
#' \item \strong{dist}: the mahalanobis distances based on the co-median location and scatter estimates; these are the distances used to construct the weights.
#' \item \strong{outliers}: the indices of the points flagged as outliers by the co-median based mahalanobis distances. unlike \code{\link{cov.comed}}, these points are downweighted rather than removed.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#' @examples
#' # illustrative call on a small numeric data set
#' \donttest{fit <- cov.wcomed(mtcars[, c("mpg", "disp", "hp", "drat", "wt")])}
#'
#' @references
#' Falk, M. (1997) On MAD and comedians. Annals of the Institute of Statistical Mathematics 49, 615-644. \cr
#' \cr
#' Falk, M. (1998). A Note on the Comedian for Elliptical Distributions. Journal of Multivariate Analysis, 67(2), 306-317. doi:10.1006/jmva.1998.1775 \cr
#' \cr
#' Harrell, F. E. & Davis, C. E. (1982). A new distribution-free quantile estimator. Biometrika, 69, 635–640 \cr
#' \cr
#' Sajesh, T. A., & Srinivasan, M. R. (2012). Outlier detection for high dimensional data using the Comedian approach. Journal of Statistical Computation and Simulation, 82(5), 745-757. doi:10.1080/00949655.2011.552504
#'
cov.wcomed <- function(x, method = c("med", "hd", "aad"), iter = 2, k = 1.5){
method <- match.arg(method)
if(method=="med"){med<-T;hd<-F;aad<-F}
else if(method=="hd"){med<-F;hd<-T;aad<-F}
else if(method=="aad"){med<-F;hd<-F;aad<-T}
coMedfun <- function (omega,z,x){
p <- ncol(x);U<-wsvd(omega)$u;madx<-colMads(x);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-colMads(z);Omega=mpd(tcrossprod(Q*rep(madz,each=p)))
return(list(Omega=Omega, z=z, Q=Q))
}
coMedHDfun <- function (omega,z,x){
hdmad <- function(i) cvreg:::HDMAD(i) * 1.4826
p <- ncol(x);U<-wsvd(omega)$u;madx<-apply(x,2,hdmad);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-apply(z,2,hdmad);Omega=tcrossprod(Q*rep(madz,each=p))
return(list(Omega=Omega, z=z, Q=Q))
}
coAADfun <- function (omega,z,x){
p <- ncol(x);U<-wsvd(omega)$u;madx<-apply(x,2,aad);invmad<-1/madx
Q <-madx*U;iQ<-invmad*U;z<-x%*%iQ;madz<-apply(z,2,aad);Omega=tcrossprod(Q*rep(madz,each=p))
return(list(Omega=Omega, z=z, Q=Q))
}
if (med){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-colMads(x)
if (any(colmads==0)){return(cat("Warning: At least one variable has a MAD of 0. Try using method='aad' instead."))}
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coMedianMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coMedfun(omega,z,x);for (i in 1:iter){Sigma<-coMedfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% colMedians(Sigma$z)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
cv <- 1.4826*(qchisq(alpha,p)/qchisq(0.5,p));
dist2<-mahalanobis_dist(x,medians,Omega);
wt = cvreg:::mvSmoothWt(dist2/median(dist2), cv, k); h<-sum(wt)
outliers <- seq(1,nrow(x),by=1)[which(as.numeric(dist2>median(dist2)*cv)==1)]
## Calculate New Covariance Matrix
wcov <- cov.wt(x, wt, method = "ML")
center <- wcov$center
Sigma <- wcov$cov
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = outliers, weights = wt,
com = Omega, medians = medians, delta = delta),
class = "covRobust"))
}
if (hd){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-apply(x,2,cvreg:::HDMAD)*1.4826
if (any(colmads==0)){return(cat("Warning: At least one variable has a MAD of 0. Try using method='aad' instead."))}
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coHDMedianMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coMedHDfun(omega,z,x);for (i in 1:iter){Sigma<-coMedHDfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% apply(Sigma$z,2,hdmedian)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
cv <- 1.4826*(qchisq(alpha,p)/qchisq(0.5,p));
dist2<-mahalanobis_dist(x,medians,Omega);
wt = cvreg:::mvSmoothWt(dist2/hdmedian(dist2), cv, k); h<-sum(wt)
outliers <- seq(1,nrow(x),by=1)[which(as.numeric(dist2>median(dist2)*cv)==1)]
## Calculate New Covariance Matrix
wcov <- cov.wt(x, wt, method = "ML")
center <- wcov$center
Sigma <- wcov$cov
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = outliers, weights = wt,
com = Omega, medians = medians, delta = delta),
class = "covRobust"))
}
if (aad){
x<-as.matrix(x);n<-nrow(x);p<-ncol(x);colmads<-apply(x,2,aad)
## Calculate Co-Median Matrix and Initial Covariance Estimate
imad <- 1/colmads;delta<-cvreg:::coAADMatrix(x);delta[which(is.na(as.vector(delta)))]<-0;
omega<-imad*mpd(delta)*rep(imad,each=p);U<-wsvd(omega)$u;iQ<-imad*U;
z<-x%*%iQ;Sigma<-coAADfun(omega,z,x);for (i in 1:iter){Sigma<-coAADfun(Sigma$Omega,Sigma$z,x)}
medians<-drop(Sigma$Q %*% apply(Sigma$z,2,hdmedian)); Omega<-Sigma$Omega;
## Calculate Weights Based On Initial Covariance Estimate
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)));
cv <- sqrt(pi/2)*(qchisq(alpha,p)/qchisq(0.5,p));
dist2<-mahalanobis_dist(x,medians,Omega);
wt = cvreg:::mvSmoothWt(dist2/hdmedian(dist2), cv, k); h<-sum(wt)
outliers <- seq(1,nrow(x),by=1)[which(as.numeric(dist2>median(dist2)*cv)==1)]
## Calculate New Covariance Matrix
wcov <- cov.wt(x, wt, method = "ML")
center <- wcov$center
Sigma <- wcov$cov
colnames(Omega)<-rownames(Omega)<-colnames(delta)<-rownames(delta)<-colnames(Sigma)<-rownames(Sigma)<-names(medians)<-colnames(x)
return(structure(list(center = center, cov = Sigma, dist = dist2, outliers = outliers, weights = wt, com = Omega, medians = medians),
class = "covRobust"))
}
}
#' Minimum Regularized Covariance Determinant Estimator
#'
#' @description This implements the minimum regularized covariance determinant estimator. This is similar to the implementation
#' in the rrcov package, but has a few tweaks to increase the speed of the algorithm and improve estimation, as well as being simpler to use.
#' The minimum covariance determinant estimator is a robust estimator of the covariance which finds the subset containing a fixed proportion
#' (here \code{kappa}) of the data whose covariance matrix has the smallest determinant. It was introduced initially as part of the paper on least trimmed squares, and
#' further developed into a workable algorithm within a few years (Rousseeuw, 1984; Rousseeuw & Van Driessen, 1999). Since a gigantic number of
#' combinations of data points could be chosen, some heuristics are applied to seek good candidates and reduce computation time, which also ensure
#' consistency through the deterministic selection of subsets (Hubert et al., 2012; 2018). The regularized version implemented here, from Boudt et al. (2019),
#' further regularizes the calculated robust covariance matrix in a manner similar to that seen in the \code{\link{covShrink}} function.
#' The method of calculating the regularization parameter in Boudt et al. (2019) has been replaced by the faster method used by
#' Schaefer & Strimmer (2005), which provides an analytic formula for the optimal parameter value.
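#'
#' Concretely, with \eqn{S_H} the sample covariance matrix of the selected subset, \eqn{T} the (identity) target matrix,
#' \eqn{c_\kappa} a consistency factor, and \eqn{\alpha} the regularization parameter (see the \code{alpha} argument), the
#' regularized scatter used in the C-steps and in the final estimate takes the form
#' \deqn{\hat{\Sigma} = \alpha T + (1 - \alpha) c_\kappa S_H}{Sigma.hat = alpha*T + (1 - alpha)*c.kappa*S_H}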
#'
#' \cr
#' The MRCD method requires choosing a scale estimator. Several options are offered here: \cr \cr
#' - "tau" is the tau-scale defined in Yohai and Zamar (1998). \cr
#' - "pb" is the percentage bend estimator (Shoemaker & Hettmansperger, 1982). \cr
#' - "bisq" is Tukey's bisquare estimator. \cr
#' - "huber" is Huber's psi estimator (Huber, 1964). \cr
#' - "mopt" is the modified optimal estimator. \cr
#' - "mad" is the (adjusted) median absolute deviation. \cr
#' - "Qn" and "Sn" are two alternatives to the median based measures of location and scale (Rousseeuw, Peter, & Croux, 1993). \cr
#'
#' @param x a data frame or matrix of numeric covariates
#' @param kappa the proportion of the data to use in each subset. defaults to 0.80. must be between 0.5 and 1.
#' @param alpha a custom value for the regularization parameter. suggested to leave as NULL unless numerical problems are encountered.
#' @param method a scale estimator. the tau-scale estimator is the default.
#' @param opts list of options for the various scale estimators. "b" determines the percentage bend coefficient for "pb", "eff" determines the efficiency of the "huber", "bisquare" and "mopt"
#' scale estimators.
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: the MRCD location estimate.
#' \item \strong{cov}: the regularized MRCD scatter estimate.
#' \item \strong{dist}: the mahalanobis distances of the observations based on the MRCD estimates.
#' \item \strong{outliers}: the indices of the observations not contained in the optimal subset.
#' }
#' @export
#'
#' @references
#' Boudt, K.; Rousseeuw, P.J.; Vanduffel, S. (2019) The minimum regularized covariance determinant estimator. Stat Comput . doi: 10.1007/s11222-019-09869-x \cr
#' \cr
#' Huber, P. J. (1964). Robust estimation of a location parameter. Ann. Math. Statist. 35, 73–101. \cr
#' \cr
#' Hubert, M.; Rousseeuw, P.J.; Verdonck, T. (2012) A Deterministic Algorithm for Robust Location and Scatter, Journal of Computational and Graphical Statistics, 21:3, 618-637, DOI: 10.1080/10618600.2012.672100 \cr
#' \cr
#' Hubert, M.; Debruyne, M.; Rousseeuw, P.J. (2018) Minimum covariance determinant and extensions. WIREs Comput Stat. 10. doi: 10.1002/wics.1421 \cr
#' \cr
#' Rousseeuw, P.J. (1984) Least median of squares regression. J Am Stat Assoc 79:871–880. \cr
#' \cr
#' Rousseeuw, P.J.; Van Driessen, K. (1999) A fast algorithm for the Minimum Covariance Determinant estimator. Technometrics, 41:212–223. \cr
#' \cr
#' Schaefer, J. ; K. Strimmer (2005) A shrinkage approach to large-scale covariance estimation and implications for functional genomics. Statist. Appl. Genet. Mol. Biol. 4:32. \cr
#'
cov.mrcd = function (x, kappa = 0.80, alpha = NULL, method = c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn"), opts = list("b" = 0.10, "eff" = 0.90)) {
method <- match.arg(method);scalefn <- .scfun(method, opts);mufn <- .mufun(method, opts)
x<-as.matrix(x);n=nrow(x);p=ncol(x);minscale=0.000001;varnames<-colnames(x);
if(p<=10){maxcsteps=2000}else if(p<=100){maxcsteps=1500}else if(p<=200){maxcsteps=750}else if(p<600){maxcsteps=500}else{maxcsteps=250}
hsets.init = NULL;full.h=TRUE
rpack <- function(x, h, full.h, scaled = TRUE, scalefn, mufn) {
initset <- function(X, scalefn, mufn, V, h) {
stopifnot(length(d <- dim(X)) == 2, length(h) == 1, h >= 1)
n <- d[1]
stopifnot(h <= n)
XC <- Scale2(X, mufn, NULL);lambda <- apply(XC,2,scalefn)
sqrtcov <- V%*%(lambda*t(V));sqrtinvcov<-V%*%(t(V)/lambda)
estloc <- apply(X%*%sqrtinvcov,2,mufn)%*%sqrtcov
XC <- (X-rep(estloc,each = n))%*%V
sort.list(mahalanobis_dist(XC,apply(XC,2,mufn),diag(lambda),tol=1e-100))[1:h]
}
stopifnot(length(dx <- dim(x)) == 2)
n <- dx[1];p <- dx[2]
if (!scaled) {x <- as.matrix(Scale2(x, center = mufn, scale = scalefn))}
bacon.cov <- function(x){
znorm <- sqrt(rowSums(x^2));ind5 <- order(znorm);half <- ceiling(n/2)
Hinit <- ind5[1:half];return(cov(x[Hinit, , drop = FALSE]))
}
.cov <- function(x) {
x<-na.omit(as.matrix(x));p<-ncol(x);n<-nrow(x);res<-crossprod(x)/n;return(res)
}
hsets <- matrix(integer(), h, 9); Thr <- 1/(4 * (n^0.25) * sqrt(pi * log(n)))
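## nine deterministic starting subsets, each derived from a different initial scatter estimate:
## correlations of hyperbolic-tangent, Spearman, normal-score, winsorized, uniform-score, and
## truncated normal-score transforms, plus a BACON-type covariance and a shrunken rank covariance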
hsets[, 1] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(tanh(x))),symmetric=T)$vectors, h=h)
hsets[, 2] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(x,method="s")),symmetric=T)$vectors, h=h)
hsets[, 3] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(qnorm(apply(x,2,rank)/(n+1)))),symmetric=T)$vectors, h=h)
hsets[, 4] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(cov2cor(mpd(bacon.cov(x))),symmetric=T)$vectors, h=h)
hsets[, 5] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(psych::winsor(x,trim = 0.10))),symmetric=T)$vectors, h=h)
hsets[, 6] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(qunif(apply(x,2,rank)/(n+1),-3, 3))),symmetric=T)$vectors, h=h)
hsets[, 7] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(qnorm(apply(x,2,rank)/(n+1)))),symmetric=T)$vectors, h=h)
hsets[, 8] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cov2cor(covShrink(apply(x,2,rank)))),symmetric=T)$vectors, h=h)
hsets[, 9] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(cor(qnorm(pmin(pmax(apply(x, 2, rank)/n, Thr), 1 - Thr))),symmetric=T)$vectors, h=h)
if (full.h){hsetsN <- matrix(integer(), n, 9)}
for (k in 1:9) {
xk <- x[hsets[, k], , drop = FALSE]
Eig <- eigen(crossprod(scale(xk,T,F))/nrow(xk), symmetric=T)
score <- (x - rep(colMeans(xk), each = n)) %*% Eig$vectors
values <- Eig$values
ord <- order(mahalanobis_dist(score, colMeans(score), diag(sqrt(abs(as.vector(values)))),tol=1e-100))
if (full.h){hsetsN[, k] <- ord}else{hsets[, k] <- ord[1:h]}
}
if (full.h){hsetsN}else{hsets}
}
.Inv <- function(alpha, mT, nu, mU) {
p = dim(mT)[1];pp = dim(mU);vD = sqrt(diag(mT));imD = diag(vD^(-1));R = imD %*% mT %*% imD
cr = R[2, 1];I = diag(1, p);J = matrix(1, p, p);imR = 1/(1 - cr) * (I - cr/(1 + (p - 1) * cr) * J)
imB = (alpha)^(-1) * imD %*% imR %*% imD;Temp <- pseudoinverse(diag(pp[2]) + nu * (t(mU) %*% (imB %*% mU)),tol=1e-100)
return(imB - (imB %*% mU) %*% (nu * Temp) %*% (t(mU) %*% imB))
}
.COV <- function(XX, vMu, alpha = NULL, mT, scfac, invert = FALSE) {
mE <- XX - vMu;n <- dim(mE)[2];p <- dim(mE)[1]
mS <- tcrossprod(mE)/n;rcov <- alpha * mT + (1 - alpha) * scfac * mS
if (invert) {
if (p>n||!pdcheck(mS)){nu<-(1-alpha)*scfac;mU<-mE/sqrt(n);inv_rcov<-.Inv(alpha=alpha,mT=mT,nu=nu,mU=mU)}
else {inv_rcov<-pseudoinverse(rcov,tol=1e-100)}
return(list(alpha = alpha, mT = mT, cov = mS, rcov = rcov, inv_rcov = inv_rcov))
}
else return(list(alpha = alpha, mT = mT, cov = mS, rcov = rcov))
}
.cons<-function(p,alpha){qalpha<-qchisq(alpha,p);caI<-pgamma(qalpha/2,p/2+1)/alpha;1/caI}
.cstep <- function(mX,alpha=NULL,mT=NULL,vMu=NULL,mIS = NULL,h,scfac,index = NULL,maxcsteps) {
n <- dim(mX)[2];p <- dim(mX)[1];if(is.null(index)){index<-sample(1:n,h)}
XX <- mX[, index];if(is.null(vMu)){vMu<-rowMeans(XX)}
if (is.null(mIS)){ret=.COV(XX=XX,vMu=vMu,alpha=alpha,mT=mT,scfac = scfac,invert=T);mIS = ret$inv_rcov}
vdst = diag(t(mX-vMu)%*%(mIS%*%(mX - vMu)));index = sort(sort.int(vdst, index.return = T)$ix[1:h]);iter = 0
while (iter < maxcsteps) {
XX <- mX[, index];vMu<-rowMeans(XX)
ret <- .COV(XX = XX, vMu = vMu, alpha = alpha, mT = mT, scfac = scfac, invert = T);mIS <- ret$inv_rcov
vdst <- diag(t(mX - vMu) %*% (mIS %*% (mX - vMu)));nndex <- sort(sort.int(vdst, index.return = T)$ix[1:h])
if (all(nndex == index))
break
index <- nndex;iter <- iter + 1
}
return(list(index=index,numit=iter,mu=vMu,cov=ret$rcov,icov=ret$inv_rcov,alpha=ret$alpha,mT=ret$mT,dist=vdst,scfac=scfac))
}
mX<-t(x);mindet<-0;n<-dim(mX)[2];p<-dim(mX)[1]
if (!is.null(kappa)){h <- ceiling(kappa*n)}else{kappa<-0.80;h<-ceiling(kappa*n)}
if (kappa < 0.5 | kappa > 1) stop("'kappa' must be between 0.5 and 1.0!")
obj <- function(x) {det(x)^(1/p)}
vmx <- apply(mX,1,mufn);vsd<-apply(mX,1,scalefn);vsd[vsd<minscale]<-minscale
Dx <- diag(vsd);mU <- scale(t(mX), center = vmx, scale = vsd);mX <- t(mU);mT <- diag(p)
if (is.null(hsets.init)) {
hsets.init <- rpack(x = t(mX), h=h, full.h = full.h, scaled = FALSE, scalefn = scalefn, mufn = mufn)
dh <- dim(hsets.init)
}
else {
if (is.vector(hsets.init))
hsets.init <- as.matrix(hsets.init)
dh <- dim(hsets.init)
if (dh[1] < h || dh[2] < 1)
stop("'hsets.init' must be a h' x L matrix (h' >= h) of observation indices")
if (full.h && dh[1] != n)
warning("'full.h' is true, but 'hsets.init' has less than n rows")
if (min(hsets.init) < 1 || max(hsets.init) > n)
stop("'hsets.init' must be in {1,2,...,n}; n = ",
n)
}
hsets.init <- hsets.init[1:h, ]
scfac <- .cons(p, h/n)
alpha9pack <- condnr <- c()
nsets <- ncol(hsets.init)
if (is.null(alpha)) {
for (k in 1:9) {
mXsubset <- mX[, hsets.init[, k]]
alpha9pack[k] <- .estimateAlpha(t(mXsubset))
}
cutoffalpha <- max(c(0.25, hdmedian(alpha9pack)))
alpha <- max(alpha9pack[alpha9pack <= cutoffalpha])
Vselection <- seq(1, 9)
Vselection[alpha9pack > cutoffalpha] = NA
if (sum(!is.na(Vselection)) == 0) {
stop("None of the initial subsets are positive-definite")
}
initV <- min(Vselection, na.rm = TRUE)
setsV <- Vselection[!is.na(Vselection)]
setsV <- setsV[-1]
}
else {
setsV <- 1:ncol(hsets.init)
initV <- 1
}
hset.csteps <- integer(9)
ret <- .cstep(mX = mX, alpha = alpha, mT = mT, h=h, scfac = scfac, index = hsets.init[, initV], maxcsteps = maxcsteps)
objret <- obj(ret$cov)
hindex <- ret$index
best9pack <- initV
for (k in setsV) {
tmp <- .cstep(mX = mX, alpha = alpha, mT = mT, h=h, scfac = scfac, index = hsets.init[, k], maxcsteps = maxcsteps)
objtmp <- obj(tmp$cov)
hset.csteps[k] <- tmp$numit
if (objtmp < objret) {
ret <- tmp;objret <- objtmp;hindex <- tmp$index;best9pack <- k
}
else if (objtmp == objret)
best9pack <- c(best9pack, k)
}
c_kappa <- ret$scfac
mE <- mX[, hindex] - ret$mu
weightedScov <- tcrossprod(mE)/(h-1)
D <- c_kappa * diag(1,p)
muvec = apply(mX[, hindex], 1, mean)
Sigma = alpha * mT + (1-alpha) * c_kappa * weightedScov
if (p > n) {
nu <- (1-alpha) * c_kappa; mU <- mE/sqrt(h - 1)
Tau <- .Inv(alpha = alpha, mT = mT, nu = nu, mU = mU)
Sigma <- pseudoinverse(Tau, tol = 1e-100)
}
else{
Tau <- pseudoinverse(Sigma,tol = 1e-100)
}
mX <- t(crossprod(mX, Dx)) + vmx
muvec <- as.vector(Dx %*% muvec + vmx)
Sigma <- Dx %*% (Sigma %*% Dx)
colnames(Sigma) <- rownames(Sigma) <- names(muvec) <- varnames
return(structure(list(center = muvec, cov = Sigma,
dist = mahalanobis_dist(x, center=muvec, cov=Sigma, tol = 1e-100),
outliers = seq(1,nrow(x))[-hindex]), class = "covRobust"))
}
#' Orthogonalized Gnanadesikan-Kettenring (OGK) Robust Regularized Covariance Estimator
#'
#' @description Computes a robust covariance and location estimate using the pairwise algorithm
#' proposed by Maronna and Zamar (2002), which solves the problem concerning the lack of affine
#' equivariance in the method proposed by Gnanadesikan and Kettenring (1972). Furthermore, the OGK
#' estimator is guaranteed to return a positive-definite covariance matrix when n > p. In the
#' implementation in this package, when n < p, mild shrinkage is applied to induce a
#' positive-definite matrix. \cr
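#'
#' The pairwise building block, following Gnanadesikan and Kettenring, expresses a covariance through a robust
#' scale estimate \eqn{\sigma(\cdot)}:
#' \deqn{cov_{GK}(x, y) = ( \sigma(x + y)^2 - \sigma(x - y)^2 ) / 4}{cov_GK(x, y) = (sigma(x + y)^2 - sigma(x - y)^2) / 4}
#' after which the resulting pairwise scatter matrix is orthogonalized as in Maronna and Zamar (2002).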
#'
#' \cr
#' Several options to use as scale and location estimators are offered here: \cr \cr
#' - "tau" is the tau-scale defined in Yohai and Zamar (1998). \cr
#' - "pb" is the percentage bend estimator (Shoemaker & Hettmansperger, 1982). \cr
#' - "bisq" is Tukey's bisquare estimator. \cr
#' - "huber" is Huber's psi estimator (Huber, 1964). \cr
#' - "mopt" is the modified optimal estimator. \cr
#' - "Qn" and "Sn" are two alternatives to the median based measures of location and scale (Rousseeuw, Peter, & Croux, 1993). \cr
#'
#' @param x a data frame or matrix of numeric covariates
#' @param method "tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn"
#' @param opts list of options for the various scale estimators. "b" determines the percentage bend coefficient for "pb", "eff"
#' determines the efficiency of the "huber", "bisquare" and "mopt" scale estimators.
#' @param iter the number of refinement steps. defaults to 2. can be 0 if the raw ogk estimator is desired.
#'
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after removing outliers.
#' \item \strong{cov}: covariance matrix of cleaned data set after removing outliers.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#'
#' @references
#' Huber, P. J. (1964). Robust estimation of a location parameter. Ann. Math. Statist. 35, 73–101. \cr \cr
#' Shoemaker, L. H., & Hettmansperger, T. P. (1982). Robust estimates and tests for the one- and two-sample scale models. Biometrika, 69:47–54 \cr \cr
#' Rousseeuw, Peter J.; Croux, Christophe (1993). Alternatives to the Median Absolute Deviation. Journal of the American Statistical Association, 88(424): 1273–1283, doi:10.2307/2291267 \cr \cr
#' Yohai, V.J. and Zamar, R.H. (1998). High breakdown point estimates of regression by means of the minimization of an efficient scale. Journal of the American Statistical Association, 86:403–413.
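#'
#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package and its suggested dependencies
#' ## are installed); simulated data with a few planted outliers.
#' set.seed(101)
#' x <- matrix(rnorm(150 * 4), 150, 4)
#' x[1:6, ] <- x[1:6, ] + 7
#' fit <- cov.ogk(x, method = "tau")
#' fit$outliers
#' }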
cov.ogk <- function(x, method = c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn"), iter = 2, opts = list("b" = 0.10, "eff" = 0.90)){
x <- as.matrix(x);n <- nrow(x); p <- ncol(x)
method <- match.arg(method)
mufun <- .mufun(method, opts); scfun <- .scfun(method, opts)
if (is.null(opts$b) || opts$b >= 0.50) opts$b <- 0.49
vrob <- .OGKcustomvrob(mufun, scfun)
covfun <- .OGKmakecovfun(mufun,scfun,vrob)
new <- first <- covfun(x)
cov <- first$cov
center <- as.vector(first$center)
Z <- first$Z
for (i in seq_len(iter)) {
old <- new
new <- covfun(old$Z)
cov <- mpd(old$A %*% tcrossprod(new$cov, old$A))
center <- as.vector(old$A %*% as.vector(new$center))
Z <- new$Z
}
mu <- apply(Z, 2, function(i) mufun(i))
sigma <- apply(Z, 2, function(i) scfun(i))
Z <- sweep(Z, 2, mu)
Z <- sweep(Z, 2, sigma, "/")
dist2 <- rowSums(Z^2)
cdelta <- median(dist2)/qchisq(0.5, p)
cov <- cdelta * cov
dist2<-mahalanobis_dist(x,mu,cov);
qq <- (qchisq(1 - (0.4/n^0.6), p)*median(dist2))/qchisq(0.5, p)
wt <- ifelse(dist2 < qq, 1, 0)
wcenter <- colMeans(x[which(wt==1), ])
X <- scale(x[which(wt==1), ], wcenter, F)
wcov <- mpd(crossprod(X)/nrow(X))
if (!pdcheck(wcov)){
tr1 <- sum(diag(wcov))
wcov <- covShrink(x, wt, target = "adaptive")
tr2 <- sum(diag(wcov)); adj <- tr1/tr2; wcov <- adj * wcov
}
return(structure(list(center=wcenter, cov=wcov, dist=dist2, outliers = which(wt!=1), weights = wt), class = "covRobust"))
}
#' Weighted Orthogonalized Gnanadesikan-Kettenring (WOGK) Robust Covariance Estimator
#'
#' @description See \code{\link{cov.ogk}} for information on how this estimator is computed. The only difference
#' is that the final covariance matrix is not based on dropping identified outliers, but rather on
#' smoothly downweighting them.
#'
#' \cr
#' Several options to use as scale and location estimators are offered here: \cr \cr
#' - "tau" is the tau-scale defined in Yohai and Zamar (1998). \cr
#' - "pb" is the percentage bend estimator (Shoemaker & Hettmansperger, 1982). \cr
#' - "bisq" is Tukey's bisquare estimator. \cr
#' - "huber" is Huber's estimator (Huber, 1964). \cr
#' - "mopt" is the modified optimal estimator. \cr
#' - "Qn" and "Sn" are two alternatives to the median based measures of location and scale (Rousseeuw, Peter, & Croux, 1993). \cr
#'
#' @param x a data frame or matrix of numeric covariates
#' @param method one of "tau", "pb", "bisq", "huber", "mopt", "Qn", or "Sn"
#' @param opts list of options for the various scale estimators. "b" determines the percentage bend coefficient for "pb", "eff"
#' determines the efficiency of the "huber", "bisquare" and "mopt" scale estimators.
#' @param iter the number of refinement steps. defaults to 2.
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after applying casewise weights.
#' \item \strong{cov}: covariance matrix of cleaned data set after applying casewise weights.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#'
#' @references
#' Huber, P. J. (1964). Robust estimation of a location parameter. Ann. Math. Statist. 35, 73–101. \cr \cr
#' Shoemaker, L. H., & Hettmansperger, T. P. (1982). Robust estimates and tests for the one- and two-sample scale models. Biometrika, 69:47–54 \cr \cr
#' Rousseeuw, Peter J.; Croux, Christophe (1993). Alternatives to the Median Absolute Deviation. Journal of the American Statistical Association, 88(424): 1273–1283, doi:10.2307/2291267 \cr \cr
#' Yohai, V.J. and Zamar, R.H. (1998). High breakdown point estimates of regression by means of the minimization of an efficient scale. Journal of the American Statistical Association, 86:403–413. \cr \cr
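#'
#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); unlike cov.ogk,
#' ## outliers are smoothly downweighted rather than dropped.
#' set.seed(102)
#' x <- matrix(rnorm(150 * 4), 150, 4)
#' x[1:6, ] <- x[1:6, ] + 7
#' fit <- cov.wogk(x, method = "mopt")
#' round(fit$weights, 3)
#' }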
cov.wogk <- function(x, method =c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn"), iter = 2, opts = list("b" = 0.10, "eff" = 0.90)){
x <- as.matrix(x);n <- nrow(x); p <- ncol(x)
method <- match.arg(method)
mufun <- .mufun(method, opts); scfun <- .scfun(method, opts)
if (is.null(opts$b) || opts$b >= 0.50) opts$b <- 0.49
vrob <- .OGKcustomvrob(mufun, scfun)
covfun <- .OGKmakecovfun(mufun,scfun,vrob)
new <- first <- covfun(x)
cov <- first$cov
center <- as.vector(first$center)
Z <- first$Z
for (i in seq_len(iter)) {
old <- new
new <- covfun(old$Z)
cov <- mpd(old$A %*% tcrossprod(new$cov, old$A))
center <- as.vector(old$A %*% as.vector(new$center))
Z <- new$Z
}
mu <- apply(Z, 2, function(i) mufun(i))
sigma <- apply(Z, 2, function(i) scfun(i))
Z <- sweep(Z, 2, mu)
Z <- sweep(Z, 2, sigma, "/")
dist2 <- rowSums(Z^2)
cdelta <- hdmedian(dist2)/qchisq(0.5, p)
cov <- cdelta * cov
## Calculate Weights Based On Initial Covariance Estimate
if (p<=10){alpha <- (0.241-0.0029*p)/sqrt(n)}
if (p>=11){alpha <- (0.252-0.0018*p)/sqrt(n)}
cv <- qchisq(alpha,p)/qchisq(0.5,p);
dist2<-mahalanobis_dist(x,mu,cov);
wt = cvreg:::mvSmoothWt(dist2/median(dist2), cv, 1.5); h<-sum(wt)
outliers <- which(dist2 > median(dist2)*cv)
## Calculate New Covariance Matrix
wcov <- cov.wt(x, wt, method = "ML")
wcenter <- wcov$center
wcov <- wcov$cov
if (!pdcheck(wcov)){
tr1 <- sum(diag(wcov))
wcov <- covShrink(x, wt, target = "identity")
tr2 <- sum(diag(wcov)); adj <- tr1/tr2; wcov <- adj * wcov
}
return(structure(list(center=wcenter, cov=wcov, dist=dist2, outliers = outliers, weights = wt), class = "covRobust"))
}
#' Reweighted Median Ball Covariance Estimator
#'
#' @description This implements the reweighted median ball estimator of Olive (2004). The algorithm is
#' initialized using the BACON algorithm of Billor, Hadi, and Velleman (2000).
#'
#' The implementation in this package checks each iteration's estimate of the covariance, and if the estimated covariance matrix
#' is not positive definite, shrinkage is applied to obtain a well-conditioned covariance matrix. Following
#' the shrinkage, the matrix is rescaled so that positive-definiteness is retained without allowing the overall
#' size of the estimates to be affected. This is done using the ratio of the trace of the initial non-positive definite
#' matrix to the trace of the shrinkage-estimated matrix, i.e., \eqn{cov_{adj} = cov_{shrink} \left(tr(cov_{init})/tr(cov_{shrink})\right)}.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param iter number of iterations. defaults to 5. recommended to leave as 5.
#'
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after discarding outliers.
#' \item \strong{cov}: covariance matrix of cleaned data set after discarding outliers.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#' @references
#' Olive, D. J. (2004). A resistant estimator of multivariate location and dispersion. Computational Statistics & Data Analysis, 46, 93–102 \cr \cr
#' Billor, N., Hadi, A. S., & Velleman , P. F. (2000). BACON: Blocked Adaptive Computationally-Efficient Outlier Nominators; Computational Statistics and Data Analysis 34, 279–298. doi: 10.1016/S0167-9473(99)00101-2
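#'
#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed).
#' set.seed(103)
#' x <- matrix(rnorm(200 * 5), 200, 5)
#' x[1:8, ] <- x[1:8, ] + 6
#' fit <- cov.rmb(x)
#' fit$outliers
#' }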
cov.rmb<-function(x, iter = 5){
is.pd <- function(sigma) {min(eigen(sigma,T,T)$values) > 1e-6}
x=as.matrix(x);varnames <- colnames(x)
x=na.omit(x)
p <- dim(x)[2]
n <- dim(x)[1]
init <- cov.bacon(x)
covs <- init$cov
mns <- init$center
for(i in 1:iter) {
md2 <- mahalanobis_dist(x, mns, covs)
medd2 <- hdmedian(md2)
mns <- colMeans(x[md2 <= medd2, ])
covs <- crossprod(sweep(x[md2 <= medd2,],2,mns))/nrow(x[md2 <= medd2,])
if (!is.pd(covs)){
## If the calculated covariance is not positive definite, calculate
## a shrinkage estimate of the covariance to obtain a positive
## definite matrix. Adjust the shrinkage estimate to have the same
## trace as the initial matrix to prevent sudden jumps in the iterations.
tr1 <- sum(diag(covs))
covs <- covShrink(sweep(x[md2 <= medd2,],2,mns), target = "pooled")
tr2 <- sum(diag(covs))
adj <- tr1/tr2
covs <- adj * covs
}
}
covb <- covs
mnb <- mns
critb <- sqrt(det(covb))
covv <- diag(p)
med <- apply(x, 2, hdmedian)
md2 <- mahalanobis_dist(x, center = med, covv)
medd2 <- hdmedian(md2)
mns <- apply(x[md2 <= medd2, ], 2, cvreg:::huberMean)
covs <- crossprod(sweep(x[md2 <= medd2,],2,mns))/nrow(x[md2 <= medd2,])
if (!is.pd(covs)){
tr1 <- sum(diag(covs))
covs <- covShrink(x[md2 <= medd2,], target = "adaptive")
tr2 <- sum(diag(covs))
adj <- tr1/tr2
covs <- adj * covs
}
for(i in 1:iter){
md2 <- mahalanobis_dist(x, mns, covs)
medd2 <- hdmedian(md2)
mns <- colMeans(x[md2 <= medd2, ])
covs <- crossprod(sweep(x[md2 <= medd2,],2,mns))/nrow(x[md2 <= medd2,])
if (!is.pd(covs)){
tr1 <- sum(diag(covs))
covs <- covShrink(sweep(x[md2 <= medd2,],2,mns), target = "pooled")
tr2 <- sum(diag(covs))
adj <- tr1/tr2
covs <- adj * covs
}
}
crit <- sqrt(det(covs))
if(crit < critb) {
critb <- crit
covb <- covs
mnb <- mns
}
## Adjust Scale
rd2 <- mahalanobis_dist(x, mnb, covb)
const <- hdmedian(rd2)/(qchisq(0.5, p))
covb <- const * covb
## First Reweighting Step
rd2 <- mahalanobis_dist(x, mnb, covb)
up <- qchisq(1 - (0.3/n^0.45), p)
rmnb <- colMeans(x[rd2<=up,])
rcovb <- crossprod(sweep(x[rd2<=up,], 2, rmnb)) / nrow(x[rd2<=up,])
if (!is.pd(rcovb)){
tr1 <- sum(diag(rcovb))
rcovb <- cov.nlshrink(x[rd2 <= up,])
tr2 <- sum(diag(rcovb))
adj <- tr1/tr2
rcovb <- adj * rcovb
}
rd2 <- mahalanobis_dist(x, rmnb, rcovb)
const <- hdmedian(rd2)/(qchisq(0.5, p))
rcovb <- const * rcovb
## Second Reweighting Step
rd2 <- mahalanobis_dist(x, rmnb, rcovb)
up <- qchisq(1 - (0.3/n^0.45), p)
rmnb <- colMeans(x[rd2<=up,])
rcovb <- crossprod(sweep(x[rd2<=up,], 2, rmnb)) / nrow(x[rd2<=up,])
if (!is.pd(rcovb)){
tr1 <- sum(diag(rcovb))
rcovb <- cov.nlshrink(x[rd2 <= up,])
tr2 <- sum(diag(rcovb))
adj <- tr1/tr2
rcovb <- adj * rcovb
}
outliers <- seq(1,nrow(x),by=1)[rd2 >= up]
rd2 <- mahalanobis_dist(x, rmnb, rcovb)
const <- hdmedian(rd2)/(qchisq(0.5, p))
rcovb <- const * rcovb
names(rmnb) <- colnames(rcovb) <- rownames(rcovb) <- varnames
return(structure(list(center = rmnb, cov = rcovb, dist=rd2, outliers=outliers, weights = as.numeric(rd2 <= up)), class = "covRobust"))
}
#' Multivariate Student's T Distribution Covariance Estimator
#'
#' @param x a data frame or matrix of numeric covariates
#' @param nu the normality parameter. must be >= 3. lower values give more outlier resistant estimates. defaults to NULL, which estimates the parameter automatically.
#' @param wt an optional vector of initial weights equal in length to the number of observations.
#' @param maxit maximum number of iterations. defaults to 100.
#' @param tol convergence tolerance. defaults to 1e-4
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after applying casewise weights.
#' \item \strong{cov}: covariance matrix of cleaned data set after applying casewise weights.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#'
#' @references
#' Kent, J.T., Tyler, D.E., and Vardi, Y. (1994). A curious likelihood identity for the multivariate t-distribution. Communications in Statistics—Simulation and Computation 23, 441–453.
#'
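#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); supplying nu
#' ## skips the automatic estimation of the normality parameter.
#' set.seed(104)
#' x <- matrix(rnorm(150 * 3), 150, 3)
#' x[1:5, ] <- x[1:5, ] + 6
#' fit <- cov.st(x, nu = 5)
#' fit$nu
#' fit$outliers
#' }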
cov.st = function (x, nu = NULL, wt = NULL, maxit = 100, tol = 0.0001) {
if (is.null(wt)) wt <- rep(1, nrow(x))
covfun <- function(x, nu = NULL, wt = NULL, maxit = 100, tol = 0.0001){
center = TRUE
test.values <- function(x) {
if (any(is.na(x)) || any(is.infinite(x)))
stop("missing or infinite values in 'x'")
}
scale.fast <- function(x, center, n, p) x - rep(center, rep(n, p))
x <- as.matrix(x)
n <- nrow(x)
p <- ncol(x)
dn <- colnames(x)
test.values(x)
if (is.null(wt)){wt <- rep(1, n)}else{wt <- wt; if(sum(wt)==1) wt <- (wt/sum(wt)) * n}
wt0 <- wt
test.values(wt)
if (length(wt) != n)
stop("length of 'wt' must equal number of observations")
if (any(wt < 0))
stop("negative weights not allowed")
if (!sum(wt))
stop("no positive weights")
x <- x[wt > 0, , drop = FALSE]
wt <- wt[wt > 0]
n <- nrow(x); wt <- (wt/sum(wt)) * n
loc <- colSums(wt * x)/sum(wt)
if (is.numeric(center)) {
if (length(center) != p)
stop("'center' is not the right length")
loc <- center
}
else if (is.logical(center) && !center)
loc <- rep(0, p)
use.loc <- is.logical(center) && center
w <- wt * (1 + p/nu)
endit <- 0
for (iter in 1L:maxit) {
w0 <- w
X <- scale.fast(x, loc, n, p)
sX <- svd(sqrt(w/sum(w)) * X, nu = 0)
if (any(sX$d < 0.0001)){
dm <- sX$d
for (i in 2:length(dm)){
dm[i] <- sum(c((0.95*dm[i]), (0.05*dm[i-1])))
}
dm[1] <- dm[1] * 0.95
sX$d <- dm
}
wX <- X %*% sX$v %*% diag(1/sX$d, length(sX$d), ncol = p)
Q <- drop(wX^2 %*% rep(1, p))
w <- (wt * (nu + p))/(nu + Q)
if (use.loc)
loc <- colSums(w * x)/sum(w)
if (all(abs(w - w0) < tol))
break
endit <- iter
}
cov <- mpd(crossprod(sqrt(w)*X)/sum(wt))
if (length(dn)) {
dimnames(cov) <- list(dn, dn)
names(loc) <- dn
}
ans <- list(Sigma = cov, center = loc, w = w)
ans
}
if (is.null(nu)){
estNu <- function(x){
make.nufun = function(x, weights){
x <- as.matrix(x)
function(df){
out = covfun(x, nu = df, wt = weights)
sum(mvtnorm::dmvt(x, out$center, out$Sigma, df = df, log = T))
}
}
init <- sqrt(sqrt(nrow(x)) * sqrt(ncol(x)));nufun=make.nufun(x,wt)
optim(init, fn = function(df){ -1 * nufun(df) }, method = "L-BFGS-B", lower = 3, upper = 100)$par
}
nu <- estNu(x)
}
ans <- covfun(x, nu, wt, maxit, tol); ans$nu <- nu
dist=mahalanobis_dist(x,center=ans$center,cov=ans$Sigma)
crit<-sqrt(qchisq(0.975, ncol(x)))
vec<-1:nrow(x)
dis<-sqrt(dist)
chk<-ifelse(dis>crit,TRUE,FALSE)
outliers<-vec[chk]
return(structure(list(center=ans$center, cov=ans$Sigma, dist=dist, weights = ans$w, outliers=outliers, nu=ans$nu), class="covRobust"))
}
#' Minimum Regularized Generalized Variance Covariance Estimator
#'
#' @description This is very similar to the minimum regularized covariance determinant (MRCD)
#' estimator. The generalized variance of a data set is simply the
#' determinant of its covariance matrix. However, this function uses an initial
#' robust estimator of the covariance for each of the N "leave one out"
#' subsets of the data. This is used to calculate depth values, for which an adaptive threshold
#' is used to declare an observation an outlier. The covariance matrix is then recalculated
#' on the subset. This process is then repeated until the generalized variance
#' of the matrix converges. In order to ensure that the generalized variance is non-negative
#' at each iteration, the estimated covariance is checked for positive-definiteness and shrinkage
#' applied in the event of a non-positive definite matrix.
#'
#' Optionally, the number of maximum iterations can be set to
#' zero, in which case the raw generalized variance estimator will be returned.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param maxit number of maximum iterations. defaults to 10. set to 0 to obtain the raw initial estimate.
#' @param tol the tolerance value for convergence. defaults to 1e-6.
#'
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after removing outliers.
#' \item \strong{cov}: covariance matrix of cleaned data set after removing outliers.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers. here they are binary with 0 marking an outlier and 1 otherwise.
#' \item \strong{iter}: the number of iterations taken to finish.
#' \item \strong{converged}: TRUE if the algorithm converged by maxit, FALSE otherwise. If FALSE, re-run with a larger maxit.
#' \item \strong{trace}: The generalized variance values at each iteration.
#' }
#' @export
#'
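#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed).
#' set.seed(105)
#' x <- matrix(rnorm(120 * 4), 120, 4)
#' x[1:5, ] <- x[1:5, ] + 6
#' fit <- cov.mrgv(x, maxit = 10)
#' fit$converged
#' fit$outliers
#' }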
cov.mrgv <-function(x, maxit = 10, tol = 1e-6){
gvar<-function(m){exp(sum(log(eigen(crossprod(scale(m,T,F))/nrow(m),symmetric=T,only.values=T)$values)))}
outFun <- function(x){
x<-x[!is.na(x)];n<-length(x);temp<-idealf(x)
gval <- (17.63*n-23.64)/(7.74*n-3.71)
cl<-temp$ql-gval*(temp$qu-temp$ql);cu<-temp$qu+gval*(temp$qu-temp$ql)
flag<-NA;outid<-NA;vec<-c(1:n);for(i in 1:n){flag[i]<-(x[i]< cl || x[i]> cu)}
if(sum(flag)==0)outid<-NULL;if(sum(flag)>0)outid<-vec[flag]
keep<-vec[!flag];outval<-x[flag];n.out=sum(length(outid))
list(out.val=outval,out.id=outid,keep=keep,n=n,n.out=n.out,cl=cl,cu=cu)
}
idealf<-function(x,na.rm=FALSE){
if(na.rm)x<-x[!is.na(x)];j<-floor(length(x)/4 + 5/12);y<-sort(x)
g<-(length(x)/4)-j+(5/12);ql<-(1-g)*y[j]+g*y[j+1];k<-length(x)-j+1
qu<-(1-g)*y[k]+g*y[k-1];list(ql=ql,qu=qu)
}
m<-x<-na.omit(as.matrix(x));p<-ncol(x);names<-colnames(x);dval<-0;keeps<-1:nrow(m)
for(i in 1:nrow(m)){dval[i]<-gvar(m[-i,])}
wch<-outFun(dval)$keep;m<-m[wch,];keeps<-keeps[wch]
sigma <- crossprod(scale(m,T,F))/nrow(m);
pd <- min(eigen(sigma,T,T)$values) > 1e-6;
if(!pd){
sigma <- covShrink(m, target = "unequal")
attr(sigma, "alpha") <- NULL
attr(sigma, "alpha.var") <- NULL
}
center <- colMeans(m)
gv<-trace<-exp(sum(log((eigen(sigma,symmetric=T,only.values=T)$values))));iter<-0;converged<-TRUE
if (maxit > 0){
iter<-0;change<-1000;mm<-m;gv.old<-gv;converged<-FALSE
while(!converged & iter < maxit) {
## Recalculate Covariance
dval <-0; for(i in 1:nrow(mm)){dval[i]<-gvar(mm[-i,])}
wch <- outFun(dval)$keep;mm<-mm[wch,];keeps<-keeps[wch]
center <- colMeans(mm);
sigma <- crossprod(scale(mm,T,F))/nrow(mm);
## If the matrix is not positive definite, use a shrinkage
## estimate to obtain a well conditioned matrix, and adjust
## to have the same trace as the unpenalized covariance
## matrix. This ensures that the shrinkage does not result
## in the algorithm terminating prematurely due to shrinkage
## inevitably reducing the determinant.
pd <- min(eigen(sigma,T,T)$values) > 1e-6;
if(!pd) {
sigma <- covShrink(mm,target="adaptive")
tr <- sum(diag(cov(mm))); tr2 <- sum(diag(sigma))
tk <- tr/tr2; sigma <- sigma * tk
attr(sigma, "alpha") <- NULL
attr(sigma, "alpha.var") <- NULL
}
## Check for convergence
trace[2+iter]<- gv.new <- exp(sum(log(eigen(sigma,T,T)$values)))
iter <- iter + 1
change <- max(gv.old-gv.new, 0)
gv.old <- gv.new
converged <- change <= tol
}
}
else{
mm <- m
}
## Adjust Scale
rd2 <- mahalanobis_dist(mm,center,sigma)
const <- median(rd2)/(qchisq(0.5, p))
covb <- mpd(const*sigma)
colnames(covb)<-rownames(covb)<-names
weights <- rep(0, nrow(x)); weights[keeps] <- 1
return(structure(list(center = center, cov = covb, dist=rd2, outliers = setdiff(1:nrow(x),keeps), weights = weights, iter = iter, trace = trace, converged = converged), class="covRobust"))
}
#' Minimum Variance Vector Covariance Estimator
#'
#' @description This implements the minimum variance vector covariance estimator described in
#' Herwindiati, Djauhari, and Mashuri (2007). It is exactly analogous to the minimum covariance
#' determinant (MCD), except the trace of the matrix is the objective function, rather than
#' the determinant. The deterministic MCD algorithm described in Hubert, Rousseeuw, and Verdonck (2012)
#' is adapted here, rather than adapting the fast MCD algorithm in the paper introducing the MVV.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param kappa the proportion of the data to use in each subset. defaults to 0.75. must be > 0.50.
#' @param method the method of measuring location and scale. see \code{\link{cov.mrcd}} for details on
#' the available options.
#' @param opts list of options for the various scale estimators. "b" determines the percentage bend coefficient for "pb", "eff" determines the efficiency of the "huber", "bisquare" and "mopt"
#' scale estimators.
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after removing outliers.
#' \item \strong{cov}: covariance matrix of cleaned data set after removing outliers.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers. here they are binary, with 0 marking an outlier and 1 otherwise.
#' }
#' @export
#'
#' @references
#' Herwindiati, D. E.; Djauhari, M. A.; Mashuri, M. (2007). Robust Multivariate Outlier Labeling. Communications in Statistics - Simulation and Computation, 36(6), 1287–1294. doi:10.1080/03610910701569044 \cr \cr
#' Hubert, M.; Rousseeuw, P.J.; Verdonck, T. (2012) A Deterministic Algorithm for Robust Location and Scatter, Journal of Computational and Graphical Statistics, 21(3), 618-637, doi: 10.1080/10618600.2012.672100 \cr
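#'
#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package and the packages used for the
#' ## initial subsets, e.g. psych and robustbase, are installed).
#' set.seed(106)
#' x <- matrix(rnorm(200 * 5), 200, 5)
#' x[1:10, ] <- x[1:10, ] + 6
#' fit <- cov.mvv(x, kappa = 0.75, method = "tau")
#' fit$outliers
#' }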
cov.mvv = function (x, kappa = 0.75, method = c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn"), opts = list("b" = 0.10, "eff" = 0.90)) {
method <- match.arg(method);scalefn <- .scfun(method, opts);mufn <- .mufun(method, opts)
x<-as.matrix(x);n=nrow(x);p=ncol(x);minscale=0.000001;varnames<-colnames(x);maxcsteps=2000
hsets.init = NULL;save.hsets = FALSE;full.h = TRUE;trace = FALSE
rpack <- function(x, h, full.h, scaled = TRUE, scalefn, mufn) {
initset <- function(X, scalefn, mufn, V, h) {
stopifnot(length(d <- dim(X)) == 2, length(h) == 1, h >= 1)
n <- d[1]
stopifnot(h <= n)
XC <- Scale2(X, mufn, NULL);lambda <- apply(XC,2,scalefn)
sqrtcov <- V%*%(lambda*t(V));sqrtinvcov<-V%*%(t(V)/lambda)
estloc <- apply(X%*%sqrtinvcov,2,mufn)%*%sqrtcov
XC <- (X-rep(estloc,each = n))%*%V
sort.list(mahalanobis_dist(XC,apply(XC,2,mufn),diag(lambda),tol=1e-100))[1:h]
}
stopifnot(length(dx <- dim(x)) == 2)
n <- dx[1];p <- dx[2]
if (!scaled) {x <- as.matrix(Scale2(x, center = mufn, scale = scalefn))}
bacon.cov <- function(x){
znorm <- sqrt(rowSums(x^2));ind5 <- order(znorm);half <- ceiling(n/2)
Hinit <- ind5[1:half];return(cov(x[Hinit, , drop = FALSE]))
}
.cov <- function(x) {
x<-na.omit(as.matrix(x));p<-ncol(x);n<-nrow(x);res<-crossprod(x)/n;return(res)
}
hsets <- matrix(integer(), h, 7); Thr <- 1/(4 * (n^0.25) * sqrt(pi * log(n)))
hsets[, 1] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(tanh(x))),symmetric=T)$vectors, h=h)
hsets[, 2] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(x,method="s")),symmetric=T)$vectors, h=h)
hsets[, 3] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(qnorm(apply(x,2,rank)/(n+1)))),symmetric=T)$vectors, h=h)
hsets[, 4] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(cov2cor(mpd(bacon.cov(x))),symmetric=T)$vectors, h=h)
hsets[, 5] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(mpd(cor(psych::winsor(x,trim = 0.10))),symmetric=T)$vectors, h=h)
hsets[, 6] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(cov2cor(covShrink(apply(x,2,rank))),symmetric=T)$vectors, h=h)
hsets[, 7] <- initset(x, scalefn = scalefn, mufn = mufn, V = eigen(cor(qnorm(pmin(pmax(apply(x, 2, rank)/n, Thr), 1 - Thr))),symmetric=T)$vectors, h=h)
hsetsN <- matrix(integer(), n, 7)
for (k in 1:7) {
xk <- x[hsets[, k], , drop = FALSE]
Eig <- eigen(crossprod(scale(xk,T,F))/nrow(xk), symmetric=T)
score <- (x - rep(colMeans(xk), each = n)) %*% Eig$vectors
values <- Eig$values
ord <- order(mahalanobis_dist(score, colMeans(score),diag(sqrt(abs(as.vector(values)))),tol=1e-100))
hsetsN[, k] <- ord
}
hsetsN
}
.Inv <- function(alpha, mT, nu, mU) {
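## Descriptive note (added): this computes the inverse of alpha*mT + nu*mU%*%t(mU)
## without a full p x p inversion. It writes out the closed-form inverse of the
## target mT (assumed to have an equicorrelation structure, which includes the
## identity target used here) and then handles the low-rank part via the
## Woodbury identity.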
p = dim(mT)[1];pp = dim(mU);vD = sqrt(diag(mT));imD = diag(vD^(-1));R = imD %*% mT %*% imD
cr = R[2, 1];I = diag(1, p);J = matrix(1, p, p);imR = 1/(1 - cr) * (I - cr/(1 + (p - 1) * cr) * J)
imB = (alpha)^(-1) * imD %*% imR %*% imD;Temp <- pseudoinverse(diag(pp[2]) + nu * (t(mU) %*% (imB %*% mU)),tol=1e-100)
return(imB - (imB %*% mU) %*% (nu * Temp) %*% (t(mU) %*% imB))
}
.COV <- function(XX, vMu, alpha = NULL, mT, scfac, invert = FALSE) {
mE <- XX - vMu;n <- dim(mE)[2];p <- dim(mE)[1]
mS <- tcrossprod(mE)/n;rcov <- alpha * mT + (1 - alpha) * scfac * mS
if (invert) {
if (p>n||!pdcheck(mS)){nu<-(1-alpha)*scfac;mU<-mE/sqrt(n);inv_rcov<-.Inv(alpha=alpha,mT=mT,nu=nu,mU=mU)}
else {inv_rcov<-pseudoinverse(rcov,tol=1e-100)}
return(list(alpha = 0, mT = mT, cov = mS, rcov = rcov, inv_rcov = inv_rcov))
}
else return(list(alpha = 0, mT = mT, cov = mS, rcov = rcov))
}
.cons<-function(p,alpha){qalpha<-qchisq(alpha,p);caI<-pgamma(qalpha/2,p/2+1)/alpha;1/caI}
.cstep <- function(mX,alpha=0,mT=NULL,vMu=NULL,mIS = NULL,h,scfac,index = NULL,maxcsteps) {
n <- dim(mX)[2];p <- dim(mX)[1];if(is.null(index)){index<-sample(1:n,h)}
XX <- mX[, index];if(is.null(vMu)){vMu<-rowMeans(XX)}
if (is.null(mIS)){ret=.COV(XX=XX,vMu=vMu,alpha=alpha,mT=mT,scfac = scfac,invert=T);mIS = ret$inv_rcov}
vdst = diag(t(mX-vMu)%*%(mIS%*%(mX - vMu)));index = sort(sort.int(vdst, index.return = T)$ix[1:h]);iter = 0
while (iter < maxcsteps) {
XX <- mX[, index];vMu<-rowMeans(XX)
ret <- .COV(XX = XX, vMu = vMu, alpha = 0, mT = mT, scfac = scfac, invert = T);mIS <- ret$inv_rcov
vdst <- diag(t(mX - vMu) %*% (mIS %*% (mX - vMu)));nndex <- sort(sort.int(vdst, index.return = T)$ix[1:h])
if (all(nndex == index))
break
index <- nndex;iter <- iter + 1
}
return(list(index=index,numit=iter,mu=vMu,cov=ret$rcov,icov=ret$inv_rcov,alpha=ret$alpha,mT=ret$mT,dist=vdst,scfac=scfac))
}
mX<-t(x);mintrace<-0;n<-dim(mX)[2];p<-dim(mX)[1]
if (!is.null(kappa)){h <- ceiling(kappa*n)}else{kappa<-0.80;h<-ceiling(kappa*n)}
if (kappa < 0.5 | kappa > 1) stop("'kappa' must be between 0.5 and 1.0!")
obj <- function(x) {sum(diag(x))}
vmx <- apply(mX,1,mufn);vsd<-apply(mX,1,scalefn);vsd[vsd<minscale]<-minscale
Dx <- diag(vsd);mU <- scale(t(mX), center = vmx, scale = vsd);mX <- t(mU);mT <- diag(p)
if (is.null(hsets.init)) {
hsets.init <- rpack(x = t(mX), h=h, full.h = full.h, scaled = FALSE, scalefn = scalefn, mufn = mufn)
dh <- dim(hsets.init)
}
else {
if (is.vector(hsets.init))
hsets.init <- as.matrix(hsets.init)
dh <- dim(hsets.init)
if (dh[1] < h || dh[2] < 1)
stop("'hsets.init' must be a h' x L matrix (h' >= h) of observation indices")
if (full.h && dh[1] != n)
warning("'full.h' is true, but 'hsets.init' has less than n rows")
if (min(hsets.init) < 1 || max(hsets.init) > n)
stop("'hsets.init' must be in {1,2,...,n}; n = ",
n)
}
setsV <- 1:ncol(hsets.init)
hsets.init <- hsets.init[1:h, ]
scfac <- .cons(p, h/n)
nsets <- ncol(hsets.init)
hset.csteps <- integer(7)
ret <- .cstep(mX = mX, alpha = 0, mT = mT, h=h, scfac = scfac, index = hsets.init[, 1], maxcsteps = maxcsteps)
objret <- obj(ret$cov)
hindex <- ret$index
best7pack <- 1
for (k in setsV) {
tmp <- .cstep(mX = mX, alpha = 0, mT = mT, h=h, scfac = scfac, index = hsets.init[, k], maxcsteps = maxcsteps)
objtmp <- obj(tmp$cov)
hset.csteps[k] <- tmp$numit
if (objtmp < objret) {
ret <- tmp;objret <- objtmp;hindex <- tmp$index;best7pack <- k
}
else if (objtmp == objret)
best7pack <- c(best7pack, k)
}
c_kappa <- ret$scfac
mE <- mX[, hindex] - ret$mu
weightedScov <- tcrossprod(mE)/(h-1)
D <- c_kappa * diag(1,p)
muvec = apply(mX[, hindex], 1, mean)
Sigma = c_kappa * weightedScov
mX <- t(crossprod(mX, Dx)) + vmx
muvec <- as.vector(Dx %*% muvec + vmx)
Sigma <- Dx %*% (Sigma %*% Dx)
colnames(Sigma) <- rownames(Sigma) <- names(muvec) <- varnames
weights <- rep(0, n); weights[hindex] <- 1
return(structure(list(center = muvec, cov = Sigma, weights = weights,
dist = mahalanobis_dist(x, center=muvec, cov=Sigma, tol = 1e-100),
outliers = seq(1, nrow(x))[-hindex]),
class = "covRobust"))
}
#' Rocke's Multivariate Translated Biweight S-Estimator
#'
#' @description Using a robust \eqn{\rho} function in multivariate estimation has a major
#' drawback. As the dimensionality of the data matrix increases (variables) the rows of
#' the matrix (observations) will receive increasingly uniform weights. Hence, as p grows
#' in size, the robust multivariate estimate converges to the standard estimator. However,
#' this is undesirable. While this would be a good thing if the efficiency relative to the
#' MLE increased with the number of observations, it is not desirable when it occurs as a result of
#' increasing dimensionality - the robustness is lost. Worse, severe outliers may
#' receive larger weights than inliers and consequently bias the estimation. David
#' Rocke (1996) proposed a modified "translated" bisquare \eqn{\rho} function that adapts to
#' the dimensionality of a data set to prevent the ARE from increasing to values infinitesimally
#' close to one while also preserving the robustness and unbiasedness of the estimation.
#'
#' The algorithm is initialized here by using BACON (Blocked Adaptive Computationally-Efficient
#' Outlier Nominator) algorithm of Billor, Hadi, and Velleman (2000).
#'
#'
#' @param X a data frame or matrix of numeric covariates.
#' @param phi the factor determining the minimum number of observations not declared outliers following the
#' initial estimation stage. the number of points not declared outliers will be at least phi * p. the default
#' is 2. put differently, the number of observations declared outliers in the initial step will be at most n - phi*p.
#' however, this has only an indirect effect on the final points declared outliers.
#' @param q a tuning parameter. defaults to 2. recommended to not change unless you know what you are doing.
#' @param maxit maximum number of iterations
#' @param maxsteps limit on the number of steps for the line search section of the algorithm.
#' @param tol numeric tolerance for convergence.
#'
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after applying casewise weights.
#' \item \strong{cov}: covariance matrix of cleaned data set after applying casewise weights.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#'
#' @references
#' Rocke, D. M. (1996). Robustness properties of S-estimators of multivariate location and shape in high dimension. Annals of Statistics, 24, 1327–1345. \cr \cr
#' Rocke, D. M., & Woodruff, D. L. (1996). Identification of outliers in multivariate data. Journal of the American Statistical Association, 91, 1047–1061. \cr \cr
#' Billor, N., Hadi, A. S., & Velleman , P. F. (2000). BACON: Blocked Adaptive Computationally-Efficient Outlier Nominators; Computational Statistics and Data Analysis 34, 279–298. doi: 10.1016/S0167-9473(99)00101-2
#'
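#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); the estimator is
#' ## intended for moderately high-dimensional data.
#' set.seed(107)
#' X <- matrix(rnorm(300 * 20), 300, 20)
#' X[1:12, ] <- X[1:12, ] + 4
#' fit <- cov.rocke(X)
#' fit$outliers
#' }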
cov.rocke <- function (X, phi = 2, q = 2, maxit = 25, maxsteps = 5, tol = 1e-06) {
X <- as.matrix(X)
n <- nrow(X); p <- ncol(X)
consts <- .rockconst(p = p, n = n)
pcrit <- consts$alpha
gamma0 <- consts$gamma
init <- cov.bacon(X)
S0 = init$cov ; mu0 = init$center
S0 = S0/(det(S0)^(1/p))
mah0 = init$dist
mah = mah0
delta <- (1 - p/n)/2
sig <- .rocke_scale(x = mah, gamma = gamma0, q = q, delta = delta)
didi <- mah/sig
dife <- sort(abs(didi - 1))
gg <- min(dife[(1:n) >= (phi * p)])
gamma <- max(gg, gamma0)
sig0 <- .rocke_scale(x = mah, gamma = gamma, delta = delta, q = q)
iter <- 0
difpar <- difsig <- +Inf
while ((((difsig > tol) | (difpar > tol)) & (iter < maxit)) &
(difsig > 0)) {
iter <- iter + 1
w <- wt.rocke(tt = mah/sig, gamma = gamma, q = q)
mu <- colMeans(X * w)/mean(w)
Xcen <- scale(X, center = mu, scale = FALSE)
S <- t(Xcen) %*% (Xcen * w)/n
S <- S/(det(S)^(1/p))
mah <- mahalanobis_dist(x = X, center = mu, cov = S)
sig <- .rocke_scale(x = mah, gamma = gamma, delta = delta, q = q)
step <- 0
delgrad <- 1
while ((sig > sig0) & (step < maxsteps)) {
delgrad <- delgrad/2
step <- step + 1
mu <- delgrad * mu + (1 - delgrad) * mu0
S <- delgrad * S + (1 - delgrad) * S0
S <- S/(det(S)^(1/p))
mah <- mahalanobis_dist(x = X, center = mu, cov = S)
sig <- .rocke_scale(x = mah, gamma = gamma, delta = delta, q = q)
}
dif1 <- as.vector(t(mu - mu0) %*% solve(S0, mu - mu0))/p
dif2 <- max(abs(solve(S0, S) - diag(p)))
difpar <- max(dif1, dif2)
difsig <- 1 - sig/sig0
mu0 <- mu
S0 <- S
sig0 <- sig
}
tmp <- .rockemat(S0 = S0, dis = mah)
S <- tmp$S
ff <- tmp$ff
mah <- mah/ff
crit<-sqrt(qchisq(1 - pcrit, ncol(X)))
vec<-1:nrow(X)
dis<-sqrt(mah)
chk<-ifelse(dis>crit,TRUE,FALSE)
outliers<-vec[chk]
return(structure(list(cov = S, center = mu, dist = mah, outliers = outliers, weights = w, gamma = gamma), class = "covRobust"))
}
#' Yohai's Smoothed Hard Rejection Multivariate M-Estimator
#'
#' @description This implements the 'smoothed hard rejection' estimator. It is very
#' similar to Rocke's translated biweight S-estimator in the \code{\link{cov.rocke}}
#' function, but has been found to work better in problems where the number of
#' variables is greater than 15 or so (Maronna & Yohai, 2017). The algorithm is
#' initialized with Billor, Hadi, and Velleman's (2000) BACON algorithm.
#'
#' @param X a data frame or matrix of numeric covariates
#' @param maxit maximum number of iterations. defaults to 50.
#' @param tol convergence tolerance
#'
#' @return a covRobust object containing the following elements:
#' \itemize{
#' \item \strong{center}: multivariate mean of cleaned data set after applying casewise weights.
#' \item \strong{cov}: covariance matrix of cleaned data set after applying casewise weights.
#' \item \strong{dist}: the mahalanobis distances used in calculating the weights.
#' \item \strong{outliers}: the indices of the outliers identified.
#' \item \strong{weights}: the weights for downweighting outliers.
#' }
#' @export
#'
#' @references
#' Muler, N. & Yohai, V.J. (2002). Robust estimates for arch processes. Journal of Time Series Analysis, 23(3), 341–375. doi:10.1111/1467-9892.00268 \cr \cr
#' Maronna, R.A. & Yohai, V.J. (2017) Robust and efficient estimation of multivariate scatter and location. Computational Statistics and Data Analysis, 109, 64–75. doi: 10.1016/j.csda.2016.11.006 \cr \cr
#' Billor, N., Hadi, A. S., & Velleman , P. F. (2000). BACON: Blocked Adaptive Computationally-Efficient Outlier Nominators; Computational Statistics and Data Analysis, 34, 279–298. doi: 10.1016/S0167-9473(99)00101-2
#'
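#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); recommended for
#' ## problems with more than roughly 15 variables.
#' set.seed(108)
#' X <- matrix(rnorm(300 * 20), 300, 20)
#' X[1:12, ] <- X[1:12, ] + 4
#' fit <- cov.shr(X)
#' fit$outliers
#' }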
cov.shr <- function (X, maxit = 50, tol = 1e-04){
X = as.matrix(X)
d <- dim(X)
n <- d[1]
p <- d[2]
delta <- 0.5 * (1 - p/n)
cc <- .shrconst(p, n)
const <- cc[1]
init <- cov.bacon(X)
S0 <- init$cov ; mu0 <- init$center
S0 <- S0/(det(S0)^(1/p))
mah0 <- init$dist
mah <- mah0
sigma <- .scale_shr(t = mah, delta = delta) * const
iter <- 0
difpar <- +Inf
S0 <- diag(p)
mu0 <- rep(0, p)
dist <- mah
while ((difpar > tol) & (iter < maxit)) {
iter <- iter + 1
w <- wt.shr(dist/sigma)
mu <- colMeans(X * w)/mean(w)
Xcen <- scale(X, center = mu, scale = FALSE)
S <- t(Xcen) %*% (Xcen * w)
S <- S/(det(S)^(1/p))
dist <- mahalanobis_dist(x = X, center = mu, cov = S)
dif1 <- t(mu - mu0) %*% solve(S0, mu - mu0)
dif2 <- max(abs(c(solve(S0, S) - diag(p))))
difpar <- max(c(dif1, dif2))
mu0 <- mu
S0 <- S
}
tmp <- .shrmat(S0 = S0, dis = dist)
dist <- dist/tmp$ff
crit<-sqrt(qchisq(0.975, ncol(X)))
vec<-1:nrow(X)
dis<-sqrt(dist)
chk<-ifelse(dis>crit,TRUE,FALSE)
outliers<-vec[chk]
structure(list(center = mu0, cov = tmp$S, dist = dist, weights = w, outliers = outliers), class = "covRobust")
}
.minNotZero <- function(x){
if (any(x == 0)) min(x[x != 0]) else min(x)
}
.mufun <- function(method = c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn", "mad"), opts){
method <- match.arg(method)
switch(method,
"pb" = function(x){unname(pbLocScale(x,opts$b,T)[1])},
"bisq" = function(x){.locScaleMmod(x, psi = "bisquare", eff = opts$eff)$mu},
"mopt" = function(x){.locScaleMmod(x, psi = "mopt", eff = opts$eff)$mu},
"huber" = function(x){.locScaleMmod(x, psi = "huber", eff = opts$eff)$mu},
"Qn" = function(x){median(x)},
"Sn" = function(x){median(x)},
"tau" = function(x){unname(cvreg::tauLocScale(x, mu=T)[1])},
"mad" = function(x){hdmedian(x)},
)
}
.scfun <- function(method = c("tau", "pb", "bisq", "huber", "mopt", "Qn", "Sn", "mad"), opts){
method <- match.arg(method)
switch(method,
"pb" = function(x){unname(pbLocScale(x,opts$b,F))},
"bisq" = function(x){.locScaleMmod(x, psi = "bisquare", eff = opts$eff)$disper},
"mopt" = function(x){.locScaleMmod(x, psi = "mopt", eff = opts$eff)$disper},
"huber" = function(x){.locScaleMmod(x, psi = "huber", eff = opts$eff)$disper},
"Qn" = function(x){robustbase::s_Qn(x,F)},
"Sn" = function(x){robustbase::s_Sn(x,F)},
"tau" = function(x){unname(cvreg::tauLocScale(x,mu=F))},
"mad" = function(x){cvreg:::HDMAD(x)*1.4826},
)
}
.rocke_scale <- function (x, gamma, q, delta = 0.5, tol = 1e-05) {
n <- length(x)
y <- sort(abs(x))
n1 <- floor(n * (1 - delta))
n2 <- ceiling(n * (1 - delta)/(1 - delta/2))
qq <- y[c(n1, n2)]
u <- 1 + gamma * (delta - 1)/2
sigin <- c(qq[1]/(1 + gamma), qq[2]/u)
if (qq[1] >= 1) {
tolera <- tol
}
else {
tolera <- tol * qq[1]
}
if (mean(x == 0) > (1 - delta)) {
sig <- 0
}
else {
.averho <- function(sig, x, gamma, delta, q){
u <- (x/sig - 1)/gamma
y <- ((u/(2 * q) * (q + 1 - u^q) + 0.5))
y[u >= 1] <- 1
y[u < (-1)] <- 0
mean(y) - delta
}
sig <- saferoot(f = .averho, interval = sigin, x = x, gamma = gamma, delta = delta, q = q, tol = tolera)$root
}
return(sig)
}
wt.rocke <- function (tt, gamma, q) {
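## Descriptive note (added): weight implied by Rocke's translated biweight rho.
## For scaled distances tt the weight decreases as 1 - ((tt - 1)/gamma)^q and is
## set to zero once |tt - 1| exceeds gamma.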
ss <- (tt - 1)/gamma
w <- 1 - ss^q
w[abs(ss) > 1] <- 0
return(w)
}
.rockemat <-function (S0, dis) {
p <- dim(S0)[1]
sig <- hdmedian(dis)
cc <- qchisq(0.5, df = p)
ff <- sig/cc
return(list(ff = ff, S = S0 * ff))
}
.rockconst <- function (p, n) {
beta <- c(-6.1357, -1.0078, 0.81564)
if (p >= 15) {
a <- c(1, log(p), log(n))
alpha <- exp(sum(beta * a))
gamma <- qchisq(1 - alpha, df = p)/p - 1
gamma <- min(gamma, 1)
}
else {
gamma <- 1
alpha <- 1e-06
}
return(list(gamma = gamma, alpha = alpha))
}
.shrconst <- function (p, n){
x <- c(1/p, p/n, 1)
beta <- c(4.3331193, -1.0018907, 0.6352537, 2.4980042, -0.6920007, 0.7344242)
beta <- matrix(beta, byrow = TRUE, ncol = 3)
return(t(x) %*% t(beta))
}
.shrmat <-function (S0, dis) {
p <- dim(S0)[1]
sig <- hdmedian(dis)
cc <- qchisq(0.5, df = p)
ff <- sig/cc
return(list(ff = ff, S = S0 * ff))
}
.scale_shr <- function (t, delta = 0.5, sig0 = hdmedian(t), niter = 50, tol = 1e-04) {
.averho <- function (sig, d, delta) {
d <- d/sig
G1 <- (-1.944); G2 <- 1.728; G3 <- (-0.312); G4 <- 0.016
u <- (d > 9); v <- (d < 4); w <- (1 - u) * (1 - v)
z <- v * d/2 + w * ((G4/8) * (d^4) + (G3/6) * (d^3) + (G2/4) *
(d^2) + (G1/2) * d + 1.792) + 3.25 * u
z <- z/3.25
return(sig * mean(z)/delta)
}
if (mean(t < 1e-16) >= 0.5) {
sig <- 0
}
else {
sig1 <- sig0
y1 <- .averho(sig1, t, delta)
while ((y1 > sig1) & ((sig1/sig0) < 1000)) {
sig1 <- 2 * sig1
y1 <- .averho(sig1, t, delta)
}
if ((sig1/sig0) >= 1000) {
sig <- sig0
}
else {
iter <- 0
sig2 <- y1
while (iter <= niter) {
iter <- iter + 1
y2 <- .averho(sig2, t, delta)
den <- sig2 - y2 + y1 - sig1
if (abs(den) < tol * sig1) {
sig3 <- sig2
iter <- niter + 1
}
else {
sig3 <- (y1 * sig2 - sig1 * y2)/den
}
sig1 <- sig2
sig2 <- sig3
y1 <- y2
}
sig <- sig2
}
}
return(sig)
}
wt.shr <- function(u){
G1 <- -1.944
G2 <- 1.728
G3 <- -0.312
G4 <- 0.016
o <- (u > 9)
v <- (u < 4)
w <- (1 - o) * (1 - v)
z <- v/2 + w * ((G4/2) * (u^3) + (G3/2) * (u^2) + (G2/2) * u + (G1/2))
w <- z/3.25
return(w / max(w))
}
.OGKmakecovfun <- function(mufun, scfun, vrob){
function(X){
X<-as.matrix(X);n<-nrow(X);p<-ncol(X)
sigma <- apply(X, 2, function(i) scfun(i))
Y <- X %*% diag(1/sigma)
U <- matrix(1, p, p)
for (i in 1:p) for (j in i:p) {
U[j, i] <- U[i, j] <- vrob(Y[, i], Y[, j])
}
diag(U) <- 1
E <- eigen(U)$vectors
A <- diag(sigma) %*% E
Z <- Y %*% E
cent <- apply(Z, 2, mufun)
sigma <- apply(Z, 2, scfun)
cov <- A %*% tcrossprod(diag(sigma^2), A)
loc <- A %*% cent
list(cov = mpd(cov), center = loc, A = A, Z = Z)
}
}
.OGKcustomvrob <- function(mufun, scfun){
function(x, y){
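## Gnanadesikan-Kettenring identity: cov(x, y) = (s(x + y)^2 - s(x - y)^2)/4,
## with the robust scale estimate s() used in place of the standard deviation.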
x<-x-mufun(x);y<-y-mufun(y);
return(((scfun(x+y)^2)-(scfun(x-y)^2))/4)
}
}
## Modification of RobStatTM's locScaleM function to accommodate variables with mad=0
.locScaleMmod <- function (x, psi = "bisquare", eff = 0.95, maxit = 150, tol = 1e-06){
suppressPackageStartupMessages(library(RobStatTM))
kpsi <- switch(psi, bisquare = 1, huber = 2, mopt = 3)
if (kpsi == 1) {
if (eff < 0.80 || eff > 0.99){return(cat(crayon::red("eff must be > 0.80 and < 0.99")))}
kBis = splinefun(c(0.80,0.85,0.9,0.95,0.999),c(3.136909,3.44369,3.882662,4.685065,5.526268))(eff)
efis = c(0.80,0.85,0.9,0.95,0.999)
keff = eff
if (kpsi > 0 & keff > 0) {
ktun = kBis; mu0 = hdmedian(x); sig0 = cvreg:::huberMean(abs(x - hdmedian(x)))/sqrt(2/pi)
resi <- (x-mu0)/sig0
if (sig0 < 1e-10) {
mu <- mu0
sigma <- 0
}
else{
dife = 1e+10
iter = 0
while (dife > tol & iter < maxit) {
iter = iter + 1
resi = (x-mu0)/sig0
ww = wt.Bisquare(resi/ktun, kpsi)
mu = sum(ww*x)/sum(ww)
dife = abs(mu-mu0)/sig0
mu0 = mu
}
}
}
rek = resi/ktun
pp = RobStatTM:::psif(rek, kpsi)*ktun
n = length(x)
a = mean(pp^2)
b = mean(RobStatTM:::psipri(rek, kpsi))
sigmu = sig0^2 * a/(n * b^2)
sigmu = sqrt(sigmu)
scat <- RobStatTM:::mscale(u = x - mu, delta = 0.5, tuning.chi = 1.54764, family = "bisquare")
resu <- list(mu = mu, std.mu = sigmu, disper = scat)
}
else if (kpsi==2){
if (eff < 0.80 || eff > 0.99){return(cat(crayon::red("eff must be > 0.80 and < 0.99")))}
kHub = splinefun(c(0.80,0.85,0.9,0.95,0.99),c(0.593,0.732,0.981,1.34,1.809))(eff)
efis = c(0.80,0.85,0.9,0.95,0.99)
keff = eff
ktun = kHub; mu0 = hdmedian(x);
huberscale <- function (y, k = 1.5, mu, tol = 1e-06, maxit = 1000){
y <- y[!is.na(y)];n<-length(y);mu0 <- mu;mu1 <- mu;n1<-n;s0 <- cvreg:::huberMean(abs(y-hdmedian(y)))/sqrt(2/pi)
th <- 2 * pt(k, df = n) - 1;beta <- th + k^2 * (1 - th) - 2 * k * dt(k, df = n)
iter <- 0
converged <- F
while(!converged & iter < maxit) {
yy <- pmin(pmax(mu0 - k * s0, y), mu0 + k*s0)
ss <- sum((yy - mu1)^2)/n1;s1 <- sqrt(ss/beta)
if ((abs(mu0 - mu1) < tol * s0) && abs(s0 - s1) < tol * s0){converged <- TRUE}
else{converged<-FALSE}
iter <- iter + 1;mu0 <- mu1;s0 <- s1
}
s0
}
scale <- huberscale(x, k = ktun, mu = mu0, tol = tol, maxit = maxit)
resu <- list(mu=MASS::hubers(x, k = ktun, s = scale, initmu = mu0)$mu, disper = scale)
resi <- (x - resu$mu) / resu$disper
sigmu <- sqrt(mean(resi^2))/sqrt(length(x))
resu <- list(mu=resu$mu, sigmu = sigmu, disper = resu$disper)
}
else {
family <- "mopt"
eff <- match.closest(eff, c(0.80, 0.85,0.9,0.95,0.99))
efis <- c(0.80,0.85,0.9,0.95,0.99)
if (eff < 0.80 || eff > 0.99){return(cat(crayon::red("eff must be > 0.80 and < 0.99")))}
keff <- eff
if (!is.na(keff)) {
cc <- RobStatTM::mopt(eff)
mu0 <- hdmedian(x)
sig0 <- cvreg:::huberMean(abs(x-hdmedian(x)))/sqrt(2/pi)
resi <- (x - mu0)/sig0
if (sig0 < 1e-10) {
mu <- mu0
sigma <- 0
}
else {
dife <- 1e+10
iter <- 0
while (dife > tol && iter < maxit) {
iter <- iter + 1
resi <- (x - mu0)/sig0
ww <- RobStatTM:::Mwgt(resi, cc, "mopt")
mu <- sum(ww * x)/sum(ww)
dife <- abs(mu - mu0)/sig0
mu0 <- mu
}
}
pp <- RobStatTM::rhoprime(resi, "mopt", cc)
n <- length(x)
a <- mean(pp^2)
b <- mean(RobStatTM:::rhoprime2(resi, "mopt", cc))
sigmu <- sqrt(sig0^2 * a/(n * b^2))
f <- function(u, family = "mopt", cc) {
cc["c"] <- u
integrate(function(x,fam,cc) RobStatTM:::rho(x, fam, cc) * dnorm(x), -Inf, Inf, fam = family, cc = cc)$value - 0.5
}
cc["c"] <- cvreg::saferoot(f, c(0.01, 10), family = family, cc = cc)$root
scat <- RobStatTM:::mscale(x - mu, delta = 0.5, tuning.chi = cc, family = "mopt")
resu <- list(mu = mu, std.mu = sigmu, disper = scat)
}
else {
print(c(eff, " No such eff"))
resu <- NA
}
}
return(resu)
}
#' Calculate Adaptive Huber Covariance Matrix
#'
#' @param x a data frame or matrix of numeric variables
#'
#' @return a covariance matrix
#' @export
#' @keywords internal
#' @examples
#' cov.ahuber(matrix(rnorm(200), 50, 4))
#'
#' @references
#' Huber, P. J. (1964). Robust estimation of a location parameter. Ann. Math. Statist. 35, 73–101. \cr
cov.ahuber <- function(x){
cvreg:::huberCov(as.matrix(x))
}
#' Blocked Adaptive Computationally-Efficient Outlier Nominators
#'
#' @description Intended only for use as an initial estimator for more robust covariance estimates.
#'
#' @param x a data frame or matrix of numeric covariates
#' @param collect numeric factor chosen by the user to define the size of the initial subset (p * collect)
#' @param center0 the initial estimate of the multivariate center.
#' defaults to "SpatialMedian". The other option is "Medians" which uses
#' the componentwise medians (column univariate medians).
#' @param maxiter the maximum number of iterations. defaults to 500.
#'
#' @return a list
#' @export
#' @keywords internal
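#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); cov.bacon is
#' ## mainly used internally to initialize the other robust estimators.
#' set.seed(109)
#' x <- matrix(rnorm(100 * 3), 100, 3)
#' init <- cov.bacon(x)
#' init$converged
#' }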
cov.bacon <- function(x, collect = 4, center0 = c("SpatialMedian", "Medians"), maxiter = 500){
## Original code written May 25, 2001 by U. Oetliker for S-Plus 5.1.
## more Modifications in porting to R : Martin Maechler, May 2009.
## Additional modifications by Brandon Vaughan, 3/13/2020.
x <- as.matrix(x)
center0 <- match.arg(center0)
n <- nrow(x); p <- ncol(x)
m = min(collect * p, n * 0.5)
alpha <- min(0.975, 1-((0.24815 - 0.0021885*p)/sqrt(n)))
chiCrit <- function(n, p, r, alpha){
h <- (n + p + 1) / 2
chr <- max(0, (h - r) / (h + r))
if (n > p){cnp <- 1 + (p + 1) / (n - p) + 2 / (n - 1 - 3*p)}
else{cnp <- 1 + (n + 1) / (p - n) + 2 / (p - 1 - 3*n)}
cnpr <- cnp + chr
cnpr * sqrt(qchisq(alpha / n, p, lower.tail = FALSE))
}
ordered.indices <-
switch(center0,
"Medians" = {
n <- nrow(x); p <- ncol(x)
center <- colMedians(x)
x.centr <- sweep(x, 2, center)
sigma <- crossprod(x.centr)/nrow(x.centr)
rho <- cov2cor(sigma); sigma <- diag(sigma)
alpha <- cvreg:::.estimateAlpha(x.centr)
rho <- ((1-alpha)*rho) + (alpha*diag(p))
order(mahalanobis_dist(x, center, cor2cov(rho, sigma)))
},
"SpatialMedian" = {
n <- nrow(x); p <- ncol(x)
center <- L1median(x)$center
x.centr <- sweep(x, 2, center)
sigma <- crossprod(x.centr)/nrow(x.centr)
rho <- cov2cor(sigma); sigma <- diag(sigma)
alpha <- cvreg:::.estimateAlpha(x.centr)
rho <- ((1-alpha)*rho) + (alpha*diag(p))
order(mahalanobis_dist(x, center, cor2cov(rho, sigma)))
}
)
m <- as.integer(m)
stopifnot(n >= m, m > 0)
ordered.x <- x[ordered.indices, , drop = FALSE]
while (m < n && p > (rnk <- qr(var(ordered.x[1:m, , drop = FALSE]))$rank))
m <- m + 1L
subset <- 1:n %in% ordered.indices[1:m]
presubset <- rep(FALSE, n)
converged <- FALSE
steps <- 1L
while (steps < maxiter &&
!converged) {
r <- sum(subset)
xhat <- x[subset, , drop = FALSE]
center <- colMeans(xhat)
cov <- cov(xhat)
if (!all(eigen(cov,T,T)$values > 0)){
cov <- covShrink(xhat)
}
dis <- sqrt(mahalanobis_dist(x, center, cov))
converged <- !any(xor(presubset, subset))
presubset <- subset
limit <- chiCrit(n, p, r, alpha)
subset <- dis < limit
steps <- steps + 1L
}
list(center = center, cov = cov, dist = dis^2, outliers = which(!subset),
steps = steps, limit = limit, converged = converged)
}
#' Estimate a Robust Correlation Matrix
#'
#' @param x a data frame or matrix of numeric covariates
#' @param covfun a covariance function that returns a list with items named "center" and "cov". defaults to \code{\link{cov.shr}}
#' @param args an optional named list of arguments to pass to covfun
#'
#' @return a correlation matrix
#' @export
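#'
#' @examples
#' \donttest{
#' ## Minimal usage sketch (assumes the package is installed); any covariance
#' ## function returning a list with a "cov" element can be plugged in.
#' set.seed(110)
#' x <- matrix(rnorm(150 * 4), 150, 4)
#' robcor(x, covfun = cov.ogk, args = list(method = "tau"))
#' }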
robcor <- function(x, covfun = cov.shr, args = NULL){
if (!is.null(args)){
args[["x"]] <- x
out <- do.call(covfun, args)
}
else{
out <- covfun(x)
}
return(cov2cor(out$cov))
}