Nothing
###########################################################################################################################################################
# match_multivariate
#
# Internal function that matches precomputed predictive means of multivariate missing data against the predictive means of the observed data
#
# Parameters:
# - y: Matrix that is to be imputed on.
# - y_hat: Matrix containing all predictive means.
# - ry: Logical vector of length length(y) indicating the rows of observed data that are matched against.
# - wy: Logical vector of length length(y) indicating the rows in y for which imputations are created.
# - donors: Size of the donor pool that is computed and drawn from.
# - distmetric: Character string specifying which distance metric is used when computing the donor pool.
# - dimweights: Numerical vector of weights that are applied on the columns of the predictive mean matrices when computing the donor pool.
# - match_partition: List containing partitions of y[ry] and y[wy] that are based on the values of an additional column that has to be matched by which
# has been specified externally by the variable match_vars.
#
# Return value: Matrix containing the imputed values for the missing data.
#
# Details:
# Matching procedure as proposed by Little (1988). When computing the distances, we use the RANN package to perform a fast k-nearest-neighbor
# search via kd-trees. If we choose Mahalanobis distance or its analogon that uses the residual covariance (which is recommended by Little) to
# compute the donor pool, we first have to transform the y_hats as we cannot feed quadratic forms into RANN::nn2. To do this, we first compute
# the square root of the (pseudo)inverse of the covariance matrix via its eigendecomposition, taking advantage of its symmetric positive
# semidefinite structure. We then multiply this squareroot onto the y_hats to obtain the desired transformation and feed the results into the
# nearest-neighbor search, after applying weights in case they have been specified. In the end, we randomly sample from the resulting donor pool
# and return the drawn values as the imputations.
#
############################################################################################################################################################
match_multivariate <- function(y, y_hat, ry, wy, donors, distmetric, dimweights, match_partitions)
{
#if a column is categoric, it has to be binary, so reduce to integer level -1 to get to binary standard
ynum <- do.call(cbind, lapply(1:ncol(y), FUN =
function(i)
{
if(is.factor(y[,i]))
as.integer(y[,i]) -1
else
y[,i]
}))
#initialize y_obs and y_hat_obs/mis
y_obs <- ynum[ry,]
y_hat_obs <- y_hat[ry,]
y_hat_mis <- y_hat[wy,]
#if number of donors is bigger than or equal to number of observed values, we can sample immediately
if(donors >= sum(ry))
{
message("Specified number of donors is bigger than number of observed values that donors can be taken from.")
idx <- replicate(sum(wy), sample(1:sum(ry),1))
return(idx)
}
## if we want to compute distance via a covariance matrix, we have some more work to do:
## we want to use RANN for fastest nearest neighbor search which only provides euclidian distance,
## hence we have to multiply y with the square root of the inverse to get the same effect
if(!distmetric %in% c("manhattan", "euclidian"))
{
#compute fitting covariance matrix
if(distmetric == "mahalanobis")
cov_matrix <- cov(y_obs)
else
cov_matrix <- cov(y_obs - y_hat_obs)
## compute inverse of covariance matrix via eigen decomposition
## if matrix isn't invertible, we can use and simply compute the pseudo inverse from this decomposition as well
# get eigen decomposition of covariance matrix
cov_eigen <- eigen(cov_matrix, symmetric = TRUE)
# check which eigenvalues are sufficiently bigger than 0 so we can invert them
inv_ind <- cov_eigen$values > (cov_eigen$values[[1]] * .Machine$double.eps)
#inverted eigenvalues > eps and get their square root
ev_inv_sqrt <- rep(0, ncol(y))
ev_inv_sqrt[inv_ind] <- 1/sqrt(cov_eigen$values[inv_ind])
# compute distance matrix = (pseudo) inverse of covariance matrix via eigen decomposition
dist_matrix <- cov_eigen$vectors %*% diag(ev_inv_sqrt) %*% t(cov_eigen$vectors)
# transform y_hats
y_hat_obs <- y_hat_obs %*% dist_matrix
y_hat_mis <- y_hat_mis %*% dist_matrix
}
## apply weights
if(!is.null(dimweights))
{
y_hat_obs <- t(t(y_hat_obs) * dimweights) # '*' operator multiplies component-wise periodically in first dimension, so we need to transpose
y_hat_mis <- t(t(y_hat_mis) * dimweights) # to get column-wise multiplication
}
# check whether we also match by external data
# -> if yes, match on partitioned data
if(is.null(match_partitions))
{
# now do nearest neighbor search, either by manhattan or euclidian norm
if(distmetric == "manhattan")
{
nearest_neighbors <- RANN.L1::nn2(y_hat_obs, query = y_hat_mis, k = donors)
}
else
{
nearest_neighbors <- RANN::nn2(y_hat_obs, query = y_hat_mis, k = donors)
}
# sample from the donor pool of each missing observation
idx <- apply(nearest_neighbors$nn.idx, 1, sample, size = 1)
}
else
{
# prepare index of soon-to-be imputed rows and current partitions of missing and observed data
idx <- 1:sum(wy)
obs_partition <- match_partitions$obs_partition
mis_partition <- match_partitions$mis_partition
for(i in seq_along(obs_partition))
{
# if number of donors is bigger than or equal to number of observed values, we can sample immediately
if(donors >= length(obs_partition[[i]]))
{
idx[mis_partition[[i]]] <- replicate(length(mis_partition[[i]]), sample(obs_partition[[i]],1))
message("Specified number of donors is bigger than number of observed values in current partition that donors can be taken from.")
}
else
{
# grab y_hat, y_obs of current partition subset
y_hat_obs_tmp <- y_hat_obs[obs_partition[[i]],]
y_hat_mis_tmp <- y_hat_mis[mis_partition[[i]],]
# now do nearest neighbor search, either by manhattan or euclidian norm
if(distmetric == "manhattan")
{
nearest_neighbors <- RANN.L1::nn2(y_hat_obs_tmp, query = y_hat_mis_tmp, k = donors)
}
else
{
nearest_neighbors <- RANN::nn2(y_hat_obs_tmp, query = y_hat_mis_tmp, k = donors)
}
# sample from the donor pool of each missing observation
# we want index of drawn match donor among all observed values
# -> we draw from indices of current partition which is itself a set of indices, so we have to apply drawn index on current subset of partition
idx[mis_partition[[i]]] <- obs_partition[[i]][apply(nearest_neighbors$nn.idx, 1, sample, size = 1)]
}
}
}
#return sample index, not value of y, as given y might contain dummy values
return(y[ry,][idx,])
}
###########################################################################################################################################################
# match_univariate
#
# Univariate analogon to match_multivariate-function above, having the same functionalities, but, by nature of univariate matching, does not have parameters
# for metric or weight.
#
# Parameters:
# - y: Vector that is to be imputed on.
# - y_hat: Vector containing all predictive means.
# - ry: Logical vector of length length(y) indicating the rows of observed data that are matched against.
# - wy: Logical vector of length length(y) indicating the rows in y for which imputations are created.
# - donors: Size of the donor pool that is computed and drawn from.
# - match_partition: List containing partitions of y[ry] and y[wy] that are based on the values of an additional column that has to be matched by which
# has been specified externally by the variable match_vars.
#
# Return value: Vector containing the imputed values for the missing data.
#
############################################################################################################################################################
match_univariate <- function(y, y_hat, ry, wy, donors, match_partitions)
{
#if y is categoric, it has to be binary, so reduce to integer level -1 to get to binary standard
ynum <- y
if (is.factor(y))
ynum <- as.integer(y) - 1
#initialize y_obs and y_hat_obs/mis
y_obs <- y[ry]
y_hat_obs <- y_hat[ry]
y_hat_mis <- y_hat[wy]
#if number of donors is bigger than or equal to number of observed values, we can sample immediately
if(donors >= sum(ry))
{
idx <- replicate(sum(wy), sample(1:sum(ry),1))
return(idx)
}
# normalize y_hat matrices by median norm of shared rows
med_norm <- median(abs(c(y_hat_mis, y_hat_obs)))
y_hat_mis <- med_norm * y_hat_mis
y_hat_obs <- med_norm * y_hat_obs
# add random epsilon error on normalized y_hat_mis to simulate a random tie break in RANN
eps_vec <- runif(length(y_hat_mis), min = 1e-15, max = 1e+15)
y_hat_mis <- y_hat_mis + eps_vec;
# check whether we also match by external data
# -> if yes, match on partitioned data
if(is.null(match_partitions))
{
# draw nearest neighbors
nearest_neighbors <- RANN::nn2(y_hat_obs, query = y_hat_mis, k = donors)
# sample from the donor pool of each missing observation
idx <- apply(nearest_neighbors$nn.idx, 1, sample, size = 1)
}
else
{
idx <- 1:sum(wy)
obs_partition <- match_partitions$obs_partition
mis_partition <- match_partitions$mis_partition
for(i in seq_along(obs_partition))
{
# if number of donors is bigger than or equal to number of observed values, we can sample immediately
if(donors >= length(obs_partition[[i]]))
{
idx[mis_partition[[i]]] <- replicate(length(mis_partition[[i]]), sample(obs_partition[[i]],1))
}
else
{
# grab y_hat, y_obs of current partition subset
y_hat_obs_tmp <- y_hat_obs[obs_partition[[i]]]
y_hat_mis_tmp <- y_hat_mis[mis_partition[[i]]]
# now do nearest neighbor search
nearest_neighbors <- RANN::nn2(y_hat_obs_tmp, query = y_hat_mis_tmp, k = donors)
# sample from the donor pool of each missing observation
# we want index of drawn match donor among all observed values
# -> we draw from indices of current partition which is itself a set of indices, so we have to apply drawn index on current subset of partition
idx[mis_partition[[i]]] <- obs_partition[[i]][apply(nearest_neighbors$nn.idx, 1, sample, size = 1)]
}
}
}
# return sample index, not value of y, as given y might contain dummy values
return(y_obs[idx])
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.