#' prepare_df_for_em
#' @description Change the position of categorical columns to adapt to the
#' em.mix input form.
#' @param df Original dataframe
#' @param col_cat Categorical columns index in \code{df}
#' @return Dataframe where the categorical columns are the first columns
prepare_df_for_em <- function(df, col_cat) {
allcol <- c(1:length(colnames(df)))
col_rest <- allcol[!allcol %in% col_cat]
return(df[, c(col_cat, col_rest)])
}
#' prob_vector_cat
#' @description Extract the probability vector from a tensor of probability.
#' For example, if there are only 2 categorical variables \code{(Y1, Y2)} with
#' \code{nlevels(Y1)=2, nlevels(Y2)=3}, then the tensor is a matrix of
#' probability. \code{tensor[i,j]} is the probability of \code{Y1=i, Y2=j}. If
#' \code{obs=(NA,2)}, then \code{prob_vector_cat} returns the probability vector
#' of \code{Y1} when \code{Y2=2} and the onehot probability vector of
#' \code{Y2=2}. If \code{obs=(NA,NA)}, then \code{prob_vector_cat} returns the
#' marginal probability vector of \code{Y1} and that of \code{Y2=2}.
#' @param obs One observation of all categorical variables.
#' @param tensor A tensor (multidimensional array) of probability, with
#' dimension \code{nlevels(Y1) x nlevels(Y2) x...x nlevels(Yk)} if there are k
#' categorical columns.
#' @param dict_name_cat A dictionary of categorical variable names and their
#' corresponding onehot name. For example, \code{Y7: Y7_1, Y7_2,...}.
#' @export
#' @return Probability vector for all categorical columns.
prob_vector_cat <- function(obs, tensor, dict_name_cat) {
cat_name <- unlist(dict_name_cat)
obs.disj <- data.frame(matrix(rep(0, length(cat_name)), nrow = 1))
colnames(obs.disj) <- cat_name
for (name in names(dict_name_cat)) {
if (!is.na(obs[[name]])) {
cat <- paste0(name, "_", obs[[name]])
obs.disj[[cat]] <- 1
obs[[name]] <- cat
}
}
if (any(is.na(obs))) {
# if there are NA(s)
obs2 <- as.list(obs)
obs2[is.na(obs)] <- TRUE
prob_matrix <- do.call(`[`, c(list(tensor), obs2))
if (is.null(dim(prob_matrix))) {
obs.disj[names(prob_matrix)] <- prob_matrix
} else {
dimt <- dim(prob_matrix)
dimt <- dimt[dimt != 1]
vec <- c()
for (t in c(1:length(dimt))) {
vec <- c(vec, apply(prob_matrix, t, sum))
}
obs.disj[names(vec)] <- vec
}
}
return(unlist(obs.disj))
}
#' em: modified EM Imputation with probability vector
#'
#' @description \code{em} is a em imputation function that returns categorical
#' columns results both in factor and in onehot probability vector form.
#' Please find the detailed documentation of \code{em.mix} and \code{imp.mix}
#' in the 'mix' package. Only the modifications are explained on this page.
#' After the estimation of parameter \code{pi} in \code{em.mix}, we change it
#' into a tensor (multidimensional array) and
#' extract the probability vector from this tensor with the help of function
#' \code{prob_vector_cat}.
#' @param df Data matrix with missing values.
#' @param col_cat Categorical columns index
#' @export
#' @return \code{ximp} imputed data matrix.
#' @return \code{ximp.disj} imputed data matrix of same type as 'ximp' for the
#' numeric columns.
#' For the categorical columns, the prediction of probability for each
#' category is shown in form of onehot probability vector.
em <- function(df, col_cat) {
is_norm_package_installed()
is_mix_package_installed()
exist_cat <- !all(c(0, col_cat) == c(0))
if (!exist_cat) { # if there are only numerical columns
s <- norm::prelim.norm(as.matrix(df)) # do preliminary manipulations
thetahat <- norm::em.norm(s) # ML estimate for unrestricted model
norm::rngseed(43) # set random number generator seed
ximp <- norm::imp.norm(s, thetahat, df) # impute under newtheta (one draw)
return(list(ximp = ximp, ximp.disj = ximp))
}
name_cat <- colnames(df)[col_cat]
# Deal with the problem that nlevels(df[[col]]) > length(unique(df[[col]]))
for (col in name_cat) {
df[[col]] <- factor(as.character(df[[col]]))
}
# remember the levels for each categorical column
dict_lev <- dict_level(df, col_cat)
# preserve colnames for ximp.disj
dummy <- dummyVars(" ~ .", data = df, sep = "_")
col_names.disj <- colnames(data.frame(predict(dummy, newdata = df)))
# represent the factor columns with their ordinal levels
df <- factor_ordinal_encode(df, col_cat)
# Create dict_cat with categroical columns
dict_name_cat <- dict_onehot(df, col_cat)
df.cat <- df[, col_cat, drop = FALSE]
# prepare for em and imputation
# The categorical columns must be ordinal encoded and they must be the first
# columns of the dataframe
df_for_em <- prepare_df_for_em(df, col_cat)
# do preliminary manipulations
s <- mix::prelim.mix(df_for_em, length(col_cat))
thetahat <- mix::em.mix(s) # ML estimate for unrestricted model
mix::rngseed(43) # set random number generator seed
# impute under newtheta (one draw)
ximp <- mix::imp.mix(s, thetahat, df_for_em)
ximp <- factor_encode(data.frame(ximp), c(1:length(col_cat)))
# rearrange columns
cols <- colnames(df)
ximp <- ximp[cols]
# create disjonctive table
dims <- c()
dimnames <- list()
dummy <- dummyVars(" ~ .", data = ximp, sep = "_")
ximp.disj <- data.frame(predict(dummy, newdata = ximp))
# construction of tensor pi
i <- 1
for (name in names(dict_name_cat)) {
# ximp.disj[[name]] <- df_with_mv[[name]]
lev <- dict_name_cat[[name]]
dims <- c(dims, length(lev))
dimnames[[i]] <- lev
i <- i + 1
}
pi <- thetahat$pi
tensor_pi <- array(pi, dims, dimnames)
# ximp.disj onehot probability
cat_name <- unlist(dict_name_cat)
ximp.disj[cat_name] <- t(apply(as.matrix(df.cat), 1,
FUN = function(x) prob_vector_cat(x, tensor_pi, dict_name_cat)
))
# Add: change back to original levels
for (col in name_cat) {
levels(ximp[[col]]) <- dict_lev[[col]]
}
colnames(ximp.disj) <- col_names.disj
return(list(ximp = data.frame(ximp), ximp.disj = data.frame(ximp.disj)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.