#' Clean, filter and dummy-code the factor columns of a dataset.
#'
#' Steps, in order:
#' 1. Re-encode every column as a factor in which blank strings and NA
#'    values are both treated as missing.
#' 2. If `quantile_probs_factor` is not NULL, filter the columns with
#'    `restrict_by_quantiles()`.
#' 3. Drop columns whose fraction of missing values is at or above
#'    `miss.cut`.
#' 4. Convert the remaining columns with `factors_to_indicators()`.
#'
#' @param data.fac Data frame holding the factor variables.
#' @param quantile_probs_factor Passed to `restrict_by_quantiles()` as
#'   `quantile_probs`; when NULL that step is skipped.
#' @param miss.cut Missingness threshold. A column is kept only when its
#'   fraction of missing values is strictly below this value.
#' @param verbose If TRUE, progress messages are printed with `cat()`.
#'
#' @return A list with elements
#'   `num_factors` (number of columns left after the missingness filter, or
#'   0L when nothing reached that step),
#'   `miss.fac` (the `missing_indicators` element returned by
#'   `factors_to_indicators()`, or NULL),
#'   `datafac.dum` (the `data` element returned by
#'   `factors_to_indicators()`, or NULL) and
#'   `data.fac` (the filtered data frame, or NULL when there were no
#'   columns to begin with or none survived the first filtering step).
process_factors = function(data.fac,
                           quantile_probs_factor,
                           miss.cut,
                           verbose = FALSE) {
  # Default return values, used when there is nothing to process.
  # TODO: improve names for these objects.
  num_factors = 0L
  miss.fac = NULL
  datafac.dum = NULL

  if (ncol(data.fac) > 0L) {
    if (verbose) cat("Processing factors. Start count:", ncol(data.fac), "\n")

    # Replace blank factor values with NA's.
    # num_cols is re-used below to report how many columns were dropped.
    num_cols = ncol(data.fac)
    for (i in seq_len(num_cols)) {
      # `[[` extracts a plain vector for data.frames and tibbles alike.
      # NA must be listed in `exclude` together with "": passing only ""
      # overrides the default (exclude = NA) and would turn existing NAs
      # into an explicit <NA> level that is.na() no longer reports.
      data.fac[[i]] = factor(as.character(data.fac[[i]]), exclude = c("", NA))
    }

    # Optional filtering step; skipped when no quantile probabilities given.
    if (!is.null(quantile_probs_factor)) {
      data.fac = restrict_by_quantiles(data.fac,
                                       quantile_probs = quantile_probs_factor,
                                       verbose = verbose)
    }

    dropped_cols = num_cols - ncol(data.fac)
    if (verbose) {
      if (dropped_cols > 0) {
        cat("Dropped", dropped_cols, "factors due to lack of variation.\n")
      } else {
        cat("No factors dropped due to lack of variation.\n")
      }
    }

    # Remove columns with missing data % greater than the threshold.
    # vapply() always yields a numeric vector, including for zero columns.
    sum_nas = vapply(data.fac, sum_na, numeric(1))

    if (length(sum_nas) == 0L) {
      if (verbose) {
        cat("All factors were dropped.\n")
      }
      data.fac = NULL
    } else {
      if (verbose) cat("Factors with missingness:", sum(sum_nas > 0L), "\n")

      miss_pct = sum_nas / nrow(data.fac)
      data.fac = data.fac[, miss_pct < miss.cut, drop = FALSE]

      if (verbose) {
        cat("Dropped", sum(miss_pct >= miss.cut), "factors due to the missingness threshold.\n")
      }

      # Save how many separate factors we have in this dataframe.
      num_factors = ncol(data.fac)

      # NOTE(review): if the threshold removed every column, a zero-column
      # data frame is passed on here -- confirm factors_to_indicators()
      # handles that case.
      factor_results = factors_to_indicators(data.fac, verbose = verbose)

      datafac.dum = factor_results$data

      # Here 1 = defined, 0 = missing.
      miss.fac = factor_results$missing_indicators

      if (verbose) {
        cat("End factor count:", num_factors, "Indicators:", ncol(datafac.dum),
            "Missing indicators:", ncol(miss.fac), "\n")
      }
    }
  } else {
    data.fac = NULL
  }

  list(
    num_factors = num_factors,
    miss.fac = miss.fac,
    datafac.dum = datafac.dum,
    data.fac = data.fac
  )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.