knitr::opts_chunk$set(echo=F, warning=F, message = F)
{ library(tidyverse) library(lubridate) library(Data.Quality.Reports) } modal.size <- function(values){ modal.size <- table(values) %>% sort() %>% tail(1) modal.size / length(values) } data <- params$path %>% readRDS() missing <- c(NA, "", " ", "NA", "nan", " / / ", "NULL", "null", "Null") missing.rates <- data %>% is.missing(missing) %>% apply(2, mean) miss.1 <- which(missing.rates == 1) %>% names() miss.gr0.8 <- which((missing.rates >= 0.8) & (missing.rates < 1))%>% names() miss.gr0.5 <- which((missing.rates >= 0.5) & (missing.rates < 0.8))%>% names() miss.gr0.2 <- which((missing.rates >= 0.2) & (missing.rates < 0.5))%>% names() miss.gr0.0 <- which((missing.rates > 0.0) & (missing.rates < 0.2))%>% names() miss.0 <- which((missing.rates == 0.0))%>% names() missing.cat <- c(length(miss.1), length(miss.gr0.8), length(miss.gr0.5), length(miss.gr0.2), length(miss.gr0.0), length(miss.0)) m <- max(missing.cat) lpad <- function(values, n, with=""){ c(values, rep(with, n - length(values))) }
Data with high levels of missing data (e.g. >80%) will be of little use analytically. Note - when performing sub-group analyses variables with high, but not 100%, missingness may be more valuable.
tibble( `Missing Group` = c("100% Missing", "[80%, 100%) Missing", "[50%, 80%) Missing", "[20%, 50%) Missing", "(0%, 20%) Missing", "0% Missing"), `Frequency` = c(length(miss.1), length(miss.gr0.8), length(miss.gr0.5), length(miss.gr0.2), length(miss.gr0.0), length(miss.0)) ) %>% knitr::kable(caption = "Frequnecy of variables by rate of missing values.") tibble( `100% Missing` = lpad(miss.1, m), `[80%, 100%) Missing` = lpad(miss.gr0.8, m), `[50%, 80%) Missing` = lpad(miss.gr0.5, m), `[20%, 50%) Missing` = lpad(miss.gr0.2, m), `(0%, 20%) Missing` = lpad(miss.gr0.0, m), `0% Missing` = lpad(miss.0, m) ) %>% knitr::kable(caption = "Variable IDs for each rate of missing values.")
modal.sizes <- data %>% apply(2, modal.size) modal.1 <- which(modal.sizes == 1) %>% names() modal.gr0.8 <- which((modal.sizes >= 0.8) & (modal.sizes < 1))%>% names() modal.gr0.5 <- which((modal.sizes >= 0.5) & (modal.sizes < 0.8))%>% names() modal.gr0.2 <- which((modal.sizes >= 0.2) & (modal.sizes < 0.5))%>% names() modal.gr0.0 <- which((modal.sizes > 0.0) & (modal.sizes < 0.2))%>% names() modal.0 <- which((modal.sizes == 0.0))%>% names() tibble( `Modal Group` = c("100% Modal", "[80%, 100%) Modal", "[50%, 80%) Modal", "[20%, 50%) Modal", "(0%, 20%) Modal", "0% Modal"), `Frequency` = c(length(modal.1), length(modal.gr0.8), length(modal.gr0.5), length(modal.gr0.2), length(modal.gr0.0), length(modal.0)) ) %>% knitr::kable(caption = "Frequnecy of variables by rate of missing values.")
Variables where the modal (most common) value dominates the variable have little impact on analytical models. Where the modal value makes up 100% of the variable, including in an analytical model is pointless.
m.modal <- c(length(modal.1), length(modal.gr0.8), length(modal.gr0.5), length(modal.gr0.2), length(modal.gr0.0), length(modal.0)) %>% max() tibble( `100% Modal` = lpad(modal.1, m.modal), `[80%, 100%) Modal` = lpad(modal.gr0.8, m.modal), `[50%, 80%) Modal` = lpad(modal.gr0.5, m.modal), `[20%, 50%) Modal` = lpad(modal.gr0.2, m.modal), `(0%, 20%) Modal` = lpad(modal.gr0.0, m.modal), `0% Modal` = lpad(modal.0, m.modal) ) %>% knitr::kable(caption = "Variable IDs for each rate of modal values.")
low.missing.high.modal <- which((missing.rates < 0.5) & (modal.sizes > 0.8)) %>% names() extract.modal <- function(values){ modal <- values %>% table() %>% sort() %>% tail(1) return(modal) rate <- round(modal / length(values)*100, 1) name <- names(modal) c(name, rate) } modal.groups <- data[,low.missing.high.modal] %>% lapply(extract.modal) group <- sapply(modal.groups, names) freq <- sapply(modal.groups, function(i) i) modal.perc <- tryCatch( round(100*freq / nrow(data), 1), error = function(i) c() ) tibble( Variable = low.missing.high.modal, `Modal Group` = group, `Modal %` = modal.perc ) %>% knitr::kable(caption = "Singular data (low missing rates)")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.