# Global knitr chunk options for this document: cache chunk results (with
# lazy loading of the cache turned off), hide warnings and messages, echo the
# code, and fix the figure resolution and size.
knitr::opts_chunk$set(
  cache = TRUE,
  cache.lazy = FALSE,
  warning = FALSE,
  message = FALSE,
  echo = TRUE,
  dpi = 180,
  fig.width = 8,
  fig.height = 5
)

# Install theme_minimal() as the default theme for the plots that follow.
# theme_minimal() by itself only constructs a theme object; theme_set() is
# what makes it the active default.
theme_set(theme_minimal())

# Default fill colour and transparency for the "rect" geom.
update_geom_defaults("rect", list(fill = "midnightblue", alpha = 0.8))
# Assign values for missing surveyor IDs.
# Load the dummy testing dataset from its .dta file, then print a skimr
# summary of it.
dummy_data <- create_abs_path("data-raw/dummy_main.dta") %>%
  read.dta()
skimr::skim(dummy_data)
# NOTE(review): bare symbol. Evaluated at top level this prints whatever is
# bound to `count_missing_values`; it may instead be a section heading carried
# over from the original document -- confirm which was intended.
count_missing_values
# Things to be improved and other notes:
# 1. The vector tidier (broom::tidy() on a numeric vector) is deprecated and
#    emits a warning; it is avoided in the example below.
#    NOTE(review): check whether count_missing_values() itself still uses it.
# 2. Add a plot from this function to autoplot. It should have options to
#    plot both percentage and counts.

# Choose the columns to run the missing-value check on; here "surveyor_id"
# and "name". Any number of column names can be listed.
variable_names <- c("surveyor_id", "name")

# Count the NA values in each selected column. drop = FALSE keeps a data
# frame even when a single column is selected, and vapply() guarantees
# exactly one integer per column.
missing_counts <- vapply(
  dummy_data[, variable_names, drop = FALSE],
  function(x) sum(is.na(x)),
  integer(1)
)
missing_counts

# The same counts as a two-column table, one row per variable.
foo <- dplyr::tibble(
  variable_name = names(missing_counts),
  num_missing_values = unname(missing_counts)
)

# Missing-value counts from count_missing_values(). The plot below reads the
# columns `variable_name` and `num_missing_values` from its result.
dummy_data_missing <- count_missing_values(dummy_data)

# Horizontal bar chart of missing values per variable. The x aesthetic is the
# variable name and y is the count; coord_flip() only swaps where they are
# drawn, so each axis label stays bound to its own aesthetic.
ggplot2::ggplot(
  data = dummy_data_missing,
  mapping = aes(x = variable_name, y = num_missing_values)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Variable(Column) Name",
    y = "Number of missing values",
    title = "Number of missing Values in variables (columns)"
  ) +
  theme(legend.title = element_blank())
# Things to be improved and other notes
#
# 1. How to use "group_by" variable (as function arguments) in writing
#    functions?
#    1.1: https://github.com/tidyverse/dplyr/issues/4734
#    1.2: https://stackoverflow.com/questions/21208801/group-by-multiple-columns-in-dplyr-using-string-vector-input

# Duplicate summary keyed on surveyor_id, sorted by ascending
# percent_of_tot_dup.
dummy_data_duplicates <- arrange(
  count_duplicates(dummy_data, "surveyor_id"),
  percent_of_tot_dup
)

# Shared canvas for both charts: surveyors on the x axis, ordered by
# decreasing percent_of_tot_dup, with that percentage on the y axis.
duplicates_base_plot <- ggplot2::ggplot(
  data = dummy_data_duplicates,
  mapping = aes(
    x = reorder(surveyor_id, -percent_of_tot_dup),
    y = percent_of_tot_dup
  )
) +
  labs(
    x = "Surveyor Id",
    y = "% of total Duplicates",
    title = "Percent (of total) duplicate Values in Surveyor Id"
  ) +
  theme(legend.title = element_blank())

# Bar-chart version.
duplicates_base_plot +
  geom_col()

# Point version, with point size also mapped to the percentage.
duplicates_base_plot +
  geom_point(mapping = aes(size = percent_of_tot_dup))
# Result of display_duplicates() for the surveyor_id key (presumably the
# duplicated rows themselves -- confirm against the function's documentation).
foo <- display_duplicates(dummy_data, c("surveyor_id"))

# For surveyors 101 to 105, count the rows at each age_c value, with one panel
# per gender (rows) and surveyor (columns). geom_bar() counts rows by default,
# so no stat override is needed.
foo %>%
  filter(surveyor_id %in% 101:105) %>%
  ggplot(mapping = aes(x = age_c)) +
  geom_bar() +
  facet_grid(gender ~ surveyor_id)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.