# Global chunk options for this document: cache results (non-lazily), hide
# warnings and messages, echo code, and render figures at 8 x 5 in, 180 dpi.
knitr::opts_chunk$set(
  cache = TRUE, cache.lazy = FALSE, warning = FALSE,
  message = FALSE, echo = TRUE, dpi = 180,
  fig.width = 8, fig.height = 5
)

# Register the minimal theme as the session default. A bare `theme_minimal()`
# call only constructs a theme object; it does not change the default theme.
theme_set(theme_minimal())
# Default fill colour and transparency for the "rect" geom.
update_geom_defaults("rect", list(fill = "midnightblue", alpha = 0.8))
# assign values for missing surveyor IDs.
#

Read in sample dataset

# Resolve the path to the sample .dta file, read it, and print a skim summary.

dummy_data <- "data-raw/dummy_main.dta" %>%
  create_abs_path() %>%
  read.dta()
skimr::skim(dummy_data)

Testing area for package functions (One section for each function)

 

count_missing_values

# Things to be improved and other notes:

# 1. `broom::tidy()` on a numeric vector emits a "'tidy.numeric' is deprecated" warning: remove this.
# 2. Add a plot from this function to autoplot. It should have options to 
#    plot both percentage and counts.


# Count missing values for a chosen set of columns.
# Here we check "surveyor_id" and "name"; list as many columns as needed.

variable_names <- c("surveyor_id", "name")

# `drop = FALSE` keeps a data frame even when only one column is selected, so
# the NA count is always computed per column rather than per element.
missing_counts <- vapply(
  dummy_data[, variable_names, drop = FALSE],
  function(column) sum(is.na(column)),
  integer(1)
)

# Build the summary table directly: calling `broom::tidy()` on a plain named
# vector is deprecated and emits a warning.
foo <- dplyr::tibble(
  variable_name = names(missing_counts),
  num_missing_values = unname(missing_counts)
)



# Per-column NA counts for the selected variables, as a named integer vector.
vapply(
  dummy_data[, variable_names],
  function(column) sum(is.na(column)),
  integer(1)
)

# Run the package's count_missing_values() on the full data set; the plot below
# uses its `variable_name` and `num_missing_values` columns.
dummy_data_missing <- count_missing_values(dummy_data)

# Horizontal bar chart of missing-value counts per variable.
# Axis labels follow the aesthetics, not the screen position: after
# `coord_flip()` the x aesthetic (variable name) is drawn on the vertical axis
# and the y aesthetic (count) on the horizontal one. Bar length already encodes
# the count, so no `size` mapping (and no legend) is needed.
ggplot2::ggplot(
  data = dummy_data_missing,
  mapping = aes(x = variable_name, y = num_missing_values)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Variable(Column) Name", y = "Number of missing values",
    title = "Number of missing Values in variables (columns)"
  )
# Things to be improved and other notes
#
# 1. How to use "group_by" variable (as function arguments) in writing functions?
#    1.1: https://github.com/tidyverse/dplyr/issues/4734
#    1.2: https://stackoverflow.com/questions/21208801/group-by-multiple-columns-in-dplyr-using-string-vector-input

# Run count_duplicates() on surveyor_id, then sort the result by ascending
# percent_of_tot_dup.
dummy_data_duplicates <- arrange(
  count_duplicates(dummy_data, c("surveyor_id")),
  percent_of_tot_dup
)

# Bar chart of percent_of_tot_dup per surveyor_id, with surveyors ordered from
# the largest to the smallest percentage.
ggplot2::ggplot(
  dummy_data_duplicates,
  aes(
    x = reorder(surveyor_id, -percent_of_tot_dup),
    y = percent_of_tot_dup
  )
) +
  geom_col() +
  theme(legend.title = element_blank()) +
  labs(
    title = "Percent (of total) duplicate Values in Surveyor Id",
    x = "Surveyor Id",
    y = "% of total Duplicates"
  )

# Point chart of percent_of_tot_dup per surveyor_id (largest first); point size
# is also mapped to percent_of_tot_dup, and the size legend has no title.
ggplot2::ggplot(
  dummy_data_duplicates,
  aes(
    x = reorder(surveyor_id, -percent_of_tot_dup),
    y = percent_of_tot_dup
  )
) +
  geom_point(aes(size = percent_of_tot_dup)) +
  theme(legend.title = element_blank()) +
  labs(
    title = "Percent (of total) duplicate Values in Surveyor Id",
    x = "Surveyor Id",
    y = "% of total Duplicates"
  )
# Run the package's display_duplicates() on surveyor_id (presumably returns the
# duplicated rows themselves; confirm against the function's documentation).
foo <- display_duplicates(dummy_data, c("surveyor_id"))

# Counts of age_c for surveyors 101 to 105, faceted by gender (rows) and
# surveyor_id (columns). `geom_bar()` is the direct form of
# `geom_histogram(stat = "count")`, which overrides the histogram's binning
# stat and triggers an "unknown parameters" warning.
foo %>%
  filter(surveyor_id %in% 101:105) %>%
  ggplot(mapping = aes(x = age_c)) +
  geom_bar() +
  facet_grid(gender ~ surveyor_id)


AarshBatra/econR documentation built on Dec. 17, 2021, 6:45 a.m.