In emilelatour/lamisc: Latour's Little Helpers

#### Global chunk options -----------------------------

# Chunk defaults applied to every code chunk in this document.
knitr::opts_chunk$set(
  # Show the source code of each chunk in the final output
  echo = TRUE,
  # Emit chunk results as-is rather than wrapped in an output block
  results = "asis",
  # Trailing placeholder argument; presumably it lets every option line above
  # end with a comma -- confirm before removing
  NULL
)

#### Other options --------------------------------

## Session-wide display options ----------------
# width:              line width used when printing output from code chunks.
# scipen:             set very high to turn off scientific notation; inspect
#                     the current value with getOption("scipen").
# knitr.table.format: default table format for kable(). If it is not defined
#                     here, `format = "html"` has to be passed to every kable()
#                     call. Use "html" when knitting to HTML and switch to
#                     "latex" when knitting to PDF.
options(
  width              = 120,
  scipen             = 999,
  knitr.table.format = "html"
)


#### Packages --------------------------------------
# This is where we load in all the packages we plan to use

## CRAN mirror used when packages are installed ----------------
options(
  repos = c(CRAN = "https://cran.rstudio.com")
)

## universally useful packages ----------------
# if (!require("pacman")) {install.packages("pacman")}
# if (!require("devtools")) {install.packages("devtools")}


## Package version ---------------- 

# As of 2019-10-18, I've found the GitHub version to be more reliable and
# recommend installing from there.

# https://github.com/dcomtois/summarytools

# install_github("dcomtois/summarytools")


## Attach the packages used in this document ----------------
# Packages are passed as a character vector and are listed in the order they
# should be attached. `install = FALSE` is passed so that pacman does not try
# to install a package that is missing.

pacman::p_load(
  char = c(
    "tidyverse",     # ggplot2, dplyr, tidyr, readr, purrr, tibble, stringr,
                     # and forcats
    "broom",         # tidy(), glance(), augment()
    "flextable",     # tables for reporting and publications
    "fs",            # cross-platform file system operations
    "glue",          # glue strings to data
    "here",          # build paths relative to the project root
    "janitor",       # helpers for working with dirty data
    "mice",          # multiple imputation (Fully Conditional Specification)
    "naniar",        # structures, summaries, and plots for missing data
    "officer",       # used by flextable
    "readxl",        # read Excel files
    "summarytools"   # quick, neat data summaries
  ),
  install = FALSE
)


#### Other packages -----------------------------

#### Some resources ---------------------------

#### Themes and colors -------------------------------

#### Load fonts --------------------------------
# Helpers that are handy when run interactively:
#   extrafont::font_import()  # import the fonts installed on the system
#   extrafont::fonts()        # vector of font family names
#   extrafont::fonttable()    # the entire font table
# Load the fonts for the "pdf" device, without printing status messages.
extrafont::loadfonts(quiet = TRUE, device = "pdf")

#### Summarytools options --------------------------------
# Package-wide defaults for summarytools output in this document.

summarytools::st_options(
  # Bootstrap CSS is already part of the theme, so it is not needed again
  bootstrap.css = FALSE,
  # One of the essential settings for R Markdown output
  plain.ascii = FALSE,
  # Same as above: essential for R Markdown output
  style = "rmarkdown",
  # Suppress messages about temporary files
  dfSummary.silent = TRUE,
  # Keeps dfSummary() results narrow enough
  dfSummary.varnumbers = FALSE,
  # Keep the results minimalistic
  footnote = NA
)

# summarytools::st_css()

Load the data

# Use the `riskfactors` example dataset from naniar as the data to explore.
# It is referenced through the package namespace so the lookup does not depend
# on naniar being attached to the search path.
# Equivalent alternative: data("riskfactors", package = "naniar")
data <- naniar::riskfactors

Data frame summary

# Build the data frame summary of `data` and print it with the "render"
# method. The "grid" style, a graph magnification of 0.75, and
# `valid.col = FALSE` are set for this summary only.
summarytools::dfSummary(
  data,
  style        = "grid",
  graph.magnif = 0.75,
  valid.col    = FALSE
) |>
  print(method = "render")

Explore missingness

Percent of cases with missing or complete values

# One-row table of case-level (row-level) missingness in `data`:
#   n_rows        - total number of rows
#   n_missing     - rows with at least one missing value
#   prop_missing  - n_missing as a proportion of n_rows
#   n_complete    - rows with no missing values
#   prop_complete - n_complete as a proportion of n_rows
# The table is built directly from `data` rather than by piping `data` into
# summarise() and then referring to the global `data` again inside it; that
# mixed two sources for the same object and would silently pick up a column
# named `data` if one existed.
tibble::tibble(
  n_rows        = nrow(data),
  n_missing     = naniar::n_case_miss(data),
  prop_missing  = n_missing / n_rows,
  n_complete    = naniar::n_case_complete(data),
  prop_complete = n_complete / n_rows
) |>
  flextable::flextable() |>
  flextable::fontsize(part = "body", size = 12) |>
  flextable::fontsize(part = "header", size = 14)

Missingness plot

Missingness plot restricted to the variables that have at least one missing value.

# Plot the missingness pattern, keeping only the columns that contain at least
# one missing value. `select(where(...))` replaces the superseded `select_if()`.
# Rows are clustered (`cluster = TRUE`); columns are not re-sorted by
# missingness (`sort_miss = FALSE`).
# https://cran.r-project.org/web/packages/naniar/vignettes/getting-started-w-naniar.html

data |>
  dplyr::select(dplyr::where(naniar::any_na)) |>
  naniar::vis_miss(cluster = TRUE, sort_miss = FALSE)

Number / percent missing for each variable

Summary of the number of columns with any missing values.

# One-row table of variable-level (column-level) missingness in `data`:
#   n_var         - total number of variables
#   n_missing     - variables with at least one missing value
#   prop_missing  - n_missing as a proportion of n_var
#   n_complete    - variables with no missing values
#   prop_complete - n_complete as a proportion of n_var
data |>
  # Count the missing values in every column (one row, one column per variable)
  dplyr::summarise(
    dplyr::across(dplyr::everything(), \(x) sum(is.na(x)))
  ) |>
  # Reshape to one row per variable so the counts can be summarised
  tidyr::pivot_longer(
    cols      = dplyr::everything(),
    names_to  = "vars",
    values_to = "n_miss"
  ) |>
  dplyr::summarise(
    n_var         = dplyr::n(),
    n_missing     = sum(n_miss > 0),
    prop_missing  = n_missing / n_var,
    n_complete    = sum(n_miss == 0),
    prop_complete = n_complete / n_var
  ) |>
  flextable::flextable() |>
  flextable::fontsize(part = "body", size = 12) |>
  flextable::fontsize(part = "header", size = 14)

Summary, for each variable, of the number and percent of missing values. By default, the variables are ordered from most to fewest missing values.

# Per-variable missingness summary from naniar, shown as a flextable with a
# 12-point body and a 14-point header.
# To limit the table to the first 20 rows, add `dplyr::slice(1:20)` to the
# pipeline right after miss_var_summary().
naniar::miss_var_summary(data) |>
  flextable::flextable() |>
  flextable::fontsize(size = 12, part = "body") |>
  flextable::fontsize(size = 14, part = "header")

Missing cases

The number of cases with 0, 1, 2, ..., up to n missing values, and the proportion of all cases that each of those groups makes up.

# Case-level missingness table from naniar, shown as a flextable with a
# 12-point body and a 14-point header.
naniar::miss_case_table(data) |>
  flextable::flextable() |>
  flextable::fontsize(size = 12, part = "body") |>
  flextable::fontsize(size = 14, part = "header")