R/aggregate.R

Defines functions plot_hist_numeric_group_impl plot_hist_numeric.grouped_df plot_hist_numeric_impl plot_hist_numeric.data.frame plot_hist_numeric plot_box_numeric_group_impl plot_box_numeric.grouped_df plot_box_numeric_impl plot_box_numeric.data.frame plot_box_numeric plot_qq_numeric_group_impl plot_qq_numeric.grouped_df plot_qq_numeric_impl plot_qq_numeric.data.frame plot_qq_numeric plot_bar_category_group_impl plot_bar_category.grouped_df plot_bar_category_impl plot_bar_category.data.frame plot_bar_category

Documented in plot_bar_category plot_bar_category.data.frame plot_bar_category.grouped_df plot_box_numeric plot_box_numeric.data.frame plot_box_numeric.grouped_df plot_hist_numeric plot_hist_numeric.data.frame plot_hist_numeric.grouped_df plot_qq_numeric plot_qq_numeric.data.frame plot_qq_numeric.grouped_df

#' @rdname plot_bar_category.data.frame
#' @export
plot_bar_category <- function(.data, ...) {
  UseMethod("plot_bar_category", .data)
}

#' Plot bar chart of categorical variables 
#'
#' @description The plot_bar_category() to visualizes the distribution of 
#' categorical data by level or relationship to specific numerical data by level.
#'
#' @details The distribution of categorical variables can be understood by 
#' comparing the frequency of each level. The frequency table helps with this. 
#' As a visualization method, a bar graph can help you understand 
#' the distribution of categorical data more easily than a frequency table.
#'
#' The base_family is selected from "Roboto Condensed", "Liberation Sans Narrow",
#' "NanumSquare", "Noto Sans Korean". If you want to use a different font, 
#' use it after loading the Google font with import_google_font().
#' 
#' @param .data a data.frame or a \code{\link{tbl_df}} or a \code{\link{grouped_df}}.
#' @param \dots one or more unquoted expressions separated by commas.
#' You can treat variable names like they are positions.
#' Positive values select variables; negative values to drop variables.
#' If the first expression is negative, plot_bar_category() will automatically
#' start with all variables.
#' These arguments are automatically quoted and evaluated in a context where
#' column names represent column positions.
#' They support unquoting and splicing.
#' @param top an integer. Specifies the upper top rank to extract.
#' Default is 10.
#' @param add_character logical. Decide whether to include text variables in the
#' diagnosis of categorical data. The default value is TRUE, which also includes character variables.
#' @param title character. a main title for the plot.
#' @param each logical. Specifies whether to draw multiple plots on one screen. 
#' The default is FALSE, which draws multiple plots on one screen.
#' @param typographic logical. Whether to apply focuses on typographic elements to ggplot2 visualization. 
#' The default is TRUE. if TRUE provides a base theme that focuses on typographic elements using hrbrthemes package.
#' @param base_family character. The name of the base font family to use 
#' for the visualization. If not specified, the font defined in dlookr is applied. (See details)
#' 
#' 
#' @examples
#' # Generate data for the example
#' heartfailure2 <- heartfailure
#' heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA
#' heartfailure2[sample(seq(NROW(heartfailure2)), 5), "smoking"] <- NA
#' 
#' set.seed(123)
#' heartfailure2$test <- sample(LETTERS[1:15], 299, replace = TRUE)
#' heartfailure2$test[1:30] <- NA
#' 
#' # Visualization of all numerical variables
#' plot_bar_category(heartfailure2)
#' 
#' # Select the variable to diagnose
#' plot_bar_category(heartfailure2, "test", "smoking")
#' 
#' # Visualize the each plots
#' # Visualize just 7 levels of top frequency
#' # Visualize only factor, not character 
#' plot_bar_category(heartfailure2, each = TRUE, top = 7, add_character = FALSE)
#' 
#' # Not allow typographic argument
#' plot_bar_category(heartfailure2, typographic = FALSE)
#' 
#' # Using pipes ---------------------------------
#' library(dplyr)
#' 
#' # Using groupd_df  ------------------------------
#' heartfailure2 %>% 
#'   group_by(death_event) %>% 
#'   plot_bar_category(top = 5)
#'    
#' @method plot_bar_category data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @export
#' @rdname plot_bar_category.data.frame 
plot_bar_category.data.frame <- function(.data, ..., top = 10, add_character = TRUE,
                                         title = "Frequency by levels of category",
                                         each = FALSE, typographic = TRUE,
                                         base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_bar_category_impl(.data, vars, top, add_character, title, each, 
                         typographic, base_family)
}

#' @import ggplot2
#' @import hrbrthemes
#' @import dplyr
#' @importFrom purrr map
#' @importFrom gridExtra arrangeGrob grid.arrange
#' @importFrom grid textGrob gpar
#' @importFrom tibble is_tibble as_tibble
plot_bar_category_impl <- function(df, vars, top, add_character, title, each, 
                                   typographic, base_family) {
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- tibble::as_tibble(df)
  
  if (add_character)
    nm_factor <- find_class(df[, vars], type = "categorical2", index = FALSE)
  else
    nm_factor <- find_class(df[, vars], type = "categorical", index = FALSE)
  
  if (length(nm_factor) == 0) {
    message("There is no categorical variable in the data or variable list.\n")
    return(NULL)
  }
  
  get_tally <- function(data, var, top = 10) {
    raws <- data %>% 
      select(levels = var) %>% 
      group_by(levels) %>% 
      tally() %>% 
      mutate(na_flag = is.na(levels)) %>% 
      mutate(menas = round(sum(n) / (length(n) - sum(na_flag)))) %>%     
      arrange(na_flag, desc(n)) %>%     
      mutate(levels = factor(levels, levels = levels)) %>% 
      mutate(rank = row_number())
    
    tops <- raws %>% 
      filter(rank <= top) %>% 
      filter(!na_flag) 
    
    others <- raws %>% 
      filter(rank > top) %>% 
      filter(!na_flag) %>% 
      summarise(levels = "<Other>",
                n = sum(n),
                na_flag = FALSE,
                menas = 0,
                rank = top + 1) %>% 
      filter(n > 0)
    
    missing <- raws %>% 
      filter(na_flag) %>% 
      mutate(rank = top + 2)
    
    rbind(tops, others, missing) %>% 
      mutate(flag = ifelse(na_flag, "Missing", 
                           ifelse(menas == 0, "OtherTop", "Tops")))
  }
  
  if (typographic) {
    def_colors <- c("Tops" = "#d18975", "OtherTop" = "#8fd175",
                    "Missing" = "grey50")
  } else {
    def_colors <- c("Tops" = "#ff7f0e", "OtherTop" = "#1f77b4",
                    "Missing" = "grey50")    
  }
  
  plist <- purrr::map(nm_factor, function(x) {
    return(data.frame(variables = x, 
                      get_tally(df, x, top)))
  }) %>% 
    lapply(function(data) {
      if (each) {
        ylab <- "Frequency"
      } else {
        ylab <- ""
        title <- ""
      }
      
      na_flag <- any(is.na(data$levels))
      reverse_x <- rev(levels(data$levels))
      reverse_x <- reverse_x[reverse_x %in% data$levels]
      
      if (na_flag) {
        reverse_x <- c(NA, reverse_x)
      }
      
      p <- data %>% 
        arrange(rank) %>% 
        ggplot(aes(x = levels, y = n)) +
        geom_bar(aes(fill = flag), stat = "identity") +
        scale_colour_manual(values = def_colors,
                            aesthetics = c("colour", "fill")) +
        geom_hline(yintercept = max(data$menas), linetype = "dashed",
                   col = "blue") +
        facet_wrap(~ variables) + 
        scale_x_discrete(limits = reverse_x) + 
        coord_flip() +   
        labs(title = title, x = "", y = ylab) +     
        theme_grey(base_family = base_family) +         
        theme(legend.position = "none",
              axis.title.y = element_blank())
      
      if (typographic) {
        p <- p + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12))
        
        if (!each) {
          p <- p +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_text(size = 10),
                  plot.margin = margin(0, 10, 0, 10)
            )
        }
      }
      
      p
    })
  
  if (each) {
    for (i in seq(plist))
      do.call("print", plist[i])
  } else {
    n <- length(plist)
    n_row <- floor(sqrt(n))
    
    if (typographic) {
      if (is.null(base_family)) {
        base_family <- "Roboto Condensed"  
      }
      
      title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                     fontsize = 18, font = 2),
                              x = unit(0.075, "npc"), just = "left")
    }
    
    suppressWarnings(gridExtra::grid.arrange(
      gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title))
  }  
}


#' @method plot_bar_category grouped_df
#' @rdname plot_bar_category.data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble is_tibble
#' @export
plot_bar_category.grouped_df <- function(.data, ..., top = 10, add_character = TRUE,
                                         title = "Frequency by levels of category",
                                         each = FALSE, typographic = TRUE,
                                         base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_bar_category_group_impl(.data, vars, top, add_character, title, each, 
                               typographic, base_family)
}

plot_bar_category_group_impl <- function(df, vars, top, add_character, title, each, 
                                         typographic, base_family) {
  if (utils::packageVersion("dplyr") >= "0.8.0") {
    group_key <- setdiff(attr(df, "groups") %>% names(), ".rows")
    n_levels <- attr(df, "group") %>% nrow() 
  } else {
    group_key <- attr(df, "vars")
    n_levels <- attr(df, "group") %>% length() 
  }
  
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  if (add_character)
    nm_factor <- find_class(df[, vars], type = "categorical2", index = FALSE)
  else
    nm_factor <- find_class(df[, vars], type = "categorical", index = FALSE)
  
  nm_factor <- setdiff(nm_factor, group_key)
  
  if (length(nm_factor) == 0) {
    message("There is no categorical variable in the data or variable list.\n")
    return(NULL)
  }
  
  get_tally_group <- function(data, group_key, var, top = 10) {
    new_group <- c(group_key, var)
    
    top_levels <- data[, var] %>% 
      table %>%
      sort(decreasing = TRUE) %>% 
      "["(seq(top)) %>% 
      names()
    
    raws <- data %>% 
      select(group_key, var) %>% 
      group_by_at(new_group) %>% 
      tally() %>% 
      rename(levels = var) %>% 
      mutate(na_flag = is.na(levels)) %>% 
      mutate(menas = round(sum(n) / (length(n) - sum(na_flag)))) %>%     
      arrange(na_flag, desc(n)) %>%     
      mutate(levels = factor(levels, levels = levels)) %>% 
      mutate(rank = match(levels, top_levels))
    
    tops <- raws %>% 
      filter(levels %in% top_levels) %>% 
      filter(!na_flag)
    
    others <- raws %>% 
      filter(!levels %in% top_levels) %>% 
      filter(!na_flag) %>% 
      summarise(levels = "<Other>",
                n = sum(n),
                na_flag = FALSE,
                menas = NA,
                rank = top + 1,
                .groups = "drop") %>% 
      filter(n > 0)
    
    missing <- raws %>% 
      filter(na_flag) %>% 
      mutate(rank = top + 2)
    
    suppressWarnings(bind_rows(tops, others, missing)) %>% 
      mutate(flag = ifelse(na_flag, "Missing", 
                           ifelse(is.na(menas), "OtherTop", "Tops")))
  }
  
  if (typographic) {
    def_colors <- c("Tops" = "#d18975", "OtherTop" = "#8fd175",
                    "Missing" = "grey50")
  } else {
    def_colors <- c("Tops" = "#ff7f0e", "OtherTop" = "#1f77b4",
                    "Missing" = "grey50")    
  }
  
  plist <- purrr::map(nm_factor, function(x) {
    return(data.frame(variables = x, 
                      get_tally_group(df, group_key, x, top)))
  }) %>% 
    lapply(function(data) {
      n_col <- data %>% 
        filter(!is.na(levels)) %>% 
        select(levels) %>% 
        unique() %>% 
        nrow()
      
      if (each) {
        ylab <- "Frequency"
        title <- paste(title, "(", 
                       paste(group_key, unique(data$variables), sep = " by "), ")")
      } else {
        ylab <- ""
        title <- ""
      }
      
      reverse_x <- data %>% 
        filter(!is.na(levels)) %>% 
        filter(!levels %in% "<Other>") %>%   
        group_by(levels) %>% 
        summarise(freq = sum(n)) %>% 
        arrange(desc(freq)) %>% 
        select(levels) %>% 
        pull()
      
      na_flag <- any(is.na(data$levels))
      other_flag <- any(data$levels %in% "<Other>")
      otherwise <- c(NA, "<Other>")[c(na_flag, other_flag)]
      
      reverse_x <- c(otherwise, rev(reverse_x))
      
      center <- round(nrow(df) / (n_levels * n_col))
      
      p <- ggplot(data, aes(x = levels, y = n)) +
        geom_bar(aes(fill = flag), stat = "identity") +
        scale_colour_manual(values = def_colors,
                            aesthetics = c("colour", "fill")) +
        geom_hline(yintercept = center, linetype = "dashed",
                   col = "blue") +
        facet_grid(reformulate("variables", group_key)) + 
        scale_x_discrete(limits = reverse_x) + 
        coord_flip() +
        labs(title = title, x = "", y = ylab) +     
        theme_grey(base_family = base_family) +         
        theme(legend.position = "none",
              axis.title.y = element_blank())
      
      if (typographic) {
        p <- p + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12))
        
        if (!each) {
          p <- p +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_text(size = 10),
                  plot.margin = margin(0, 10, 0, 10)
            )
        }
      }
      
      p
    })
  
  if (each) {
    for (i in seq(plist))
      do.call("print", plist[i])
  } else {
    n <- length(plist)
    n_row <- floor(sqrt(n))
    
    #do.call("grid.arrange", c(plist, nrow = n_row, top = title, 
    #                          right = group_key))
    
    if (typographic) {
      if (is.null(base_family)) {
        base_family <- "Roboto Condensed"     
      }
      
      title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                     fontsize = 18, font = 2),
                              x = unit(0.075, "npc"), just = "left")
    }
    
    suppressWarnings(gridExtra::grid.arrange(
      gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title, 
      right = grid::textGrob(group_key, rot = 270,
                             gp = gpar(fontfamily = base_family))))
  }
}


##----------------------------------------------------------------------------------

#' @rdname plot_qq_numeric.data.frame
#' @export
plot_qq_numeric <- function(.data, ...) {
  UseMethod("plot_qq_numeric", .data)
}

#' Plot Q-Q plot of numerical variables 
#'
#' @description The plot_qq_numeric() to visualizes the Q-Q plot of numeric data or 
#' relationship to specific categorical data.
#' 
#' @details The The Q-Q plot helps determine whether the distribution of a numeric variable 
#' is normally distributed. plot_qq_numeric() shows Q-Q plots of several numeric variables 
#' on one screen. This function can also display a Q-Q plot for each level of a specific 
#' categorical variable.
#'
#' The base_family is selected from "Roboto Condensed", "Liberation Sans Narrow",
#' "NanumSquare", "Noto Sans Korean". If you want to use a different font, 
#' use it after loading the Google font with import_google_font().
#'  
#' @param .data data.frame or a \code{\link{tbl_df}} or a \code{\link{grouped_df}}.
#' @param \dots one or more unquoted expressions separated by commas.
#' You can treat variable names like they are positions.
#' Positive values select variables; negative values to drop variables.
#' If the first expression is negative, plot_qq_numeric() will automatically
#' start with all variables.
#' These arguments are automatically quoted and evaluated in a context where
#' column names represent column positions.
#' They support unquoting and splicing.
#'
#' @param col_point character. a color of points in Q-Q plot.
#' @param col_line character. a color of line in Q-Q plot.
#' @param title character. a main title for the plot. 
#' @param each logical. Specifies whether to draw multiple plots on one screen. 
#' The default is FALSE, which draws multiple plots on one screen.
#' @param typographic logical. Whether to apply focuses on typographic elements to ggplot2 visualization. 
#' The default is TRUE. if TRUE provides a base theme that focuses on typographic elements using hrbrthemes package.
#' @param base_family character. The name of the base font family to use 
#' for the visualization. If not specified, the font defined in dlookr is applied. (See details)
#' @examples
#' \donttest{
#' # Visualization of all numerical variables
#' plot_qq_numeric(heartfailure)
#' 
#' # Select the variable to diagnose
#' plot_qq_numeric(heartfailure, "age", "time")
#' plot_qq_numeric(heartfailure, -age, -time)
#' 
#' # Not allow the typographic elements
#' plot_qq_numeric(heartfailure, "age", typographic = FALSE)
#' 
#' # Using pipes ---------------------------------
#' library(dplyr)
#' 
#' # Plot of all numerical variables
#' heartfailure %>%
#'   plot_qq_numeric()
#' 
#' # Using groupd_df  ------------------------------
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_qq_numeric()
#' 
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_qq_numeric(each = TRUE)  
#' }
#' 
#' @method plot_qq_numeric data.frame
#' @import ggplot2
#' @import hrbrthemes
#' @import dplyr
#' @importFrom purrr map
#' @importFrom gridExtra grid.arrange
#' @importFrom grid textGrob gpar
#' @export
#' @rdname plot_qq_numeric.data.frame 
#' 
plot_qq_numeric.data.frame <- function(.data, ..., col_point = "steelblue", col_line = "black",
                                       title = "Q-Q plot by numerical variables",
                                       each = FALSE, typographic = TRUE,
                                       base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_qq_numeric_impl(.data, vars, col_point, col_line, title, each, 
                       typographic, base_family)
}

plot_qq_numeric_impl <- function(df, vars, col_point, col_line, title, each, 
                                 typographic, base_family) {
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "Theoretical" 
        ylab <- "Sample"
      } else {
        xlab <- ""
        ylab <- ""
        title <- ""
      }
      
      p_qq <- df %>% 
        mutate(variables = var) %>% 
        ggplot(aes(sample = !!sym(var))) +
        stat_qq(col = col_point) + 
        stat_qq_line(col = col_line) +
        facet_wrap(~ variables) + 
        labs(title = title, x = xlab, y = ylab) +      
        theme_grey(base_family = base_family) +         
        theme(legend.position = "none")
      
      if (typographic) {
        p_qq <- p_qq + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12),
                axis.title.y = element_text(size = 12))
        
        if (!each) {
          p_qq <- p_qq +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_text(size = 10),
                  plot.margin = margin(0, 10, 0, 10)
            )
        }
      }
      
      p_qq
    })
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title))
    }
  }) # End of suppressWarnings()
}

#' @method plot_qq_numeric grouped_df
#' @rdname plot_qq_numeric.data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble is_tibble
#' @export
#' 
plot_qq_numeric.grouped_df <- function(.data, ..., col_point = "steelblue", col_line = "black",
                                       title = "Q-Q plot by numerical variables",
                                       each = FALSE, typographic = TRUE,
                                       base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_qq_numeric_group_impl(.data, vars, col_point, col_line, title, each, 
                             typographic, base_family)
}

#' @import ggplot2
#' @import hrbrthemes
#' @import dplyr
#' @importFrom purrr map
#' @importFrom gridExtra grid.arrange
#' @importFrom grid textGrob gpar
plot_qq_numeric_group_impl <- function(df, vars, col_point, col_line, title, 
                                       each, typographic, base_family) {
  if (utils::packageVersion("dplyr") >= "0.8.0") {
    group_key <- setdiff(attr(df, "groups") %>% names(), ".rows")
  } else {
    group_key <- attr(df, "vars")
  }
  
  #n_levels <- attr(df, "group") %>% nrow() 
  
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "Theoretical" 
        ylab <- "Sample"
        title <- paste(title, "(", paste("by", group_key), ")")
      } else {
        xlab <- ""
        ylab <- ""
        title = ""
      }
      
      p_qq <- df %>% 
        mutate(variables = var) %>% 
        ggplot(aes(sample = !!sym(var))) +
        stat_qq(col = col_point) + 
        stat_qq_line(col = col_line) +
        facet_grid(reformulate("variables", group_key)) + 
        labs(title = title, x = xlab, y = ylab) +    
        theme_grey(base_family = base_family) +         
        theme(legend.position = "none")
      
      if (typographic) {
        p_qq <- p_qq + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12),
                axis.title.y = element_text(size = 12))
        
        if (!each) {
          p_qq <- p_qq +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_text(size = 10),
                  plot.margin = margin(0, 10, 0, 10)
            )
        }
      }
      
      p_qq
    })
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title, 
        right = grid::textGrob(group_key, rot = 270,
                               gp = gpar(fontfamily = base_family))))
    }
  }) # End of suppressWarnings()
}

##----------------------------------------------------------------------------------

#' @rdname plot_box_numeric.data.frame
#' @export
plot_box_numeric <- function(.data, ...) {
  UseMethod("plot_box_numeric", .data)
}

#' Plot Box-Plot of numerical variables 
#'
#' @description The plot_box_numeric() to visualizes the box plot of numeric data or 
#' relationship to specific categorical data.
#' 
#' @details The box plot helps determine whether the distribution of a numeric variable. 
#' plot_box_numeric() shows box plots of several numeric variables 
#' on one screen. This function can also display a box plot for each level of a specific 
#' categorical variable.
#' 
#' The base_family is selected from "Roboto Condensed", "Liberation Sans Narrow",
#' "NanumSquare", "Noto Sans Korean". If you want to use a different font, 
#' use it after loading the Google font with import_google_font().
#'
#' @param .data data.frame or a \code{\link{tbl_df}} or a \code{\link{grouped_df}}.
#' @param \dots one or more unquoted expressions separated by commas.
#' You can treat variable names like they are positions.
#' Positive values select variables; negative values to drop variables.
#' If the first expression is negative, plot_box_numeric() will automatically
#' start with all variables.
#' These arguments are automatically quoted and evaluated in a context where
#' column names represent column positions.
#' They support unquoting and splicing.
#'
#' @param title character. a main title for the plot. 
#' @param each logical. Specifies whether to draw multiple plots on one screen. 
#' The default is FALSE, which draws multiple plots on one screen.
#' @param typographic logical. Whether to apply focuses on typographic elements to ggplot2 visualization. 
#' The default is TRUE. if TRUE provides a base theme that focuses on typographic elements using hrbrthemes package.
#' @param base_family character. The name of the base font family to use 
#' for the visualization. If not specified, the font defined in dlookr is applied. (See details)
#' @examples
#' \donttest{
#' # Visualization of all numerical variables
#' plot_box_numeric(heartfailure)
#'
#' # Select the variable to diagnose
#' plot_box_numeric(heartfailure, "age", "time")
#' plot_box_numeric(heartfailure, -age, -time)
#'
#' # Visualize the each plots
#' plot_box_numeric(heartfailure, "age", "time", each = TRUE)
#' 
#' # Not allow the typographic elements
#' plot_box_numeric(heartfailure, typographic = FALSE)
#' 
#' # Using pipes ---------------------------------
#' library(dplyr)
#'
#' # Plot of all numerical variables
#' heartfailure %>%
#'   plot_box_numeric()
#'   
#' # Using groupd_df  ------------------------------
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_box_numeric()
#'   
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_box_numeric(each = TRUE)  
#' }
#'    
#' @method plot_box_numeric data.frame
#' @import ggplot2
#' @import hrbrthemes
#' @import dplyr
#' @importFrom purrr map
#' @importFrom gridExtra grid.arrange arrangeGrob
#' @importFrom grid textGrob gpar
#' @export
#' @rdname plot_box_numeric.data.frame 
#' 
plot_box_numeric.data.frame <- function(.data, ..., 
                                        title = "Distribution by numerical variables",
                                        each = FALSE, typographic = TRUE,
                                        base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_box_numeric_impl(.data, vars, title, each, typographic, base_family)
}

plot_box_numeric_impl <- function(df, vars, title, each, typographic, base_family) {
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "" 
      } else {
        xlab <- ""
        title <- ""
      }
      
      p_box <- df %>% 
        mutate(variables = var) %>% 
        ggplot(aes(y = !!sym(var))) +
        geom_boxplot(fill = "steelblue", alpha = 0.8) +
        xlim(-0.7, 0.7) +
        coord_flip() + 
        labs(title = title, subtitle = var, x = xlab) +        
        theme_grey(base_family = base_family) +         
        theme(axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank())
      
      if (typographic) {
        p_box <- p_box + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_blank(),
                axis.title.y = element_text(size = 12))
        
        if (!each) {
          p_box <- p_box +
            theme(plot.title = element_text(margin = margin(b = 0)),
                  axis.title.y = element_blank(),
                  axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.x = element_blank(),
                  axis.text.x = element_text(size = 10),
                  plot.margin = margin(10, 10, 10, 10)
            )
        }
      } else {
        p_box <- p_box +
          theme(axis.title.x = element_blank())
      }
      
      p_box
    }
    )
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title))
    }  
  }) # End of suppressWarnings()
}

#' @method plot_box_numeric grouped_df
#' @rdname plot_box_numeric.data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble is_tibble
#' @export
#' 
plot_box_numeric.grouped_df <- function(.data, ..., 
                                        title = "Distribution by numerical variables",
                                        each = FALSE, typographic = TRUE,
                                        base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_box_numeric_group_impl(.data, vars, title, each, typographic, base_family)
}

plot_box_numeric_group_impl <- function(df, vars, title, each, typographic, base_family) {
  if (utils::packageVersion("dplyr") >= "0.8.0") {
    group_key <- setdiff(attr(df, "groups") %>% names(), ".rows")
  } else {
    group_key <- attr(df, "vars")
  }
  
  n_levels <- attr(df, "group") %>% nrow() 
  
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "" 
        ylab <- ""
        title <- paste(title, "(", paste("by", group_key), ")")      
      } else {
        xlab <- ""
        ylab <- ""
        title <- ""
      }
      
      p_box <- df %>% 
        mutate(variables = var) %>% 
        ggplot(aes(y = !!sym(var), fill = group_key)) +
        geom_boxplot(alpha = 0.7) + 
        xlim(-0.7, 0.7) +
        coord_flip() +
        facet_grid(reformulate("variables", group_key)) + 
        labs(title = title, x = xlab, y = ylab) +        
        theme_grey(base_family = base_family) +         
        theme(legend.position = "none",
              axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank())
      
      if (typographic) {
        p_box <- p_box + 
          theme_typographic(base_family) +
          scale_fill_ipsum() + 
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank())
        
        if (!each) {
          p_box <- p_box +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  plot.margin = margin(0, 10, 0, 10),
                  panel.spacing = grid::unit(0, "lines")
            )
        }
      }
      
      p_box
    }
    )
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title, 
        right = grid::textGrob(group_key, rot = 270,
                               gp = gpar(fontfamily = base_family))))
    }
  }) # End of suppressWarnings()
}


##----------------------------------------------------------------------------------

#' @rdname plot_hist_numeric.data.frame
#' @export
plot_hist_numeric <- function(.data, ...) {
  UseMethod("plot_hist_numeric", .data)
}

#' Plot histogram of numerical variables 
#'
#' @description The plot_hist_numeric() to visualizes the histogram of numeric data or 
#' relationship to specific categorical data.
#' 
#' @details The histogram helps determine whether the distribution of a numeric variable. 
#' plot_hist_numeric() shows box plots of several numeric variables 
#' on one screen. This function can also display a histogram for each level of a specific 
#' categorical variable.
#' The bin-width is set to the Freedman-Diaconis rule (2 * IQR(x) / length(x)^(1/3))
#'
#' The base_family is selected from "Roboto Condensed", "Liberation Sans Narrow",
#' "NanumSquare", "Noto Sans Korean". If you want to use a different font, 
#' use it after loading the Google font with import_google_font().
#' 
#' @param .data data.frame or a \code{\link{tbl_df}} or a \code{\link{grouped_df}}.
#' @param \dots one or more unquoted expressions separated by commas.
#' You can treat variable names like they are positions.
#' Positive values select variables; negative values to drop variables.
#' If the first expression is negative, plot_hist_numeric() will automatically
#' start with all variables.
#' These arguments are automatically quoted and evaluated in a context where
#' column names represent column positions.
#' They support unquoting and splicing.
#'
#' @param title character. a main title for the plot. 
#' @param each logical. Specifies whether to draw multiple plots on one screen. 
#' The default is FALSE, which draws multiple plots on one screen.
#' @param typographic logical. Whether to apply focuses on typographic elements to ggplot2 visualization. 
#' The default is TRUE. if TRUE provides a base theme that focuses on typographic elements using hrbrthemes package.
#' @param base_family character. The name of the base font family to use 
#' for the visualization. If not specified, the font defined in dlookr is applied. (See details)
#' @examples
#' \donttest{
#' # Visualization of all numerical variables
#' plot_hist_numeric(heartfailure)
#'
#' # Select the variable to diagnose
#' plot_hist_numeric(heartfailure, "age", "time")
#' plot_hist_numeric(heartfailure, -age, -time)
#'
#' # Visualize the each plots
#' plot_hist_numeric(heartfailure, "age", "time", each = TRUE)
#' 
#' # Not allow the typographic elements
#' plot_hist_numeric(heartfailure, typographic = FALSE)
#' 
#' # Using pipes ---------------------------------
#' library(dplyr)
#'
#' # Plot of all numerical variables
#' heartfailure %>%
#'   plot_hist_numeric()
#'   
#' # Using groupd_df  ------------------------------
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_hist_numeric()
#'   
#' heartfailure %>% 
#'   group_by(smoking) %>% 
#'   plot_hist_numeric(each = TRUE)  
#' }
#'    
#' @method plot_hist_numeric data.frame
#' @import ggplot2
#' @import hrbrthemes
#' @import dplyr
#' @importFrom purrr map
#' @importFrom gridExtra grid.arrange arrangeGrob
#' @importFrom grid textGrob gpar
#' @export
#' @rdname plot_hist_numeric.data.frame 
#' 
plot_hist_numeric.data.frame <- function(.data, ..., 
                                        title = "Distribution by numerical variables",
                                        each = FALSE, typographic = TRUE,
                                        base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_hist_numeric_impl(.data, vars, title, each, typographic, base_family)
}

plot_hist_numeric_impl <- function(df, vars, title, each, typographic, base_family) {
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "" 
      } else {
        xlab <- ""
        title <- ""
      }
      
      p_box <- df %>% 
        rename(variables = {{var}}) %>% 
        ggplot(aes(x = variables)) +
        geom_histogram(color="#e9ecef", fill = "steelblue", alpha = 0.8,
                       bins = round(log2(nrow(df)) + 1)) +
        labs(title = title, subtitle = var, x = xlab) +     
        theme_grey(base_family = base_family) +
        theme(axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank())
      
      if (typographic) {
        p_box <- p_box + 
          theme_typographic(base_family) +
          theme(legend.position = "none",
                axis.title.x = element_blank(),
                axis.title.y = element_text(size = 12))
        
        if (!each) {
          p_box <- p_box +
            theme(plot.title = element_text(margin = margin(b = 0)),
                  axis.title.y = element_blank(),
                  axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  axis.title.x = element_blank(),
                  axis.text.x = element_text(size = 10),
                  plot.margin = margin(10, 10, 10, 10)
            )
        }
      } else {
        p_box <- p_box +
          theme(axis.title.x = element_blank())
      }
      
      p_box
    }
    )
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title))
    }  
  }) # End of suppressWarnings()
}

#' @method plot_hist_numeric grouped_df
#' @rdname plot_hist_numeric.data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang quos
#' @importFrom tibble is_tibble
#' @export
#' 
plot_hist_numeric.grouped_df <- function(.data, ..., 
                                        title = "Distribution by numerical variables",
                                        each = FALSE, typographic = TRUE, 
                                        base_family = NULL) {
  vars <- tidyselect::vars_select(names(.data), !!! rlang::quos(...))
  
  plot_hist_numeric_group_impl(.data, vars, title, each, typographic, base_family)
}

#' @importFrom stats IQR
#' 
plot_hist_numeric_group_impl <- function(df, vars, title, each, typographic, base_family) {
  if (utils::packageVersion("dplyr") >= "0.8.0") {
    group_key <- setdiff(attr(df, "groups") %>% names(), ".rows")
  } else {
    group_key <- attr(df, "vars")
  }
  
  n_levels <- attr(df, "group") %>% nrow() 
  
  if (length(vars) == 0) vars <- names(df)
  
  if (length(vars) == 1 & !tibble::is_tibble(df)) df <- as_tibble(df)
  
  nm_numeric <- find_class(df[, vars], type = "numerical", index = FALSE)
  
  if (length(nm_numeric) == 0) {
    message("There is no numerical variable in the data or variable list.\n")
    return(NULL)
  }
  
  suppressWarnings({
    plist <- purrr::map(nm_numeric, function(var) {
      if (each) {
        xlab <- "" 
        ylab <- ""
        title <- paste(title, "(", paste("by", group_key), ")")      
      } else {
        xlab <- ""
        ylab <- ""
        title <- ""
      }
      
      p_box <- df %>% 
        mutate(variables = var) %>% 
        ggplot(aes(x = !!sym(var), fill = !!sym(group_key))) +
        geom_histogram(color="#e9ecef", alpha = 0.8,
                       binwidth = function(x) 2 * stats::IQR(x) / (length(x)^(1/3))) + 
        facet_grid(reformulate("variables", group_key)) + 
        labs(title = title, x = xlab, y = ylab) +    
        theme_grey(base_family = base_family) + 
        theme(legend.position = "none",
              axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank())
      
      if (typographic) {
        p_box <- p_box + 
          theme_typographic(base_family) +
          scale_fill_ipsum(na.value = "gray") + 
          theme(legend.position = "none",
                axis.title.x = element_text(size = 12),
                axis.text.y = element_blank(),
                axis.ticks.y = element_blank())
        
        if (!each) {
          p_box <- p_box +
            theme(axis.text.x = element_text(size = 10),
                  axis.text.y = element_blank(),
                  axis.ticks.y = element_blank(),
                  plot.margin = margin(0, 10, 0, 10),
                  panel.spacing = grid::unit(0, "lines")
            )
        }
      }
      
      p_box
    }
    )
    
    if (each) {
      for (i in seq(plist))
        do.call("print", plist[i])
    } else {
      n <- length(plist)
      n_row <- floor(sqrt(n))
      
      if (typographic) {
        if (is.null(base_family)) {
          base_family <- "Roboto Condensed"           
        }
        
        title <- grid::textGrob(title, gp = grid::gpar(fontfamily = base_family, 
                                                       fontsize = 18, font = 2),
                                x = unit(0.075, "npc"), just = "left")
      }
      
      suppressWarnings(gridExtra::grid.arrange(
        gridExtra::arrangeGrob(grobs = plist, nrow = n_row), top = title, 
        right = grid::textGrob(group_key, rot = 270,
                               gp = gpar(fontfamily = base_family))))
    }
  }) # End of suppressWarnings()
}
choonghyunryu/dlookr documentation built on June 11, 2024, 9:12 a.m.