R/distribution.R

Defines functions .fxna_rm .fxclean .fxtrim .force_class distr

Documented in distr

####################################################################
#' Compare Variables with their Distributions
#'
#' Compare the distribution of a target variable vs another variable. This
#' function automatically splits into quantiles for numerical variables.
#' Custom and tidyverse friendly.
#'
#' @family Exploratory
#' @family Visualization
#' @param data Dataframe
#' @param ... Variables. Main (target variable) and secondary (values
#' variable) to group by (if needed).
#' @param type Integer. 1 for both plots, 2 for counter plot only, 3 for
#' percentages plot only.
#' @param ref Boolean. Show a reference line if levels = 2? Quite useful
#' when data is unbalanced (not 50/50) because a reference line is drawn.
#' @param note Character. Caption for the plot.
#' @param top Integer. Filter and plot the most n frequent for categorical values.
#' @param breaks Integer. Number of splits for numerical values.
#' @param na.rm Boolean. Ignore \code{NA}s if needed.
#' @param force Character. Force class on the values data. Choose between 'none',
#' 'character', 'numeric', 'date'
#' @param trim Integer. Trim labels until the nth character for categorical values
#' (applies for both, target and values)
#' @param clean Boolean. Use \code{cleanText()} for categorical values (applies
#' for both, target and values)
#' @param abc Boolean. Do you wish to sort by alphabetical order?
#' @param custom_colours Boolean. Use custom colours function?
#' @param plot Boolean. Return a plot? Otherwise, a table with results
#' @param chords Boolean. Use a chords plot?
#' @param save Boolean. Save the output plot in our working directory
#' @param subdir Character. Into which subdirectory do you wish to save the plot to?
#' @return Plot when \code{plot=TRUE} with two plots in one: counter distribution
#' grouped by cuts, and proportions distribution grouped by same cuts. data.frame when
#' \code{plot=FALSE} with counting, percentages, and cumulative percentages results.
#' When \code{type} argument is used, single plots will be returned.
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#'
#' # Relation for categorical/categorical values
#' distr(dft, Survived, Sex)
#'
#' # Relation for categorical/numeric values
#' dft %>%
#'   distr(Survived, Fare, plot = FALSE) %>%
#'   head(10)
#' # Sort values
#' dft %>% distr(Survived, Fare, abc = TRUE)
#' # Less splits/breaks
#' dft %>% distr(Survived, Fare, abc = TRUE, breaks = 5)
#'
#' # Distribution of numerical only
#' dft[dft$Fare < 20, ] %>% distr(Fare)
#'
#' # Distribution of numerical/numerical
#' dft %>% distr(Fare, Age)
#'
#' # Select only one of the two default plots of distr()
#' dft %>% distr(Survived, Age, type = 2)
#' dft %>% distr(Survived, Age, type = 3)
#' @export
distr <- function(data, ...,
                  type = 1,
                  ref = TRUE,
                  note = NA,
                  top = 10,
                  breaks = 10,
                  na.rm = FALSE,
                  force = "none",
                  trim = 0,
                  clean = FALSE,
                  abc = FALSE,
                  custom_colours = FALSE,
                  plot = TRUE,
                  chords = FALSE,
                  save = FALSE,
                  subdir = NA) {
  # # To handle scientific notation inputs correctly
  # on.exit(options("scipen" = 999))

  data <- data.frame(data)

  vars <- enquos(...)
  var1 <- vars[[1]]
  if (length(vars) > 1) {
    var2 <- vars[[2]]
  } else {
    var2 <- NULL
  }

  # When we only have one variable, use freqs()
  if (length(vars) == 1) {
    value <- select(data, !!var1)
    variable_name <- colnames(value)
    value <- value[, 1]
    value <- .force_class(value, force)
    value <- .fxtrim(value, trim)
    value <- .fxclean(value, clean)

    df <- data.frame(value = value, dummy = 0)
    df <- .fxna_rm(df, na.rm)

    is.Date <- function(x) inherits(x, "Date")
    is.POSIXct <- function(x) inherits(x, "POSIXct")
    is.POSIXlt <- function(x) inherits(x, "POSIXlt")
    if (is.numeric(value) || is.Date(value) || is.POSIXct(value) || is.POSIXlt(value)) {
      # Continuous and date values
      if (is.numeric(value)) {
        p <- ggplot(df, aes(x = .data$value))
      } else {
        p <- ggplot(df, aes(x = date(.data$value)))
      }
      p <- p +
        geom_density(fill = "deepskyblue", alpha = 0.7, adjust = 1 / 3) +
        labs(
          y = NULL, x = NULL, fill = "Density",
          title = "Density Distribution",
          subtitle = paste("Variable:", variable_name),
          caption = paste("Obs:", formatNum(nrow(df), 0))
        ) +
        theme_lares()
      if (top != 10) {
        p <- p + xlim(0, top)
      }
    } else {
      # Discrete values
      p <- freqs(df, value, plot = TRUE, variable_name = variable_name, abc = abc, top = top)
    }
    # Return table with results?
    if (!plot) {
      output <- df %>% freqs(value, top = top)
      return(output)
    }
    return(p)
  }

  # Check if secondary variable exists and fix if possible
  var <- as_label(var2)
  if (!var %in% colnames(data)) {
    msg <- paste("Not a valid input:", var, "was transformed or does not exist.")
    maybes <- colnames(data)[grepl(var, colnames(data))]
    if (length(maybes) > 0 && maybes[1] %in% colnames(data)) {
      message(paste0(
        "Maybe you meant one of: ", vector2text(maybes), ". ",
        "Automatically using '", maybes[1], "'"
      ))
      var2 <- quos(maybes[1])
      warning(msg)
    } else {
      stop(msg)
    }
  }

  targets <- select(data, !!var1)
  targets_name <- colnames(targets)
  targets <- targets[, 1]
  value <- select(data, !!var2)
  variable_name <- colnames(value)
  # Transformations
  value <- value[, 1] # do.call("c", value)
  value <- .force_class(value, force)
  value <- .fxtrim(value, trim)
  value <- .fxclean(value, clean)

  if (length(targets) != length(value)) {
    message("The targets and value vectors should be the same length.")
    stop(message(paste(
      "Currently, targets has", length(targets),
      "rows and value has", length(value)
    )))
  }

  # For num-num distributions or too many unique target variables
  if (length(unique(targets)) >= 8) {
    if (is.numeric(targets) && is.numeric(value)) {
      subtitle <- paste0(
        "Variables: ", variable_name, " vs. ", targets_name,
        ". Obs: ", formatNum(length(value), 0)
      )
      df <- data.frame(x = targets, y = value)
      df <- .fxna_rm(df, na.rm = TRUE)
      p <- df %>%
        ggplot(aes(x = .data$x, y = .data$y)) +
        stat_density_2d(aes(fill = after_stat(.data$level)), geom = "polygon") +
        labs(
          title = "2D Density Distribution",
          x = targets_name, y = variable_name,
          subtitle = subtitle
        ) +
        scale_x_comma() +
        scale_y_comma() +
        theme_lares()
      return(p)
    }
    message("You should try a 'target' variable with max 8 different values.")
    message("Automatically trying a chords plot...")
    chords <- TRUE
  }

  # Chords plot
  if (chords) {
    df <- data.frame(value = value, targets = targets)
    output <- freqs(df, targets, value)
    if (!na.rm) {
      output <- output %>% replaceall(NA, "NA")
    }
    title <- "Frequency Chords Diagram"
    subtitle <- paste("Variables:", targets_name, "to", variable_name)
    if (!plot) {
      return(output)
    }
    return(plot_chord(
      output$targets, output$value, output$n,
      mg = 13, title = title, subtitle = subtitle
    ))
  }

  # Only n numeric values, really numeric?
  if (is.numeric(value) && length(unique(value)) <= 8) {
    value <- .force_class(value, class = "char")
  }

  # Turn numeric variables into quantiles
  if (is.numeric(value)) {
    breaks <- ifelse(top != 10, top, breaks)
    value <- quants(value, breaks, return = "labels")
    cuts <- length(unique(value[!is.na(value)]))
    if (cuts != breaks) {
      message(paste(
        "When dividing", variable_name, "into", breaks, "quantiles,",
        cuts, "cuts/groups are possible."
      ))
    }
    top <- top + 1
  }

  # Finally, we have our data.frame
  df <- data.frame(targets = targets, value = value)
  df <- .fxna_rm(df, na.rm)

  # Captions for plots
  subtitle <- paste0(
    "Variables: ", targets_name, " vs. ", variable_name,
    ". Obs: ", formatNum(nrow(df), 0)
  )

  freqs <- df %>%
    group_by(.data$targets, .data$value) %>%
    count() %>%
    ungroup() %>%
    arrange(desc(.data$n)) %>%
    group_by(.data$value) %>%
    mutate(
      p = round(100 * .data$n / sum(.data$n), 2),
      pcum = cumsum(.data$p)
    ) %>%
    ungroup() %>%
    filter(!is.na(.data$value)) %>%
    mutate(
      row = row_number(),
      order = suppressWarnings(ifelse(
        grepl("\\(|\\)", .data$value),
        as.numeric(as.character(substr(gsub(",.*", "", .data$value), 2, 100))),
        .data$row
      ))
    )
  if (length(unique(value)) > top && !is.numeric(value)) {
    message(paste("Filtering the", top, "most frequent values. Use 'top' to overrule."))
    which <- freqs(df, .data$value) %>% slice(1:top)
    freqs <- freqs %>%
      mutate(value = ifelse(.data$value %in% which$value, as.character(.data$value), "OTHERS")) %>%
      group_by(.data$value, .data$targets) %>%
      select(-.data$row, -.data$order) %>%
      summarise(n = sum(.data$n)) %>%
      mutate(p = round(100 * n / sum(.data$n), 2)) %>%
      ungroup() %>%
      arrange(desc(.data$n)) %>%
      mutate(
        row = row_number(),
        order = row_number()
      )
  }

  # Sort values alphabetically or ascending if numeric
  if (abc) freqs <- mutate(freqs, order = rank(.data$value))

  # Counter plot
  if (type %in% c(1, 2)) {
    vadj <- ifelse(type == 1, -0.15, 0.5)
    hadj <- ifelse(type == 1, 0.5, -0.15)
    count <- ggplot(freqs, aes(
      x = reorder(as.character(.data$value), .data$order), y = .data$n,
      fill = as.character(.data$targets),
      label = formatNum(.data$n, 0), ymax = max(.data$n) * 1.1
    )) +
      geom_col(position = "dodge", colour = "transparent") +
      geom_text(
        colour = "black",
        check_overlap = TRUE,
        position = position_dodge(0.9),
        size = 3, vjust = vadj, hjust = hadj
      ) +
      labs(x = NULL, y = "Counter [#]", fill = targets_name, caption = note) +
      theme(legend.position = "top") +
      guides(colour = "none") +
      theme(axis.title.y = element_text(size = rel(0.8), angle = 90)) +
      scale_y_comma(expand = c(0, 0)) +
      theme_lares(pal = 1)
    # Give an angle to labels when more than...
    if (length(unique(value)) >= 7) {
      count <- count + theme(axis.text.x = element_text(angle = 30, hjust = 1))
    }
    # Custom colours if wanted...
    if (custom_colours) {
      count <- count + suppressWarnings(gg_fill_customs())
    }
  }

  # Proportions (%) plot
  if (type %in% c(1, 3)) {
    prop <- freqs %>%
      group_by(.data$value) %>%
      mutate(size = sum(.data$n) / sum(freqs$n)) %>%
      mutate(ptag = ifelse(p < 3, "", as.character(round(.data$p, 1)))) %>%
      ggplot(aes(
        x = reorder(.data$value, -.data$order),
        y = .data$p / 100, label = .data$ptag,
        fill = as.character(.data$targets)
      )) +
      geom_col(position = "fill", colour = "transparent") +
      geom_text(aes(size = .data$size, colour = as.character(.data$targets)),
        position = position_stack(vjust = 0.5)
      ) +
      scale_size(range = c(2.2, 3)) +
      coord_flip() +
      labs(x = "Proportions [%]", y = NULL, fill = targets_name, caption = note) +
      theme(legend.position = "top") +
      guides(colour = "none", size = "none") +
      scale_y_percent(expand = c(0, 0)) +
      theme(axis.title.y = element_text(size = rel(0.8), angle = 90)) +
      theme_lares(pal = 1)

    # Show a reference line if levels = 2; quite useful when data is unbalanced (not 50/50)
    if (length(unique(targets)) == 2 && ref) {
      distr <- df %>%
        freqs(.data$targets) %>%
        arrange(as.character(.data$targets))
      h <- signif(100 - distr$p[1], 3)
      prop <- prop +
        geom_hline(
          yintercept = h / 100, colour = "purple",
          linetype = "dotted", alpha = 0.8
        ) +
        geom_label(aes(0, h / 100, label = h, vjust = -0.05),
          size = 2.5, fill = "white", alpha = 0.8
        )
    }
    # Custom colours if wanted...
    if (custom_colours) {
      prop <- prop + suppressMessages(gg_fill_customs())
    }
  }

  # Export file name and folder
  if (save) {
    file_name <- paste0(
      "viz_distr_",
      cleanText(targets_name), ".vs.",
      cleanText(variable_name),
      case_when(type == 2 ~ "_c", type == 3 ~ "_p", TRUE ~ ""), ".png"
    )
    if (!is.na(subdir)) {
      # dir.create(file.path(getwd(), subdir), recursive = TRUE)
      file_name <- paste(subdir, file_name, sep = "/")
    }
  }

  # Plot the results and save if needed
  if (type == 1) {
    count <- count + labs(
      title = "Distribution and Proportions",
      subtitle = subtitle, caption = ""
    ) +
      theme(plot.margin = margin(10, 15, -15, 15))
    prop <- prop + guides(fill = "none") + labs(caption = note) +
      theme(plot.margin = margin(-5, 15, -15, 15))
    p <- (count / prop) + plot_layout(ncol = 1, nrow = 2)
    if (save) p <- p + ggsave(file_name, width = 10, height = 7)
  }
  if (type == 2) {
    count <- count + coord_flip() +
      labs(title = "Distribution Plot", subtitle = subtitle, caption = "")
    if (save) count <- count + ggsave(file_name, width = 8, height = 6)
    p <- count
  }
  if (type == 3) {
    prop <- prop + labs(title = "Proportions Plot", subtitle = subtitle, caption = "")
    if (save) prop <- prop + ggsave(file_name, width = 8, height = 6)
    p <- prop
  }

  if (!plot) {
    return(select(freqs, -.data$order, -.data$row))
  } else {
    return(p)
  }
}

.force_class <- function(value, class = "none") {
  if (class != "none") {
    if (grepl("char|fact", class) && is.numeric(value)) {
      value <- as.character(value)
    }
    if (grepl("num|int", class) && !is.numeric(value)) {
      value <- as.numeric(value)
    }
    if (grepl("dat|day|time", class)) {
      value <- gsub(" .*", "", as.character(value))
      value <- lubridate::date(value)
    }
  }
  return(value)
}

.fxtrim <- function(value, trim, targets = NA) {
  if (trim > 0) {
    if (!is.numeric(value)) {
      value <- substr(value, 1, trim)
    }
    if (!is.numeric(targets) && !is.na(targets)) {
      targets <- substr(targets, 1, trim)
    }
    message(paste("Chopping everything to", trim, "characters..."))
  }
  return(value)
}

.fxclean <- function(value, clean = FALSE, targets = NA) {
  if (clean) {
    if (!is.numeric(value)) {
      value <- cleanText(value, spaces = FALSE)
    }
    if (!is.numeric(targets) && !is.na(targets)) {
      targets <- cleanText(targets, spaces = FALSE)
    }
  }
  return(value)
}

.fxna_rm <- function(df, na.rm = FALSE) {
  if (na.rm) {
    df <- df[complete.cases(df), ]
  }
  return(df)
}

# options(lifecycle_repeat_warnings = TRUE)
# library(dplyr)
# data("starwars")
# foo <- function(x, ...) {
#   temp <- enquos(...)
#   group_by(x, !!!temp[[1]]) %>% tally()
# }
# foo(starwars, sex, gender)

# Unquoting language objects with `!!!` is deprecated as of rlang 0.4.0.
# Please use `!!` instead.
#
# # Bad:
# dplyr::select(data, !!!enquo(x))
#
# # Good:
# dplyr::select(data, !!enquo(x))    # Unquote single quosure
# dplyr::select(data, !!!enquos(x))  # Splice list of quosures

Try the lares package in your browser

Any scripts or data that you put into this service are public.

lares documentation built on Nov. 5, 2023, 1:09 a.m.