R/frequencies.R

Defines functions freqs

Documented in freqs

####################################################################
#' Frequencies Calculations and Plot
#'
#' This function lets the user group, count, calculate percentages
#' and cumulatives. It also plots results if needed. Tidyverse friendly.
#'
#' @family Frequency
#' @family Exploratory
#' @family Visualization
#' @param df Data.frame
#' @param ... Variables. Variables you wish to process. Order matters.
#' If no variables are passed, the whole data.frame will be considered
#' @param wt Variable, numeric. Weights.
#' @param rel Boolean. Relative percentages (or absolute)?
#' @param results Boolean. Return results in a dataframe?
#' @param variable_name Character. Overwrite the main variable's name
#' @param plot Boolean. Do you want to see a plot? Three variables tops.
#' @param rm.na Boolean. Remove NA values in the plot? (not filtered for
#' numerical output; use na.omit() or filter() if needed)
#' @param title Character. Overwrite plot's title with.
#' @param subtitle Character. Overwrite plot's subtitle with.
#' @param top Integer. Filter and plot the most n frequent for
#' categorical values. Set to NA to return all values
#' @param abc Boolean. Do you wish to sort by alphabetical order?
#' @param save Boolean. Save the output plot in our working directory
#' @param subdir Character. Into which subdirectory do you wish to
#' save the plot to?
#' @return Plot when \code{plot=TRUE} and data.frame with grouped frequency results
#' when \code{plot=FALSE}.
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#'
#' # How many survived?
#' dft %>% freqs(Survived)
#'
#' # How many survived per Class?
#' dft %>% freqs(Pclass, Survived, abc = TRUE)
#'
#' # How many survived per Class with relative percentages?
#' dft %>% freqs(Pclass, Survived, abc = TRUE, rel = TRUE)
#'
#' # Using a weighted feature
#' dft %>% freqs(Pclass, Survived, wt = Fare / 100)
#'
#' # Let's check the results with plots:
#'
#' #' # How many survived and see plot?
#' dft %>% freqs(Survived, plot = TRUE)
#'
#' # How many survived per class?
#' dft %>% freqs(Survived, Pclass, plot = TRUE)
#'
#' # Per class, how many survived?
#' dft %>% freqs(Pclass, Survived, plot = TRUE)
#'
#' # Per sex and class, how many survived?
#' dft %>% freqs(Sex, Pclass, Survived, plot = TRUE)
#'
#' # Frequency of tickets + Survived
#' dft %>% freqs(Survived, Ticket, plot = TRUE)
#'
#' # Frequency of tickets: top 10 only and order them alphabetically
#' dft %>% freqs(Ticket, plot = TRUE, top = 10, abc = TRUE)
#' @export
freqs <- function(df, ..., wt = NULL,
                  rel = FALSE,
                  results = TRUE,
                  variable_name = NA,
                  plot = FALSE, rm.na = FALSE,
                  title = NA, subtitle = NA,
                  top = 20, abc = FALSE,
                  save = FALSE, subdir = NA) {
  vars <- quos(...)
  weight <- enquo(wt)

  if (is.vector(df)) {
    values <- df <- data.frame(values = df)
    vars <- quos(values)
  }

  df <- as.data.frame(df)

  # Probably an error but might be useful for the user instead
  # When it's a vector, frequencies; when it's a dataframe, global frequencies
  if (length(vars) == 0) {
    output <- freqs_df(df, plot = plot, save = save, subdir = subdir)
    return(output)
  }

  output <- df %>%
    group_by(!!!vars) %>%
    tally(wt = !!weight) %>%
    arrange(desc(.data$n)) %>%
    {
      if (!rel) ungroup(.) else .
    } %>%
    mutate(p = round(100 * .data$n / sum(.data$n), 2))

  # Sort values alphabetically or ascending if numeric
  if (abc) {
    output <- output %>%
      arrange(!!!vars, desc(n)) %>%
      mutate(order = row_number())
  } else {
    output <- output %>%
      arrange(desc(.data$n)) %>%
      mutate(order = row_number())
  }

  # Add cumulative percentage (given the abc logic)
  output <- mutate(output, pcum = cumsum(.data$p))

  # No plot
  if (!plot && !save) {
    if (results) {
      return(output)
    } else {
      invisible(return())
    }
  }

  if (ncol(output) - 4 >= 4) {
    stop(
      paste(
        "Sorry, but trying to plot more than 3 features is as complex as it sounds.",
        "You should try another method to understand your analysis!"
      )
    )
  }
  obs_total <- sum(output$n)
  obs <- sprintf("Total Obs.: %s", formatNum(obs_total, 0))
  weight_text <- ifelse((as_label(weight) != "NULL"),
    sprintf("(weighted by %s)", as_label(weight)), ""
  )

  # Use only the most n frequent values/combinations only
  values <- unique(output[, length(vars)])
  if (nrow(values) > top) {
    if (!is.na(top)) {
      output <- output %>%
        arrange(desc(.data$n)) %>%
        slice(1:top)
      message(
        sprintf(
          "Slicing the top %s (out of %s) values; use 'top' parameter to overrule.",
          top, nrow(values)
        )
      )
      note <- sprintf("[%s most frequent]", top)
      obs <- sprintf("Obs.: %s (out of %s)", formatNum(sum(output$n), 0), formatNum(obs_total, 0))
    }
  } else {
    note <- ""
  }

  if (abc) note <- gsub("most frequent", "A-Z sorted values", note)

  plot <- ungroup(output)

  if (rm.na) plot <- plot[complete.cases(plot), ]

  # Create some dynamic aesthetics
  plot$labels <- paste0(formatNum(plot$n, decimals = 0), " (", signif(plot$p, 4), "%)")
  plot$label_colours <- ifelse(plot$p > mean(range(plot$p)) * 0.9, "TRUE", "FALSE")
  lim <- 0.35
  plot$label_hjust <- ifelse(
    plot$n < min(plot$n) + diff(range(plot$n)) * lim, -0.1, 1.05
  )
  plot$label_colours <- ifelse(
    plot$label_colours == "TRUE" & plot$label_hjust < lim, "FALSE", plot$label_colours
  )
  variable <- colnames(plot)[1]

  # When one feature
  if (ncol(output) - 3 == 2) {
    type <- 1
    colnames(plot)[1] <- "names"
    p <- ggplot(plot, aes(
      x = reorder(.data$names, -.data$order),
      y = .data$n, label = .data$labels, fill = .data$p
    ))
  }
  # When two features
  else if (ncol(output) - 3 == 3) {
    type <- 2
    facet_name <- colnames(plot)[2]
    colnames(plot)[1] <- "facet"
    colnames(plot)[2] <- "names"
    plot$facet <- suppressWarnings(replacefactor(plot$facet, NA, "NA"))
    p <- plot %>%
      ggplot(aes(
        x = reorder_within(.data$names, -.data$order, .data$facet),
        y = .data$n, label = .data$labels, fill = .data$p
      )) +
      scale_x_reordered()
  }
  # When three features
  else if (ncol(output) - 3 == 4) {
    type <- 3
    facet_name1 <- colnames(plot)[2]
    facet_name2 <- colnames(plot)[3]
    colnames(plot)[1] <- "facet2"
    colnames(plot)[2] <- "facet1"
    colnames(plot)[3] <- "names"
    plot$facet2 <- suppressWarnings(replacefactor(plot$facet2, NA, "NA"))
    plot$facet1 <- suppressWarnings(replacefactor(plot$facet1, NA, "NA"))
    p <- plot %>%
      ggplot(aes(
        x = reorder_within(.data$names, .data$n, .data$facet2),
        y = .data$n, label = .data$labels, fill = .data$p
      )) +
      scale_x_reordered()
  }

  # Plot base
  p <- p + geom_col(alpha = 0.95, width = 0.8, colour = "transparent") +
    geom_text(aes(hjust = .data$label_hjust, colour = .data$label_colours), size = 2.8) +
    coord_flip() + guides(colour = "none") +
    labs(
      x = NULL, y = NULL, fill = NULL,
      title = ifelse(is.na(title), paste("Frequencies and Percentages"), title),
      subtitle = ifelse(is.na(subtitle),
        paste(
          "Variable:", ifelse(!is.na(variable_name), variable_name, variable),
          note, weight_text
        ),
        subtitle
      ), caption = obs
    ) +
    scale_fill_gradient(low = "lightskyblue2", high = "navy") +
    theme_lares(pal = 4, which = "c", legend = "none", grid = "Xx") +
    scale_y_comma(position = "right", expand = c(0, 0), limits = c(0, 1.03 * max(output$n)))

  # When two features
  if (type == 2) {
    p <- p +
      facet_grid(as.character(.data$facet) ~ ., scales = "free", space = "free") +
      labs(
        subtitle = ifelse(is.na(subtitle),
          paste("Variables:", facet_name, "grouped by", variable, "\n", note, weight_text),
          subtitle
        ),
        caption = obs
      )
  }

  # When three or more features
  else if (type >= 3) {
    if (length(unique(facet_name2)) > 3) {
      p <- freqs_plot(df, ..., top = top, rm.na = rm.na, title = title, subtitle = subtitle)
      return(p)
    }
    p <- p + facet_grid(as.character(.data$facet2) ~ as.character(.data$facet1), scales = "free") +
      labs(
        title = ifelse(is.na(title), "Frequencies and Percentages:", title),
        subtitle = ifelse(is.na(subtitle),
          paste(
            "Variables:", facet_name2, "grouped by", facet_name1, "[x] and",
            variable, "[y]", "\n", note, weight_text
          ),
          subtitle
        ),
        caption = obs
      )
  }

  # Export file name and folder for plot
  if (save) export_plot(p, "viz_freqs", vars, subdir = subdir)
  return(p)
}

reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}

scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}


####################################################################
#' Plot for All Frequencies on Dataframe
#'
#' This function lets the user analize data by visualizing the
#' frequency of each value of each column from a whole data frame.
#'
#' @family Frequency
#' @family Exploratory
#' @family Visualization
#' @param df Data.frame
#' @param max Numeric. Top variance threshold. Range: (0-1].
#' These variables will be excluded
#' @param min Numeric. Minimum variance threshold. Range: [0-1).
#' These values will be grouped into a high frequency (HF) value
#' @param novar Boolean. Remove no variance columns?
#' @param plot Boolean. Do you want to see a plot? Three variables tops
#' @param top Integer. Plot most relevant (less categories) variables
#' @param quiet Boolean. Keep quiet? (or show variables exclusions)
#' @param save Boolean. Save the output plot in our working directory
#' @param subdir Character. Into which subdirectory do you wish to
#' save the plot to?
#' @return Plot when \code{plot=TRUE} and data.frame with grouped frequency results
#' when \code{plot=FALSE}.
#' @examples
#' data(dft) # Titanic dataset
#' freqs_df(dft)
#' freqs_df(dft, plot = TRUE)
#' @export
freqs_df <- function(df,
                     max = 0.9, min = 0.0, novar = TRUE,
                     plot = FALSE, top = 30,
                     quiet = FALSE,
                     save = FALSE,
                     subdir = NA) {
  if (is.vector(df)) {
    temp <- data.frame(values = df)
    ret <- freqs(temp, .data$values)
    return(ret)
  }

  df <- df[!unlist(lapply(df, is.list))]
  names <- lapply(df, function(x) length(unique(x)))
  unique <- as.data.frame(names)
  colnames(unique) <- names(names)
  which <- names(sort(-rank(unique, ties.method = "first")))

  # Too much variance
  no <- names(unique)[unique > nrow(df) * max]
  if (length(no) > 0) {
    if (!quiet) {
      message(paste(
        length(no), "variables with more than", max,
        "variance exluded:", vector2text(no)
      ))
    }
    which <- which[!which %in% no]
  }

  # No variance at all
  if (novar) {
    no <- zerovar(df)
    if (length(no) > 0) {
      if (!quiet) {
        message(paste(
          length(no), "variables with no variance exluded:",
          vector2text(no)
        ))
      }
      which <- which[!which %in% no]
    }
  }

  # Too many columns
  if (length(which) > top) {
    no <- which[(top + 1):length(which)]
    if (!quiet) {
      message(paste(
        "Using the", top, "variables with less distinct categories.",
        length(no), "variables excluded:", vector2text(no)
      ))
    }
    which <- which[1:top]
  }

  if (length(which) > 0) {
    for (i in seq_along(which)) {
      if (i == 1) out <- NULL
      iter <- which[i]
      counter <- table(df[iter], useNA = "ifany")
      res <- data.frame(
        value = names(counter),
        count = as.vector(counter),
        col = iter
      ) %>%
        arrange(desc(.data$count))
      out <- rbind(out, res)
    }
    out <- out %>%
      mutate(p = round(100 * .data$count / nrow(df), 2)) %>%
      mutate(value = ifelse(.data$p > min * 100, as.character(.data$value), "(HF)")) %>%
      group_by(.data$col, .data$value) %>%
      summarise(p = sum(.data$p), count = sum(.data$count)) %>%
      arrange(desc(.data$count)) %>%
      ungroup()
  } else {
    warning("No relevant information to display regarding your data.frame!")
    return(invisible(NULL))
  }

  if (plot) {
    out <- out %>%
      mutate(value = ifelse(is.na(.data$value), "NA", as.character(.data$value))) %>%
      mutate(col = factor(.data$col, levels = which)) %>%
      mutate(label = ifelse(.data$p > 8, as.character(.data$value), "")) %>%
      group_by(.data$col) %>%
      mutate(alpha = log(.data$count)) %>%
      mutate(alpha = as.numeric(ifelse(.data$value == "NA", 0, .data$alpha))) %>%
      mutate(alpha = as.numeric(ifelse(is.infinite(.data$alpha), 0, .data$alpha)))

    p <- ggplot(out, aes(
      x = .data$col, y = .data$count, fill = .data$col,
      label = .data$label, colour = .data$col
    )) +
      geom_col(aes(alpha = .data$alpha),
        position = "fill",
        colour = "80000000", width = 0.95, size = 0.1
      ) +
      geom_text(position = position_fill(vjust = .5), size = 3) +
      coord_flip() +
      labs(x = NULL, y = NULL, title = "Overall Values Frequencies") +
      scale_y_percent(expand = c(0, 0)) +
      guides(fill = "none", colour = "none", alpha = "none") +
      theme_lares(pal = 1) +
      theme(
        panel.background = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
      ) +
      geom_hline(
        yintercept = c(0.25, 0.5, 0.75), linetype = "dashed",
        color = "black", size = 0.5, alpha = 0.3
      )
    if (save) export_plot(p, "viz_freqs_df", subdir = subdir)
    return(p)
  } else {
    out <- select(out, .data$col, .data$value, .data$count, .data$p) %>%
      rename(n = .data$count, variable = .data$col) %>%
      group_by(.data$variable) %>%
      mutate(pcum = round(cumsum(.data$p), 2))
    return(out)
  }
}


####################################################################
#' Combined Frequencies Plot for Categorical Features
#'
#' Plot frequencies of multiple categories within a data.frame in
#' a new fancy way. Tidyverse friendly, based on \code{lares::freqs()},
#' no limits on amount of features to evaluate.
#'
#' @family Frequency
#' @family Exploratory
#' @family Visualization
#' @inheritParams freqs
#' @return Plot. Result of the frequency of combined variables.
#' @examples
#' Sys.unsetenv("LARES_FONT") # Temporal
#' data(dft) # Titanic dataset
#'
#' df <- freqs_plot(dft, Pclass, Survived)
#' head(df$data)
#' plot(df)
#'
#' freqs_plot(dft, Pclass, Survived, Sex, Embarked)
#'
#' freqs_plot(dft, Pclass, Survived, Sex, Embarked, top = 15)
#' @export
freqs_plot <- function(df, ..., top = 10, rm.na = FALSE, abc = FALSE,
                       title = NA, subtitle = NA) {
  vars <- quos(...)

  if (length(vars) == 0) {
    return(freqs(df))
  }

  aux <- df %>%
    freqs(!!!vars, rm.na = rm.na) %>%
    mutate_if(is.factor, as.character) %>%
    mutate_at(seq_along(vars), as.character) %>%
    mutate(order = ifelse(.data$order > top, "...", .data$order))
  if ("..." %in% aux$order) {
    message(paste(
      "Showing", top, "most frequent values. Tail of",
      nrow(aux) - top,
      "other values grouped into one"
    ))
  }
  for (i in 1:(ncol(aux) - 4)) {
    aux[, i][aux$order == "...", ] <- "Tail"
  }
  aux <- aux %>%
    group_by(!!!vars, .data$order) %>%
    summarise_all(sum) %>%
    ungroup()
  vars_names <- colnames(aux)[1:(ncol(aux) - 4)]

  labels <- aux %>%
    mutate_all(as.character) %>%
    tidyr::gather("name", "value", 1:(ncol(aux) - 4)) %>%
    mutate(label = sprintf("%s: %s", .data$name, .data$value)) %>%
    mutate(
      n = as.integer(.data$n),
      pcum = as.numeric(.data$pcum),
      p = as.numeric(.data$p)
    ) %>%
    ungroup() %>%
    arrange(desc(.data$pcum))

  if (is.na(title)) {
    title <- "Absolute Frequencies"
  }
  if (is.na(subtitle)) {
    subtitle <- paste("Grouped by", vector2text(vars_names, quotes = FALSE))
  }
  mg <- 5

  p1 <- aux %>%
    mutate(aux = ifelse(.data$order == "...", "grey", "black")) %>%
    mutate(order = paste0(.data$order, "\n", signif(as.numeric(.data$p), 2), "%")) %>%
    ggplot(aes(x = reorder(.data$order, .data$pcum), y = .data$n, label = .data$p)) +
    geom_col(aes(fill = .data$aux)) +
    scale_fill_manual(values = c("black", "grey55")) +
    labs(x = NULL, y = NULL, title = title, subtitle = subtitle) +
    scale_y_comma() +
    guides(fill = "none") +
    theme_lares() +
    theme(plot.margin = margin(mg, mg, 0, mg))

  p2 <- labels %>%
    {
      if (abc) mutate(., n = -rank(.data$label)) else .
    } %>%
    mutate(label = ifelse(.data$order == "...", " Mixed Tail", .data$label)) %>%
    ggplot(aes(
      x = reorder(.data$order, .data$pcum),
      y = reorder(.data$label, .data$n),
      group = .data$pcum
    )) +
    geom_point(size = 4) +
    labs(
      x = NULL, y = NULL,
      caption = paste("Total observations:", formatNum(nrow(df), 0))
    ) +
    guides(colour = "none") +
    theme_lares(which = "XY") +
    theme(
      axis.text.x = element_blank(),
      plot.margin = margin(0, mg, mg, mg)
    )
  if (length(vars) > 1) {
    p2 <- p2 + geom_path()
  }

  p <- (p1 / p2)
  attr(p, "data") <- aux
  attr(p, "labels") <- labels
  return(p)
}


####################################################################
#' Frequencies on Lists and UpSet Plot
#'
#' Visualize frequency of elements on a list, list vector, or vector with
#' comma separated values. Detect which combinations and elements are
#' the most frequent and how much they represent of your total observations.
#' This is similar to the \href{http://vcg.github.io/upset/}{UpSet Plots}
#' which may be used as an alternative to Venn diagrams.
#'
#' @family Frequency
#' @family Exploratory
#' @family Visualization
#' @inheritParams freqs
#' @param var Variable. Variables you wish to process.
#' @param wt Variable, numeric. Select a numeric column to use
#' in the colour scale, used as sum, mean... of those values for each
#' of the combinations.
#' @param fx Character. Set operation: mean, sum
#' @param rm.na Boolean. Remove NA value from \code{wt}?
#' @param min_elements Integer. Exclude combinations with less than n elements
#' @param limit,limit_x,limit_y Integer. Show top n combinations (x) and/or
#' elements (y). The rest will be grouped into a single element.
#' Set argument to 0 to ignore. \code{limit_x}/\code{limit_y} answer to
#' \code{limit}'s argument.
#' @param tail Boolean. Show tail grouped into "..." on the plots?
#' @param size Numeric. Text base size
#' @param unique Boolean. a,b = b,a?
#' @param plot Boolean. Plot viz? Will be generated anyways in the output object
#' @return List. data.frame with the data results, elements and combinations.
#' @examples
#' \dontrun{
#' df <- dplyr::starwars
#' head(df[, c(1, 4, 5, 12)], 10)
#'
#' # Characters per movies combinations in a list column
#' head(df$films, 2)
#' freqs_list(df, films)
#'
#' # Skin colours in a comma-separated column
#' head(df$skin_color)
#' x <- freqs_list(df, skin_color, min_elements = 2, limit = 5, plot = FALSE)
#' # Inside "x" we'll have:
#' names(x)
#'
#' # Using the 'wt' argument to add a continuous value metric
#' # into an already one-hot encoded columns dataset (and hide tail)
#' csv <- "https://raw.githubusercontent.com/hms-dbmi/UpSetR/master/inst/extdata/movies.csv"
#' movies <- read.csv(csv, sep = ";")
#' head(movies)
#' freqs_list(movies,
#'   wt = AvgRating, min_elements = 2, tail = FALSE,
#'   title = "Movies\nMixed Genres\nRanking"
#' )
#' # So, please: no more Comedy+SciFi and more Drama+Horror films (based on ~50 movies)!
#' }
#' @export
freqs_list <- function(df,
                       var = NULL,
                       wt = NULL, fx = "mean",
                       rm.na = FALSE,
                       min_elements = 1,
                       limit = 10,
                       limit_x = NA,
                       limit_y = NA,
                       tail = TRUE,
                       size = 10,
                       unique = TRUE,
                       abc = FALSE,
                       title = "",
                       plot = TRUE) {
  dff <- df
  var_str <- as_label(enquo(var))
  if (var_str != "NULL") {
    check_opts(var_str, colnames(df))
    colnames(df)[colnames(df) == var_str] <- "which"
  } else {
    # If user has an already one hot encoded data.frame
    duos <- data.frame(
      count = unlist(lapply(df, function(x) length(unique(x))))
    ) %>%
      filter(.data$count == 2)
    if (nrow(duos) == 0) {
      stop("Please, define a valid 'var' column or make sure to have binary/logical columns.")
    }
    duos <- rownames(duos)
    message(paste(">>> Binary columns detected:", v2t(duos)))
    which <- unlist(lapply(df[, duos], function(x) x == 1))
    result <- NULL
    for (i in seq_len(nrow(df[, duos]))) {
      result <- c(result, v2t(colnames(df[, duos])[which[i, ]], quotes = FALSE))
    }
    df$which <- result
  }

  # Weighted column
  wt_str <- as_label(enquo(wt))
  if (wt_str == "NULL") wt_str <- NULL
  if (length(wt_str) > 0) {
    if (wt_str %in% colnames(df)) {
      # message(paste(">>> Colour weight:", fx, wt_str))
      colnames(df)[colnames(df) == wt_str] <- "wt"
      weights <- as.numeric(df$wt)
    } else {
      stop(sprintf("'%s' is not a valid weight column", wt_str))
    }
  } else {
    weights <- 0
  }

  # If list then convert
  if (is.list(df$which)) {
    df$which <- unlist(lapply(df$which, function(x) paste(x, collapse = ",")))
  }

  # check if it's a valid column
  if (!any(grepl(",", df$which))) {
    stop(paste(
      "Your inputs may not be valid: no valid binary columns OR",
      "'var' does NOT contain commas nor is a list vector."
    ))
  }

  # Order values so a,b = b,a
  if (unique) {
    values <- strsplit(unlist(df$which), ",")
    values <- lapply(values, sort)
    values <- unlist(lapply(values, function(x) paste(x, collapse = ",")))
  } else {
    values <- df$which
  }

  # One hot encoding
  vals <- data.frame(var = values, wt = weights) %>%
    filter(.data$var != "") %>%
    {
      if (rm.na) filter(., !is.na(.data$wt)) else .
    } %>%
    group_by(.data$var) %>%
    summarise(
      n = n(),
      wt = ifelse(
        fx == "mean", mean(.data$wt, na.rm = TRUE),
        ifelse(fx == "sum", sum(.data$wt, na.rm = TRUE), 1)
      ),
      .groups = "drop"
    ) %>%
    arrange(desc(.data$n)) %>%
    mutate(p = 100 * .data$n / sum(.data$n), order = row_number()) %>%
    data.frame() %>%
    ohe_commas(var) %>%
    as_tibble(.name_repair = "unique") %>%
    # Remove combinations with less than "min_elements" elements
    mutate(combs = rowSums(.[unlist(lapply(., is.logical))], na.rm = TRUE)) %>%
    filter(.data$combs >= min_elements) %>%
    select(-.data$combs)

  # Set limits
  if (limit == 0) limit <- nrow(df)
  if (is.na(limit_x)) limit_x <- nrow(vals)
  if (is.na(limit_y)) limit_y <- nrow(vals)

  elements <- vals %>%
    tidyr::gather(.) %>%
    mutate(n = rep(vals$n, ncol(vals))) %>%
    mutate(wt = rep(vals$wt, ncol(vals))) %>%
    filter(.data$value == TRUE) %>%
    group_by(.data$key) %>%
    summarise(
      n = sum(.data$n),
      wt = ifelse(
        fx == "mean", mean(.data$wt, na.rm = TRUE),
        ifelse(fx == "sum", sum(.data$wt, na.rm = TRUE), 1)
      ),
      .groups = "drop"
    ) %>%
    {
      if (abc) arrange(., .data$key) else arrange(., desc(.data$n))
    } %>%
    mutate(p = 100 * .data$n / nrow(df), order = row_number()) %>%
    mutate(key = as.factor(.data$key)) %>%
    mutate(label = sprintf("%s (%s)", .data$key, formatNum(.data$p, 0, pos = "%"))) %>%
    mutate(label = ifelse(.data$order > min(limit, limit_y), "...", .data$label)) %>%
    mutate(label = as.factor(.data$label))

  tgthr <- vals %>%
    select(contains("var_")) %>%
    tidyr::gather("var") %>%
    mutate(var = factor(.data$var, levels = rev(elements$key))) %>%
    group_by(.data$var) %>%
    mutate(id = row_number(), order = row_number()) %>%
    mutate(label = sprintf("#%s", .data$id)) %>%
    filter(.data$value == TRUE) %>%
    mutate(label = ifelse(.data$id > min(limit, limit_x), "...", .data$label)) %>%
    ungroup() %>%
    mutate(var = ifelse(
      as.character(.data$var) %in% as.character(elements$key[elements$label != "..."]),
      as.character(.data$var), "..."
    )) %>%
    mutate(var = factor(.data$var, levels = rev(c(as.character(elements$key), "..."))))

  # Amount of data considered
  caption <- sprintf(
    "Observations: %s | Elements: %s",
    formatNum(sum(vals$n), 0),
    formatNum(sum(elements$n), 0)
  )
  if (min_elements > 1) {
    caption <- v2t(
      c(caption, sprintf(
        "Excluding combinations with less than %s elements", min_elements
      )),
      quotes = FALSE, sep = "\n"
    )
  }
  if (!tail) caption <- paste(caption, "Tail combinations suppressed from plot", sep = "\n")

  # Scatter plot: combinations
  p1 <- tgthr %>%
    ggplot(aes(
      x = reorder(.data$label, .data$order),
      y = var, group = .data$label
    )) +
    geom_point(size = 3.5) +
    geom_path() +
    theme_lares(size = size, mg = -1, grid = "XY") +
    labs(x = "Combination rank", y = NULL, caption = caption) +
    theme(
      axis.text.y = element_blank(),
      plot.margin = margin(0, 0, 0, 0)
    ) +
    scale_x_discrete(position = "top")

  # Bar plot: elements
  p2 <- elements %>%
    mutate(label = gsub("var_", "", .data$label)) %>%
    ggplot(aes(
      x = reorder(.data$label, -.data$order),
      y = -.data$n, fill = .data$wt
    )) +
    coord_flip() +
    geom_col() +
    labs(y = "Element Size", x = NULL) +
    theme_lares(size = size, mg = -1, grid = "X") +
    theme(plot.margin = margin(0, 0, 0, 0)) +
    scale_y_continuous(
      position = "right",
      labels = function(x) formatNum(abs(x), 0)
    ) +
    guides(fill = "none")

  # Bar plot: combinations
  p3 <- vals %>%
    mutate(var = ifelse(row_number() > min(limit, limit_x), "...", .data$var)) %>%
    {
      if (!tail) mutate(., n = ifelse(.data$var == "...", 1e-8, .data$n)) else .
    } %>%
    ggplot(aes(
      x = reorder(.data$var, .data$order),
      y = .data$n, fill = .data$wt
    )) +
    geom_col() + # ggfittext::geom_bar_text(size = 9) +
    theme_lares(size = size, mg = -1, grid = "Y") +
    scale_y_continuous(labels = function(x) formatNum(abs(x), 0)) +
    labs(
      x = NULL, y = "Combination Size",
      fill = if (length(wt_str) > 0) {
        paste(str_to_title(fx), wt_str, sep = "\n")
      } else {
        NULL
      }
    ) +
    theme(
      axis.text.x = element_blank(),
      plot.margin = margin(0, 0, 0, 0)
    ) +
    scale_fill_continuous(guide = guide_colourbar(direction = "horizontal"))
  if (length(wt_str) == 0) p3 <- p3 + guides(fill = "none")

  # Join all plots
  layout <- "
  ABBB
  CBBB
  DEEE
  DEEE"
  p <- noPlot(title, size = 3.5) + p3 + guide_area() +
    p2 + p1 + plot_layout(design = layout) +
    plot_layout(guides = "collect")
  if (plot) plot(p)

  # Prepare dataframe outputs
  elements <- mutate(elements, key = gsub("var_", "", .data$key)) %>%
    rename("element" = .data$key) %>%
    select(-.data$label)
  vals <- mutate(vals, var = gsub("var_", "", .data$var)) %>%
    rename("combination" = .data$var)
  colnames(vals) <- gsub("var_", "", colnames(vals))

  results <- list(
    plot = p,
    data = rename(df, "combination" = "which"),
    elements = elements,
    combinations = vals
  )

  return(invisible(results))
}
laresbernardo/lares documentation built on April 25, 2024, 5:31 a.m.