R/plotters.R

Defines functions plot_causal_network plot_missing plot_table

Documented in plot_causal_network plot_missing plot_table

#' Visualize a causal network as directed graph from node and edge data.
#'
#' 
#' @param graph_data A tidygraph object with nodes and edges with "name", "from", "to", and optionally other variables.
#' @param geom A character name of geometric object to represent nodes, either "text" or "point".
#' @param layout A character name of layout algorithm to use in ggraph, defaults to the force layout. See ?ggraph::ggraph for more. 
#' @param str_wrap_width A numeric width for the string wrapping algorithm.
#' @param arrow_curvature A numeric curvature of the edge arrows.
#' 
#' @importFrom magrittr "%>%"
#' @export

plot_causal_network <- function(graph_data,
                                geom = "text",
                                layout = "auto",
                                str_wrap_width = 10,
                                text_size = 5,
                                edge_width = 3,
                                arrow_curvature = 0.05,
                                arrow_length = 2,
                                clip = "off") {
    
    if (geom == "text") {
    
        gg_object <- graph_data %N>%
            mutate(name = stringr::str_wrap(name, width = str_wrap_width)) %>%
            ggraph::ggraph(layout = layout) +
                ggraph::geom_node_text(
                    mapping = ggplot2::aes(label = name), 
                    lineheight = 0.8,
                    size = text_size
                ) +
                ggraph::geom_edge_arc(
                    mapping = ggplot2::aes(
                        start_cap = ggraph::label_rect(node1.name), 
                        end_cap = ggraph::label_rect(node2.name)
                    ),
                    width = edge_width,
                    curvature = arrow_curvature,
                    arrow = grid::arrow(length = grid::unit(arrow_length, 'mm'), type = "closed", angle = 10)
                ) + 
                ggraph::theme_graph() +
                ggplot2::coord_cartesian(clip = clip)
        
    } else if (geom == "point") {
        
        gg_object <- graph_data %N>%
            mutate(name = stringr::str_wrap(name, width = str_wrap_width)) %>%
            ggraph::ggraph(layout = layout) +
            ggraph::geom_edge_arc(
                curvature = arrow_curvature,
                arrow = grid::arrow(length = grid::unit(arrow_length, 'mm'), type = "closed", angle = 10)
            ) + 
            ggraph::geom_node_point() +
            ggraph::theme_graph() +
            ggplot2::coord_cartesian(clip = clip)
        
    } else {
        
        stop(stringr::str_c("geom", geom, "has not been implemented", sep = " "))
    
    }
    
    return(gg_object)
}

#' Visualize missing value cells in dataframes.
#'
#' For plotting a table of a dataset where missingness is mapped to cell/tile color.
#' Remember the distinction between actual and structural missingness due to the data-format and impossible values.
#' You can easily explore the patterns in missigness visually with this function by sorting the dataset differently before plotting. 
#'
#' @param data A dataframe.
#' @param color_values A character vector of two color hex values. The second one is for the missing values.
#' @param text_size An integer size for the variable names on the y-axis.
#' @param variables_max_n The maximum number of variables that can be plotted. Prevents unwanted slow plotting.
#' 
#' @return A ggplot2 object of the visualized table.
#' @export


plot_missing <- function(data,
                         tile_colors = c("#f2f4fb", "#c30000"),
                         text_size = 9,
                         variables_max_n = 1000) {
    
    if (ncol(data) > variables_max_n) {
        stop("Plotting more variables can be very slow. Adjust the *variables_max_n* argument, if you want to do it.")
    }
    
    plot_result <- data %>%
        dplyr::mutate(row = dplyr::row_number()) %>%
        tidyr::gather("variable", "value", -row) %>%
        dplyr::mutate(missing = dplyr::if_else(is.na(value), TRUE, FALSE)) %>%
        ggplot2::ggplot() +
            ggplot2::geom_tile(ggplot2::aes(
                x = row, 
                y = forcats::fct_relevel(variable, colnames(data)), 
                fill = missing
            )) +
            ggplot2::scale_fill_manual(values = tile_colors, guide = "none") +
            ggplot2::theme_void() +
            ggplot2::theme(
                axis.text.y = ggplot2::element_text(hjust = 1, size = text_size),
                plot.margin = ggplot2::margin(0, 0.5, 0, 0.5, "cm")
            ) +
            ggplot2::scale_x_discrete(position = "top") +
            ggplot2::coord_cartesian(clip = "off")
            
    return(plot_result)

}

#' Create png and svg tables.
#'
#' For creating very simple, easily post-modifiable (svg) tables from small dataframes.
#'
#' @param input A dataframe to plot.
#' @param significant_digits How many digits to spare when rounding long numbers.
#' @param str_wrap_width How many characters to approximately print on one line.
#' @return A ggplot2 object.
#' @export

plot_table <- function(input, significant_digits = 5, str_wrap_width = 10) {

    # dont print massive tables
    stopifnot(nrow(input) < 50, ncol(input) < 20)

    # --- number formatter ----
    number_patterns <- rep("0", times = significant_digits) %>%
        accumulate(str_c) %>%
        str_c("\\.", ., "$") %>%
        prepend(str_sub(., start = 3)) %>%
        rev()
    number_replacement <- number_patterns %>%
        str_replace_all(c("0" = "  ", "\\." = "  ")) %>%
        str_remove_all("[^[:space:]]")
    replacement_map <- number_replacement %>%
        set_names(number_patterns)

    format_numbers <- . %>%
        signif(digits = significant_digits) %>%
        format(scientific = FALSE, trim = FALSE) %>%
        str_replace_all(" ", "  ") %>%
        str_replace_all(replacement_map)

    # --- text formatter -----
    format_text <- . %>%
        stringr::str_replace_all("[^[:alnum:]]+", " ") %>%
        stringr::str_wrap(str_wrap_width)

    # --- re-format input ---
    input_plotformat <- input %>%
        mutate_if(is.character, format_text) %>%
        mutate_if(is.numeric, format_numbers) %>%
        mutate_all(funs(if_else(is.na(.), "", .))) %>%
        mutate(
            row = row_number(),
            even_row = if_else(row %% 2 == 0, TRUE, FALSE)
        ) %>%
        gather("variable", "value", -row, -even_row)

    # --- plot ---
    plot <- input_plotformat %>%
        ggplot() +
            geom_tile(
                aes(x = reorder(variable, row), y = row, fill = even_row),
                width = 0.97
            ) +
            geom_text(
                aes(x = reorder(variable, row), y = row, label = value),
                color = "#565656",
                lineheight = 0.8,
                size = 3
            ) +
            scale_x_discrete(position = "top")  +
            scale_fill_manual(values = c("#ffffff", "#f8f8f8"), guide = "none") +
            scale_y_continuous() +
            theme_void() +
            theme(axis.text.x = element_text(
                    margin = margin(t = 10, b = -15),
                    color = "#3c3c3c",
                    face = "bold"),
                  panel.background = element_rect(fill = "#ffffff", color = "#ffffff")
            )

    return(plot)
}

#' Visualize strings, like unique names, with a wordcloud.
#'
#' For visualizing wordclouds of large numbers of strings.
#' Note: only about 500 strings can be visualized at once without problems.
#'
#' @param data The input dataframe.
#' @param label An unquoted name of the variable to be visualized.
#' @param title A plot title string.
#'
#' @return A ggplot2 object of the wordcloud.
#' @export

plot_strings <- function(data, label) {
    # supports unquoted input
    label <- enquo(label)
    
    p <- data %>%
        mutate_all(stringr::str_wrap, width = 15) %>%
        # ggrepel adjusts the label positions (x, y)
        ggplot(aes(x = 1, y = 1, label = !!label)) +
            ggrepel::geom_text_repel(
                segment.size = 0,
                max.iter = 5000,
                force = 1,
                # text size in relation to number of labels
                size = (1 / log(nrow(data))) * 15,
                lineheight = 0.8
            ) +
            theme_void() +
            theme(plot.title = element_text(size = 40))

    return(p)

}

#' Visualize the distributions of many variables as one plot matrix.
#' 
#' @param data The input dataframe.
#' @param geom The geometric object to use; points represent observations (one per row) and bars represent counts (one per unique value).
#' @param plot_columns_n The number of plot columns.
#' 
#' @return A ggplot2 object of the resulting plots.
#' @export

plot_variables <- function(input, geom = "point", plot_columns_n = 2) {
    
    if (geom == "density") {
        
        variable_r_classes <- purrr::map_chr(input, class)
        
        if (all(variable_r_classes %in% c("numeric", "integer"))) {
            
            plot <- input %>%
                gather("variable", "value") %>%
                ggplot() +
                    geom_density_ridges(
                        aes(x = value, y = variable), 
                        scale = 4
                    ) +
                    theme_ridges() +
                    scale_y_discrete(expand = c(0.01, 0)) +
                    theme(
                        axis.text.y = element_text(size = 8),
                        axis.text.x = element_text(size = 8)
                    )
            
            return(plot)
            
        } else {
            
            stop("densities are not applicable for non-numeric variables")
        
        }            
    }
    
    plot_variable <- function(var, var_name, geom) {
        
        if (geom == "point") {
            
            if (is.numeric(var) | is.Date(var)) {
            
                p <- var %>%
                    tibble::tibble(x = .) %>%
                    dplyr::mutate(mock = "") %>%
                    ggplot2::ggplot() +
                        ggplot2::geom_jitter(
                            ggplot2::aes(x = x, y = mock), 
                            width = 0, 
                            height = 0.15, 
                            alpha = 0.1,
                            stroke = 0 
                        ) +
                        ggplot2::theme_minimal() +
                        ggplot2::theme(
                            axis.text.y = ggplot2::element_blank(), 
                            axis.title.y = ggplot2::element_blank(),
                            panel.grid.major.y = ggplot2::element_blank(),
                            panel.grid.minor = ggplot2::element_blank(),
                            axis.text.x = ggplot2::element_text(
                                hjust = 1, 
                                angle = 25, 
                                size = 7
                            )
                        ) +
                        scale_x_continuous(
                            name = var_name,
                            labels = function(x) format(x, scientific = FALSE)
                        )
            } else {
                # for non-numeric data
                p <- var %>%
                    tibble::tibble(x = .) %>%
                    mutate(x = if_else(is.na(x), "Missing", x)) %>%
                    dplyr::mutate(mock = "") %>%
                    ggplot2::ggplot() +
                        ggplot2::geom_jitter(
                            ggplot2::aes(x = x, y = mock), 
                            width = 0.2, 
                            height = 0.2, 
                            alpha = 0.15
                        ) +
                        ggplot2::theme_minimal() +
                        ggplot2::theme(
                            axis.text.y = ggplot2::element_blank(), 
                            axis.title.y = ggplot2::element_blank(),
                            panel.grid.major.y = ggplot2::element_blank(),
                            panel.grid.minor = ggplot2::element_blank(),
                            axis.text.x = ggplot2::element_text(
                                hjust = 1, 
                                angle = 25, 
                                size = 7
                            )
                        ) +
                        scale_x_discrete(
                            name = var_name, 
                            drop = FALSE,
                            labels = function(x) format(x, scientific = FALSE)
                        )
            }
        
        }
        
        if (geom == "bar") {
            
            if (is.numeric(var) | is.Date(var)) {
            
                p <- var %>%
                    tibble::tibble(x = .) %>%
                    ggplot() +
                        geom_histogram(aes(x = x), bins = 20) +
                        theme_minimal() +
                        ggplot2::theme(
                            plot.margin = ggplot2::margin(20, 20, 20, 20),
                            panel.grid.minor = ggplot2::element_blank(),
                            panel.grid.major.x = ggplot2::element_blank(),
                            plot.title = element_text(
                                hjust = 0,
                                vjust = 1,
                                size = 8
                            ),
                            axis.title.y = element_blank(),
                            axis.title.x = element_text(size = 10)
                        ) +
                        scale_x_continuous(
                            name = var_name,
                            labels = function(x) format(x, scientific = FALSE)
                        ) +
                        coord_cartesian(clip = "off") +
                        # labs(tag = "Count")
                        ggtitle("Count")
                
            } else {
                # for non-numeric data
                p <- var %>%
                    tibble::tibble(x = .) %>%
                    mutate(x = if_else(is.na(x), "Missing", x)) %>%
                    ggplot() +
                        geom_bar(aes(x = x, y = ..count..)) +
                        coord_flip(clip = "off") +
                        theme_minimal() +
                        ggplot2::theme(
                            plot.margin = ggplot2::margin(20, 20, 20, 20),
                            panel.grid.minor = ggplot2::element_blank(),
                            panel.grid.major.x = ggplot2::element_blank(),
                            plot.title = element_text(
                                hjust = 0,
                                vjust = 1,
                                size = 8
                            ),
                            axis.title.y = element_blank(),
                            axis.title.x = element_text(size = 8),
                            axis.text.y = element_text(hjust = 1) 
                        ) +
                        scale_x_discrete(
                            drop = FALSE
                        ) +
                        scale_y_continuous(
                            limits = c(0, length(var)),
                            name = "Count"
                        ) +
                        # labs(tag = var_name)
                        ggtitle(var_name)
                        
                    

            }                
        }
        
        return(p)
    }        
    
    var_names <- colnames(input)
    
    plots <- purrr::map2(input, var_names, plot_variable, geom = geom) %>%
        # adding plots is a patchwork-package feature
        purrr::reduce(`%+%`) +
        patchwork::plot_layout(ncol = plot_columns_n)
    
    return(plots)
}
eteppo/tvs-project documentation built on Aug. 13, 2019, 8:53 a.m.