R/plot_scatter.R

Defines functions plot_scatter

Documented in plot_scatter

#' Generates a scatter plot of data for two continuous variables
#'
#'
#' @description
#' `plot_scatter` returns a ggplot2 object of data from two continuous
#' variables.  Can indicate regression line and its confidence
#' interval,prediction intervals regression residuals and more. This function
#' requires as input an esci_estimate object generated by [esci::estimate_r()]
#'
#'
#' @inherit plot_describe details
#'
#'
#' @param estimate - an esci_estimate object generated by [esci::estimate_r()]
#'
#' @param show_line - Boolean; defaults to FALSE; set to TRUE to show the
#'   regression line
#' @param show_line_CI  - Boolean; defaults to FALSE; set to TRUE to show the
#'   confidence interval on the regression line
#' @param show_PI - Boolean; defaults to FALSE; set to TRUE to show prediction
#'   intervals
#' @param show_residuals - Boolean; defaults to FALSE; set to TRUE to show
#'   residuals of prediction
#' @param show_mean_lines - Boolean; defaults to FALSE; set to TRUE to plot
#'   lines showing the mean of each variable
#' @param show_r - Boolean; defaults to FALSE; set to TRUE to print the *r*
#'   value and its CI on the plot
#' @param predict_from_x - Optional real number in the range of the x variable
#'   for the plot; Defaults to NULL; if passed, the graph shows the predicted Y'
#'   for this x value
#' @param plot_as_z - Boolean; defaults to FALSE; set to TRUE to convert x and y
#'   scores to z scores prior to plotting
#' @param ggtheme - Optional ggplot2 theme object to control overall styling;
#'   defaults to [ggplot2::theme_classic()]
#'
#'
#' @inherit plot_describe return
#'
#'
#' @inherit estimate_r examples
#'
#' @export
plot_scatter <- function(
    estimate,
    show_line = FALSE,
    show_line_CI = FALSE,
    show_PI = FALSE,
    show_residuals = FALSE,
    show_mean_lines = FALSE,
    show_r = FALSE,
    predict_from_x = NULL,
    plot_as_z = FALSE,
    ggtheme = ggplot2::theme_classic()
) {

    # Check inputs ---------------------------
  # Needs to be of class estimate
  esci_assert_type(estimate, "is.estimate")
  y <- type <- upr <- lwr <- predicted <- NULL

  # Check raw data available
  if (is.null(estimate$raw_data) | nrow(estimate$raw_data) == 0) {
    err_string <- stringr::str_interp(
      "This plot only works for estimates with raw data available."
    )
    stop(err_string)
  }

  warnings <- NULL

  plotting_groups <- !is.null(estimate$overview$grouping_variable_name)
  if (plotting_groups) {
    if (!is.null(predict_from_x)) {
      warning("Ignoring predict_from_x because the estimate passed is a difference in r; Not able to do predictions for that type of estimate yet.")
      predict_from_x <- NULL
    }
    if (show_PI){
      warning("Ignoring request to show prediction intervals (show_PI) because estimate passed is a difference in r; Not able to show prediction intervals for this type of estimate yet.")
      show_PI <- FALSE
    }
    if (show_residuals){
      warning("Ignoring request to show residuals (show_residuals) because estimate passed is a difference in r; Not able to show residuals for this type of estimate yet.")
      show_residuals <- FALSE
    }
  }

  rdata <- estimate$raw_data

  x_mean <- mean(rdata$x, na.rm = TRUE)
  x_sd <- sd(rdata$x, na.rm = TRUE)
  y_mean <- mean(rdata$y, na.rm = TRUE)
  y_sd <- sd(rdata$y, na.rm = TRUE)

  if (!plotting_groups & plot_as_z) {
    rdata$x <- (rdata$x - x_mean) / x_sd
    rdata$fit <- (rdata$fit -  y_mean) / y_sd
    rdata$lwr <- (rdata$lwr-  y_mean) / y_sd
    rdata$upr <- (rdata$upr -  y_mean) / y_sd
    rdata$y <- (rdata$y - y_mean) / y_sd

  }

  # predict_from_x
  if(!is.null(predict_from_x)) {

    if(!is.numeric(predict_from_x)) {
      predict_from_x <- NULL
      warning_string <- stringr::str_interp("'predict_from_x' ignored because it is not numeric, instead it is ${class(predict_from_x)}.")
      warnings <- c(warnings, warning_string)
      warning(warning_string)
    } else {
      if(predict_from_x > max(rdata$x, na.rm = TRUE)) {
        warning_string <- stringr::str_interp("'predict_from_x' ignored because it is out of range.  predict_from_x = ${predict_from_x}, but highest x value is $[.2f]{max(rdata$x, na.rm = TRUE)}.")
        warnings <- c(warnings, warning_string)
        warning(warning_string)
        predict_from_x <- NULL
      } else {
        if(predict_from_x < min(rdata$x, na.rm = TRUE)) {
          warning_string <- stringr::str_interp("'predict_from_x' ignored because it is out of range.  predict_from_x = ${predict_from_x}, but lowest x value is $[.2f]{min(rdata$x, na.rm = TRUE)}.")
          warnings <- c(warnings, warning_string)
          warning(warning_string)
          predict_from_x <- NULL

        }
      }
    }
  }


  # Prep for building graph ---------------------------
  zfix <- if (plot_as_z & !plotting_groups) "*Z*<sub>" else NULL
  zpost <- if (plot_as_z & !plotting_groups) "</sub>" else NULL

  ylab <- paste(zfix, estimate$es_r$y_variable_name, zpost, sep = "")
  xlab <- paste(zfix, estimate$es_r$x_variable_name, zpost, sep = "")


  if (!plotting_groups) {
    myslope <- estimate$regression$values[2]
    myintercept <- estimate$regression$values[1]
    myr <- estimate$es_r$effect_size[[1]]
    if (plot_as_z) {
      rdata$predicted <- rdata$x * myr
    } else{
      rdata$predicted <- rdata$x * myslope + myintercept
    }
  }


  # Build the plot

  # Points and regression line
  if (plotting_groups) {

    rdata$type <- "Unused"
    rdata[rdata$grouping_variable == estimate$es_r_difference$effect[[1]], ]$type <- "Comparison"
    rdata[rdata$grouping_variable == estimate$es_r_difference$effect[[2]], ]$type <- "Reference"
    my_labels <- c(
      paste(estimate$es_r_difference$effect[[1]], ": *r* = ", format( estimate$es_r_difference$effect_size[[1]], digits = 2), sep = ""),
      paste(estimate$es_r_difference$effect[[2]], ": *r* = ", format( estimate$es_r_difference$effect_size[[2]], digits = 2), sep = ""),
      if (nrow(estimate$es_r) > 2) "All others" else NULL
    )

    my_fills <- c(
      "Reference" =  "#008DF9",
      "Comparison" = "#009F81",
      "Unused" = "gray65"
    )
    my_shapes <- c(
      "Reference" =  "square filled",
      "Comparison" = "circle filled",
      "Unused" = "diamond filled"
    )
    my_colour <- c(
      "Reference" =  "black",
      "Comparison" = "black",
      "Unused" = "black"
    )
    my_alphas <- c(
      "Reference" =  0.75,
      "Comparison" = 0.75,
      "Unused" = .2
    )
    my_sizes <- c(
      "Reference" =  4,
      "Comparison" = 4,
      "Unused" = 1
    )


    # Base and theme
    myplot <- ggplot2::ggplot(
      data = rdata,
      ggplot2::aes(
        x = x,
        y = y,
        group = type
      )
    )

    myplot <- myplot + ggplot2::geom_point(
      ggplot2::aes(
        color = type,
        fill = type,
        alpha = type,
        size = type,
        shape = type
      )
    )
    myplot <- esci_plot_layers(myplot, "raw_point")

    myplot <- myplot + ggtheme
    scale_title <- estimate$es_r$grouping_variable_name[[1]]
    myplot <- myplot + ggplot2::scale_color_manual(values = my_colour, labels = my_labels, name = scale_title)
    myplot <- myplot + ggplot2::scale_fill_manual(values = my_fills, labels = my_labels, name = scale_title)
    myplot <- myplot + ggplot2::scale_shape_manual(values = my_shapes, labels = my_labels, name = scale_title)
    myplot <- myplot + ggplot2::scale_alpha_manual(values = my_alphas, labels = my_labels, name = scale_title)
    myplot <- myplot + ggplot2::scale_size_manual(values = my_sizes, labels = my_labels, name = scale_title)

    myplot$esci_scale_labels <- my_labels


    plot_levels <- c(
      "#008DF9" = "Reference",
      "#009F81" = "Comparison"
    )

    for (x in 1:length(plot_levels)) {
      next_up <- plot_levels[x]
      this_data <- rdata[rdata$type == next_up, ]
      if (nrow(this_data) > 0) {

        if (show_line_CI) {
          myplot <- myplot + ggplot2::geom_smooth(
            data = this_data,
            formula = y ~ x,
            method='lm',
            se = TRUE,
            level = estimate$properties$conf_level,
            color = NA,
            fill = names(plot_levels[x]),
            alpha = 0.25
          )
          myplot <- esci_plot_layers(myplot, paste("summary_", next_up, "_line_CI", sep = ""))

        }

        if (show_line) {        myplot <- myplot + ggplot2::geom_smooth(
          data = this_data,
          formula = y ~ x,
          method='lm',
          se = TRUE,
          level = estimate$properties$conf_level,
          color = names(plot_levels[x]),
          fill = NA
        )
        myplot <- esci_plot_layers(myplot, paste("summary_", next_up, "_line", sep = ""))


        }

      }
    }


  } else {
    # Base and theme
    myplot <- ggplot2::ggplot(
      data = rdata,
      ggplot2::aes(
        x = x,
        y = y
      )
    )
    myplot <- myplot + ggtheme


    if (show_mean_lines) {
      myplot <- myplot + ggplot2::geom_hline(
        yintercept = mean(rdata$y, na.rm = TRUE),
        linetype = "dotted",
        color = "black"
      )
      myplot <- myplot + ggplot2::geom_vline(
        xintercept = mean(rdata$x, na.rm = TRUE),
        linetype = "dotted",
        color = "black"
      )
    }

    # Regression line CI
    if (show_line_CI) {
      myplot <- myplot + ggplot2::geom_smooth(
        data = rdata,
        formula = y ~ x,
        method='lm',
        se = TRUE,
        level = estimate$properties$conf_level,
        color = NA,
        fill = "blue",
        alpha = 0.25
      )
      myplot <- esci_plot_layers(myplot, "summary_Reference_line_CI")
    }

    # Regression line
    if (show_line) {
      myplot <- myplot + ggplot2::geom_smooth(
        data = rdata,
        formula = y ~ x,
        method='lm',
        se = TRUE,
        level = estimate$properties$conf_level,
        color = "black",
        fill = NA
      )
      myplot <- esci_plot_layers(myplot, "summary_Reference_line")
    }

    # Points
    myplot <- myplot + ggplot2::geom_point(
      shape = 21,
      fill = "NA",
      colour = "blue",
      size = 3
    )
    myplot <- esci_plot_layers(myplot, "raw_Reference_point")
  }


  # Prediction intervals
  if (show_PI) {
    myplot <- myplot + ggplot2::geom_line(
      data = rdata,
      ggplot2::aes(y=upr, x = x),
      color = "red",
      linetype = "dashed"
    )
    myplot <- esci_plot_layers(myplot, "prediction_interval_upper")

    myplot <- myplot + ggplot2::geom_line(
      data = rdata,
      ggplot2::aes(y=lwr, x = x),
      color = "red",
      linetype = "dashed"
    )
    myplot <- esci_plot_layers(myplot, "prediction_interval_lower")

  }


  # Residuals
  if (show_residuals) {
    myplot<- myplot + ggplot2::geom_segment(
      data = rdata,
      ggplot2::aes(
        x = x,
        xend = x,
        y = y,
        yend = predicted
      ),
      colour = "red"
    )
    myplot <- esci_plot_layers(myplot, "residuals")
  }


  # Style options
  if (!plotting_groups) {
    myplot <- myplot + theme(legend.position="none")
  } else {
    myplot <- myplot + theme(legend.position=c(0.15,1))
  }
  myplot <- myplot + theme(axis.line = element_line(size = 1, linetype = "solid"))


  if(!is.null(predict_from_x)) {
    ny_min <- min(ggplot2::layer_scales(myplot)$y$range$range)
    ny_max <- max(ggplot2::layer_scales(myplot)$y$range$range)
    nx_min <- min(ggplot2::layer_scales(myplot)$x$range$range)
    nx_max <- max(ggplot2::layer_scales(myplot)$x$range$range)
    nxrange <- nx_max - nx_min

    if (plot_as_z) {
      ypr <- (predict_from_x * myr)
    } else {
      ypr <- (predict_from_x * myslope) + myintercept
    }



    myplot <- myplot + geom_segment(linetype = "dotted", aes(x = predict_from_x, xend = predict_from_x, y = ny_min - (0.05* (ny_max - ny_min)), yend = ypr), alpha = 0.5, size = 2)
    myplot <- esci_plot_layers(myplot, "prediction_vertical_line")
    myplot <- myplot + geom_segment(linetype = "dotted", aes(x = nx_min - 0.20*nxrange, xend = predict_from_x, y = ypr, yend = ypr), alpha = 1, size = 2)
    myplot <- esci_plot_layers(myplot, "prediction_horizontal_line")

    yplabel <- paste(zfix, "*\U0176*", zpost, " = ", format(ypr, digits = 4), sep = "")
    myplot <- myplot + ggtext::geom_richtext(
      ggplot2::aes(
        x = nx_min - 0.20*nxrange,
        y = ypr,
        label = yplabel
      ),
      text.color = "red",
      fill = "white",
      label.color = NA
    )
    myplot <- esci_plot_layers(myplot, "prediction_y_label")

    xlabel <- paste(zfix, "*X*", zpost, " =", predict_from_x, sep = "")
    y_half <- ny_min - (0.05* (ny_max - ny_min))
    myplot <- myplot + ggtext::geom_richtext(
      ggplot2::aes(
        x = predict_from_x,
        y = y_half,
        label = xlabel
      ),
    text.color = "red",
    fill = "white",
    label.color = NA
    )
    myplot <- esci_plot_layers(myplot, "prediction_x_label")

    pi_input <- if (plot_as_z) (predict_from_x * x_sd) + x_mean else predict_from_x

    xlab <- paste(xlab, "\n<br>At ", zfix, "*X*", zpost, " =", predict_from_x, ": ", zfix, "*\U0176*", zpost, " = ", format(ypr, digits = 4))

    if (show_line_CI) {
      ci <- predict(estimate$properties$lm, interval = "confidence", newdata = data.frame(x = pi_input), level = estimate$properties$conf_level)
      if (plot_as_z) {
        ci[1, "lwr"] <- (ci[1, "lwr"] - y_mean) / y_sd
        ci[1, "upr"] <- (ci[1, "upr"] - y_mean) / y_sd
      }
      xlab <- paste(
        xlab,
        "<br>",
        format(estimate$properties$conf_level*100, digits=1),
        "% CI[", format(ci[1, "lwr"], digits=3),
        ",", format(ci[1, "upr"], digits=3),
        "]",
        sep = ""
      )
      myplot <- myplot + geom_segment(alpha = 0.1, size = 2, color = "blue", aes(x = predict_from_x, xend = predict_from_x, y=ci[1, "lwr"], yend = ci[1, "upr"]))
      myplot <- esci_plot_layers(myplot, "prediction_confidence_interval")
    }


    if (show_PI) {
      pi <- predict(estimate$properties$lm, interval = "prediction", newdata = data.frame(x = pi_input), level = estimate$properties$conf_level)
      if (plot_as_z) {
        pi[1, "lwr"] <- (pi[1, "lwr"] - y_mean) / y_sd
        pi[1, "upr"] <- (pi[1, "upr"] - y_mean) / y_sd
      }
      xlab <- paste(xlab, "<br>", format(estimate$properties$conf_level*100, digits=1), "% PI[", format(pi[1, "lwr"], digits=3), ",", format(pi[1, "upr"], digits=3), "]", sep = "")
      myplot <- myplot + geom_segment(alpha = 0.1, size = 2, color = "red", aes(x = predict_from_x, xend = predict_from_x, y=pi[1, "lwr"], yend = pi[1, "upr"]))
      myplot <- esci_plot_layers(myplot, "prediction_prediction_interval")
    }



    myplot <- myplot + annotate("point", x = predict_from_x, y = ypr, colour = "red", shape = 23, size=4, fill="white")
    myplot <- esci_plot_layers(myplot, "prediction_point")

  }

  # Default scale will be same z range for x and y
  y_min <- min(ggplot2::layer_scales(myplot)$y$range$range)
  y_max <- max(ggplot2::layer_scales(myplot)$y$range$range)
  x_min <- min(ggplot2::layer_scales(myplot)$x$range$range)
  x_max <- max(ggplot2::layer_scales(myplot)$x$range$range)
  y_mean <- mean(rdata$y, na.rm = TRUE)
  x_mean <- mean(rdata$x, na.rm = TRUE)
  y_s <- sd(rdata$y, na.rm = TRUE)
  x_s <- sd(rdata$x, na.rm = TRUE)


  if (show_r) {
    myplot <- myplot + ggtext::geom_richtext(
      ggplot2::aes(
        x = x_min + (0.10 * (x_max - x_min)),
        y = y_max - (0.05 * (y_max - y_min)),
        label = paste("*r* = ", format(myr, digits = 2))
      ),
      text.color = "black",
      fill = NA,
      label.color = NA,
      size = 5
    )
    myplot <- esci_plot_layers(myplot, "r_label")
  }

  mypadding <- 0.05

  y_min <- y_min - (mypadding * (y_max - y_min))
  y_max <- y_max + (mypadding * (y_max - y_min))
  x_min <- x_min - (mypadding * (x_max - x_min))
  x_max <- x_max + (mypadding * (x_max - x_min))

  if (!is.null(predict_from_x)) {
    x_min <- x_min - (mypadding * (x_max - x_min))
  }


  y_zmin <- (y_min - y_mean) / y_s
  y_zmax <- (y_max - y_mean) / y_s
  x_zmin <- (x_min - x_mean) / x_s
  x_zmax <- (x_max - x_mean) / x_s

  if (y_zmin < x_zmin) {
    x_min <- (x_s * y_zmin) + x_mean
  } else {
    y_min <- (y_s * x_zmin) + y_mean
  }

  if (y_zmax > x_zmax) {
    x_max <- (x_s * y_zmax) + x_mean
  } else {
    y_max <- (y_s * x_zmax) + y_mean
  }

  myplot <- myplot + ggplot2::scale_y_continuous(
    limits = c(y_min, y_max),
    expand = c(0, 0),
    name = ylab
  )

  myplot <- myplot + ggplot2::scale_x_continuous(
    limits = c(x_min, x_max),
    expand = c(0, 0),
    name = xlab
  )

  myplot <- myplot + ggplot2::xlab(xlab)
  myplot <- myplot + ggplot2::ylab(ylab)


  myplot$esci_ymin <- y_min
  myplot$esci_ymax <- y_max
  myplot$esci_xmin <- x_min
  myplot$esci_xmax <- x_max

  myplot$esci_xlab <- xlab
  myplot$esci_ylab <- ylab


  myplot <- myplot + ggplot2::theme(
    axis.text.y = ggtext::element_markdown(),
    axis.title.y = ggtext::element_markdown(),
    axis.text.x = ggtext::element_markdown(),
    axis.title.x = ggtext::element_markdown(),
    legend.title = ggtext::element_markdown(),
    legend.text = ggtext::element_markdown()
  )

  myplot$warnings <- warnings

  return(myplot)
}
rcalinjageman/esci documentation built on March 29, 2024, 7:30 p.m.