# R/plot_pca.R

# Defines functions plot_pca

# Documented in plot_pca

#' plot_pca
#'
#' Creates plots from the PCA components as well as from the calculated PCA Euclidean distances.
#'
#' @export
#' @param pca_dat The output from 'pah_pca', which includes the component data as well as the calculated
#' Euclidean distances data. Two elements are read: `pca_distance` (columns `source` and `euc_dist`) when
#' plot_type = 'distance_boxplot', and `pca_dat` (component columns plus a `type` column) when
#' plot_type = 'pca_components'.
#' @param plot_type Which plot to create, given as a single string. Options include 'distance_boxplot',
#' which summarizes the Euclidean distances by source, and 'pca_components', which shows where the sources
#' and samples lie in all pairwise combinations of the chosen components.
#' @param source_abbreviation logical, whether source abbreviations should be used when
#' plot_type = 'distance_boxplot'. When FALSE, abbreviations are replaced by the `source_short_no_ref`
#' names looked up in `pah_sources`.
#' @param pah_sources a dataframe of source profiles. The default is `sources`, but users can provide
#' their own table. This is useful if the user has a source profile to add to the default table. It is only
#' read when plot_type = 'distance_boxplot' and source_abbreviation = FALSE, and must then contain the columns
#' `source_abbrev` and `source_short_no_ref`.
#' @return If plot_type = 'distance_boxplot', a single boxplot is returned with source IDs on the x axis
#' (ordered by increasing median distance) and Euclidean distances on the y axis.
#' If plot_type = 'pca_components', a scatterplot of every pairwise combination of the chosen components is
#' included in a panel matrix, with samples as black dots and sources as red dots that are labeled with the
#' source abbreviation. To see the full names of each source abbreviation, see table source_ratios.
#' At least two components are required for this plot type; otherwise an error is raised.
#' Any other plot_type raises a warning and no plot is created.
#' @import ggplot2
#' @import dplyr
#' @importFrom ggrepel geom_text_repel
#' @importFrom cowplot plot_grid
#' @examples
#' \dontrun{
#' # `pca_out` is the list returned by pah_pca()
#' plot_pca(pca_out, plot_type = "distance_boxplot")
#' plot_pca(pca_out, plot_type = "pca_components")
#' }
plot_pca <- function(pca_dat,
                     plot_type = "distance_boxplot",
                     source_abbreviation = FALSE,
                     pah_sources = sources) {
  # `if` needs a scalar, non-missing condition; fail early with a clear message.
  if (length(plot_type) != 1 || is.na(plot_type)) {
    stop("plot_type must be a single, non-missing value.")
  }

  if (plot_type == "distance_boxplot") {
    distance <- pca_dat$pca_distance

    if (!source_abbreviation) {
      # Swap each source abbreviation for its short name from pah_sources.
      distance <- distance %>%
        left_join(select(pah_sources, source_abbrev, source_short_no_ref),
                  by = c("source" = "source_abbrev")) %>%
        select(-source) %>%
        rename(source = source_short_no_ref)
    }

    # Order the boxes along the x axis by increasing median distance.
    median_diff <- distance %>%
      group_by(source) %>%
      summarize(median = median(euc_dist)) %>%
      arrange(median)

    distance$source <- factor(distance$source, levels = median_diff$source)

    p <- ggplot(distance, aes(x = source, y = euc_dist)) +
      geom_boxplot(outlier.shape = 1) +
      theme_bw() +
      labs(y = "Euclidean distance\n(zero = identical to sample)", x = "") +
      theme(panel.grid.minor = element_blank(),
            axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

    return(p)
  }

  if (plot_type == "pca_components") {
    comp_dat <- pca_dat$pca_dat

    # The component count is taken as "all columns but two".
    # NOTE(review): assumes the leading columns are the components and that
    # exactly two trailing columns (one of them `type`) are not -- confirm
    # against pah_pca().
    n_pca <- if (is.null(comp_dat)) 0 else ncol(comp_dat) - 2

    # With fewer than two components there is no pair to plot. Without this
    # guard, `1:(n_pca - 1)` would count backwards and index the wrong columns.
    if (n_pca < 2) {
      stop("plot_type = 'pca_components' requires at least two components in pca_dat$pca_dat.")
    }

    # Only non-sample rows get a text label; samples are drawn as bare points.
    my_labels <- rownames(comp_dat)
    my_labels[comp_dat$type %in% "sample"] <- ""

    # One panel per unordered pair of components (x index < y index).
    plot_list <- vector("list", n_pca * (n_pca - 1) / 2)
    plot_num <- 1

    for (xcol in seq_len(n_pca - 1)) {
      for (ycol in seq(xcol + 1, n_pca)) {
        # Labels travel with the data so the layer does not depend on a
        # free variable in the enclosing environment.
        temp_dat <- data.frame(x = comp_dat[[xcol]],
                               y = comp_dat[[ycol]],
                               type = comp_dat$type,
                               label = my_labels,
                               stringsAsFactors = FALSE)

        # Colours are matched to the values of `type` by position; the
        # documented result (samples black, sources red) presumably relies on
        # 'sample' sorting before the source type -- verify against pah_pca().
        plot_list[[plot_num]] <- ggplot(temp_dat, aes(x = x, y = y)) +
          geom_point(aes(color = type), alpha = 0.5, show.legend = FALSE) +
          geom_text_repel(aes(label = label), size = 2) +
          scale_color_manual(values = c("black", "red")) +
          labs(x = paste("Component", xcol), y = paste("Component", ycol)) +
          theme_classic()

        plot_num <- plot_num + 1
      }
    }

    # Grid layout: a single row for up to three components, three rows for
    # four components, and four rows beyond that.
    grid_rows <- if (n_pca < 4) {
      1
    } else if (n_pca == 4) {
      3
    } else {
      4
    }

    return(cowplot::plot_grid(plotlist = plot_list, nrow = grid_rows))
  }

  # Unknown plot type: warn rather than error, as callers may rely on this.
  warning('That plot_type does not exist.')
}
# limnoliver/pah documentation built on April 30, 2020, 2:45 p.m.