R/pca_3d.R

Defines functions pca_3d Longit_spread

Documented in pca_3d

#' @title Create 3d PCA plots
#' @name pca_3d
#' @description Create three dimensional PCA plots from longitudinal microbiome data or multiple omics data sets.
#' @param micro_set A tidy_micro data set
#' @param table OTU table of interest
#' @param time_var The time point variable column name in your tidi_MIBI set
#' @param subject The subject variable column name in your tidi_MIBI set
#' @param modes Components of the data to focus on: time, subjects, bacteria, etc.
#' @param plot_scores Plot the scores instead of the principle components
#' @param dist_method Dissimilartiy method to be calculated by vegdist. Euclidean by default
#' @param type Either "PCA" or "PCoA"
#' @param n_compA The number of components along first axis. See details
#' @param n_compB The number of components along second axis. See details
#' @param n_compC The number of components along third axis. See details
#' @param cex.axis Options for \code{\link[scatterplot3d]{scatterplot3d}}
#' @param cex.lab Options for \code{\link[scatterplot3d]{scatterplot3d}}
#' @param main Plot title
#' @param subtitle Plot subtitle
#' @param scalewt Logical; center and scale OTU table, recommended
#' @details Requires that you have columns for subject name and time point. Data must be complete across time points. The function will filter out inconsistent subjects
#'
#' When type = "PCoA" the component matrices must be specified prior to the optimization. This is handled automatically.
#'
#' If n_compA, n_compB, and n_compC aren't specified they will default to the number of complete subjects, the number of taxa, and the number of time points, respectively. This slows down performance slightly, but will not change the results.
#' @references \code{\link{help(vegdist)}}
#' @author Charlie Carpenter, Kayla Williamson
#' @examples
#' data(phy); data(cla); data(ord); data(fam); data(met)
#'
#' otu_tabs = list(Phylum = phy, Class = cla, Order = ord, Family = fam)
#' set <- tidy_micro(otu_tabs = otu_tabs, meta = met)
#'
#' set %>% pca_3d(table = "Family", time_var = day, subject = study_id)
#' @export
pca_3d <- function(micro_set, table, time_var, subject,
                   modes = "AC", dist_method = "euclidean", type = "PCoA", plot_scores = F,
                   n_compA, n_compB, n_compC, cex.axis = 1, cex.lab = 1,
                   main = NULL, subtitle = NULL, scalewt = TRUE){

  if(table %nin% unique(micro_set$Table)) stop("Specified table is not in supplied micro_set")

  ## Making the wide otu format that is needed for U_matrices
  wide_otu <- micro_set %>%
    Longit_spread(table = table, subject = !!rlang::enquo(subject),
                  time_var = !!rlang::enquo(time_var))

  ## Number of timepoints, subjects, and taxa
  n_time <- micro_set %>% dplyr::pull(!!rlang::enquo(time_var)) %>% unique %>% length
  n_sub <- nrow(wide_otu); n_taxa <- ncol(wide_otu)/n_time

  cat("Found", n_time, "time points and", n_sub, "subjects with complete cases.\n\n")
  ## Pulls the maximum number of components if not specified
  ## Get better plots when you don't do this
  if(missing(n_compA)) n_compA <- n_sub
  if(missing(n_compB)) n_compB <- n_taxa
  if(missing(n_compC)) n_compC <- n_time
  ## Stops for incorrect dimensions
  if(n_compA > n_sub){
    stop("n_compA can't be greater than the number of subjects in every time point.")
  }
  if(n_compB > n_taxa){
    stop("n_compB can't be greater than the number of unique taxa in the specified table.")
  }
  if(n_compC > n_time){
    stop("n_compC can't be greater than the number of time points in your data set.")
  }
  if(modes == "CB" & n_time < 3){
    stop("Need at least 3 time points for CB modes.")
  }

  if(type %nin% c("PCA", "PCoA")) stop("'type' must be either 'PCA' or 'PCoA'")

  ## Scales the OTU counts which is recommended by KW Thesis
  if(scalewt) wide_otu %<>% ade4::scalewt()

  if(modes %nin% c("BA", "AC", "CB")) {
    stop("'modes' must be one of 'AB', 'AC', or 'BC'. The function will plot based on the non-specified components")
  }

  modes = paste(modes, "plot", sep = "")

  if(type == "PCA"){
    ## Principle Components (start = 0)
    T3P <- ThreeWay::T3func(wide_otu, n = n_sub, m = n_taxa, p = n_time,
                            r1 = n_compA, r2 = n_compB, r3 = n_compC,
                            start = 0, conv = 0.0000000000000002) %>%
      T3_plots(r1 = n_compA, r2 = n_compB, r3 = n_compC, ., modes, plot_scores) %>%
      .[modes] %>% as.data.frame
  } else if(type == "PCoA"){

    T3 <- U_matrices(wide_otu, method = dist_method, n_compA, n_compB, n_compC,
                     n_sub, n_taxa, n_time)

    ## Principle Coordinant (we define the component matrices, start = 2)
    T3P <- ThreeWay::T3func(wide_otu, n = n_sub, m = n_taxa, p = n_time,
                            r1 = n_compA, r2 = n_compB, r3 = n_compC,
                            start = 2, conv = 0.0000000000000002,
                            T3$A,T3$B,T3$C,T3$G) %>%
      T3_plots(r1 = n_compA, r2 = n_compB, r3 = n_compC, ., modes, plot_scores) %>%
      .[modes] %>% as.data.frame
  }

  if(modes == "ACplot"){
    scatterplot3d::scatterplot3d(T3P[,1], T3P[,2], T3P[,3],
                                 color = factor(rep(1:n_time, each = n_sub)), pch = 16,
                                 xlab = "B1", ylab = "B2", zlab = "B3",
                                 cex.axis = cex.axis, cex.lab = cex.lab,
                                 main = main, sub = subtitle)
  }

  if(modes == "BAplot"){
    scatterplot3d::scatterplot3d(T3P[,1], T3P[,2], T3P[,3],
                                 color = factor(rep(1:n_sub, each = n_taxa)), pch = 16,
                                 xlab = "C1", ylab = "C2", zlab = "C3",
                                 cex.axis = cex.axis, cex.lab = cex.lab,
                                 main = main, sub = subtitle)
  }

  if(modes == "CBplot"){
    scatterplot3d::scatterplot3d(T3P[,1], T3P[,2], T3P[,3],
                                 color = factor(rep(1:n_time, each = n_taxa)), pch = 16,
                                 xlab = "A1", ylab = "A2",zlab = "A3",
                                 cex.axis = cex.axis, cex.lab = cex.lab,
                                 main = main, sub = subtitle)
  }
}

## Helper function to get data in the correct format
Longit_spread <- function(micro_set, table, subject, time_var){

  l_otu <- micro_set %>%
    dplyr::filter(Table == table) %>%
    dplyr::select(Taxa, sub = !!rlang::enquo(subject), time = !!rlang::enquo(time_var), cts) %>%
    # Just in case there are extra levels of the time variable
    # that aren't in the data set
    dplyr::mutate(time = factor(time, levels = unique(time)))

  # 'empty' data frame to fill
  wide_otu <- data.frame(sub = unique(l_otu$sub))
  # for loop to 'spread' taxa out by timepoint
  for(i in 1:length(levels(l_otu$time))){

    ## Filter out by time point, and uniting time_taxa vars into one unique column
    ss <- l_otu %>%
      dplyr::filter(time == levels(time)[i]) %>%
      tidyr::unite(Time_Taxa, time, Taxa)

    ## Filtering out subjects that aren't consistent b/t time points
    wide_otu %<>% dplyr::filter(sub %in% ss$sub)
    ss %<>% dplyr::filter(sub %in% wide_otu$sub)

    ## Spreading counts into wide format
    ss %<>% tidyr::spread(Time_Taxa, cts)

    ## Tacking on time point counts to join by subject
    wide_otu %<>% dplyr::full_join(ss, by = "sub")

  }

  ## Checking that subjects are consistent across all time points
  if(!all(unique(l_otu$sub) %in% unique(wide_otu$sub))){
    warning("Subjects are not consistent across time points.\nOnly complete cases will be used.\n",
            call. = FALSE)
  }

  ## Removing Subjects so we only have our counts
  rownames(wide_otu) <- wide_otu$sub
  wide_otu %<>% dplyr::select(-matches("sub"))

  return(wide_otu)
}
CharlieCarpenter/tidy.micro documentation built on Jan. 19, 2020, 6:28 p.m.