MicrobiomeR: Analyze Microbiome Data

Documented in create_taxmap output_dir transposer treatment_matrix

#' @title Get Taxmap Object
#' @description A function that handles the conversion of objects to Taxmap (\strong{taxa::taxmap}) objects.
#' @param obj An object that contains the data being analyzed.  Can be one of the following:
#' \describe{
#'   \item{Phyloseq Object}{An object generated from the phyloseq package.}
#'   \item{Taxmap Object}{An object generated from the metacoder or taxa package.}
#'   \item{RData file}{An RData file generated from the base::save function.  Can have an extension of .RData or .rda.}
#'   }
#' @return The output generated is a \strong{taxa::taxmap} object.
#' @pretty_print TRUE
#' @details This function is used to convert data to metacoder/taxmap objects for \emph{microbiome} analysis.
#' This function is used at the beginning of every other function to support multiple types of input
#' for the obj parameter in those functions.
#' @examples
#' \dontrun{
#' if(interactive()){
#'  library(MicrobiomeR)
#'  phy_obj <- MicrobiomeR::phyloseq_silva_2
#'  mc_obj <- create_taxmap(phy_obj)
#'  }
#' }
#' @export
#' @family Data Importers
#' @rdname create_taxmap
#' @seealso
#'  \code{\link[metacoder]{parse_phyloseq}}
#'
#'  \code{\link[tools]{fileutils}}
#' @importFrom metacoder parse_phyloseq
#' @importFrom tools file_ext
#' @importFrom crayon red
create_taxmap <- function(obj) {
  if (is.null(obj)) {
    stop(crayon::red("Please use a metacoder/phyloseq object or an rdata file."))
  } else {
    if (inherits(obj, "phyloseq")) {
      metacoder_object <- metacoder::parse_phyloseq(obj)
    } else if (inherits(obj, "Taxmap")) {
      metacoder_object <- obj
    } else if (file.exists(obj)) {
      if (tools::file_ext(obj) %in% c("RData", ".rda")) {
        load(file = obj)
        if (!"metacoder_object" %in% ls()) {
          stop(crayon::red("Please provide a loadable .RData/.rda file that contains an object called \"metacoder_object\"."))
        }
      }
    }
  }
  return(metacoder_object)
}

#' @title Get Treatment Matrix
#' @description A function that returns a matrix with used for comparing treatment data.
#' @param obj An object to be converted to a taxmap object with \code{\link[MicrobiomeR]{create_taxmap}}.
#' @return A matrix with each column representing a comparison to be made.
#' @pretty_print TRUE
#' @details Use this when you want to do pairwise comparisons.
#' @examples
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @export
#' @family Formatting
#' @rdname treatment_matrix
#' @importFrom utils combn
treatment_matrix <- function(obj) {
  # Get treatment data
  treat_1 <- as.character(obj$data$stats_tax_data$treatment_1)
  treat_2 <- as.character(obj$data$stats_tax_data$treatment_2)
  treatments <- unique(c(treat_1, treat_2))
  combinations <- t(utils::combn(seq_along(treatments), 2))
  combinations <- sapply(seq_len(nrow(combinations)),
                         function(index) {
                           set.seed(1)
                           treat_a <- treatments[combinations[index, 1]]
                           treat_b <- treatments[combinations[index, 2]]
                           c(treat_a, treat_b)
                         })
  return(combinations)
}

#' @title Create an Output Directory
#' @description A function for generating a consistent file system for data files and visualizations.  If none
#' of the parameters are set the full path generated by this function will be \emph{root_path}/output
#' @param start_path  With the start_path set, the full path generated by this function will be
#' \emph{root_path}/\emph{start_path}/\emph{?experiment?}/\emph{?plot_type?}/\emph{end_path}.  Default: NULL
#' @param experiment (optional) This will add a second level to the start_path file system.  Default: NULL
#' @param plot_type (optional) This will add a third level to the start_path file system.  With \strong{ONLY}
#' the plot_type set the full path generated by this function will be \emph{root_path}/\emph{plot_type}.  Default: NULL
#' @param end_path (optional) This will add the final directory to the start_path file system Default: A formatted date-time string.
#' @param root_path The root of the new file system (start_path or not).  Default: The working directory.
#' @param custom_path This is an absolute path that overrides everything else.  Default: NULL
#' @param overwrite A logical denoting that overwriting is acceptable.  Default: FALSE
#' @param mkdir A logical denoting weather or not the directory should be created or not.  Default: TRUE
#' @return Creates a directory for output and returns the path as a string.
#' @pretty_print TRUE
#' @details This function is incredibly useful on it's own but also for various other plotting/saving functions within the package.
#'  It helps keep data organized using a standard workflow.
#' @examples
#' \dontrun{
#' if(interactive()){
#'  # Get the path to an output directory without creating
#'  library(MicrobiomeR)
#'  output_dir <- output_dir(start_path="output", experiment="microbiome-proj", mkdir=FALSE)
#'  print(output_dir)
#'
#'  # Create a folder for your plot types
#'  output_dir <- output_dir(plot_type="scatter-plots")
#'  print(output_dir)
#'
#'  }
#' }
#' @export
#' @family Project Management
#' @rdname output_dir
#' @seealso
#'  \code{\link[glue]{glue}}
#'
#'  \code{\link[stringr]{case}}
#' @importFrom glue glue
#' @importFrom stringr str_to_lower
#' @importFrom crayon yellow blue red green
output_dir <- function(start_path=NULL, experiment=NULL, plot_type=NULL, end_path=NULL, root_path=NULL,
                           custom_path = NULL, overwrite=FALSE, mkdir=TRUE) {
  # Create the relative path to the plots.  By default the full_path will be <root_path>/output
  # With ONLY the plot_type set the full_path will be <root_path>/<plot_type>
  # With the start_path set the full path will be <root_path>/<start_path>/?experiment?/?plot_type?/<end_path>.
  # Experiment and plot_type are optional.  End_path is optional, but defaults to formatted data-time string.
  if (is.null(custom_path)) {
    # Get the root path (generally this already exists; like the working directory)
    if (is.null(root_path)) {
      full_path <- file.path(getwd())
    } else {
      full_path <- file.path(root_path)
    }
    # Add the output folder to use (generally a new folder, or pre-existing output folder)
    if (is.character(start_path)) {
      full_path <- file.path(full_path, start_path)
      # Add a folder for the specific experiment you are conducting
      if (is.character(experiment)) {
        full_path <- file.path(full_path, experiment)
      }
      # Add a folder for the plot_type that's being generated
      if (is.character(plot_type)) {
        full_path <- file.path(full_path, plot_type)
      }
      # Add a folder for any extra
      if (is.null(end_path)) {
        full_path <- file.path(full_path, format(Sys.time(), "%Y-%m-%d_%s"))
      } else if (is.character(end_path)) {
        full_path <- file.path(full_path, end_path)
      }
    } else if (is.null(end_path)) {
      # Add a folder for the plot_type that's being generated
      if (is.character(plot_type)) {
        full_path <- file.path(full_path, plot_type)
      } else {
        full_path <- file.path(full_path, "output")
      }
    }
  } else {
    full_path <- custom_path
  }
  # Directory creation.
  if (mkdir == TRUE) {
    answer_flag <- FALSE
    if (dir.exists(full_path) && overwrite == FALSE) {
      warning(glue::glue("The directory {full_path} already exists. And you don't want to overwrite the directory."))
    } else if (dir.exists(full_path) && overwrite == TRUE) {
      warning(glue::glue(crayon::yellow("You have chosen to overwrite the directory: {full_path}.")))
      while (answer_flag == FALSE) {
        answer <- readline(prompt = crayon::blue("Are you sure? (Y/N)"))
        if (stringr::str_to_lower(answer) == "y") {
          dir.create(full_path, recursive = TRUE)
          answer_flag <- TRUE
        } else if (stringr::str_to_lower(answer) == "n") {
          warning(glue::glue(crayon::red("Please set overwrite to FALSE and change the path ({full_path}) with experiment and/or other_path.")))
          stop(crayon::red("The files haven't been saved.  You have chosen not to overwrite you files."))
        } else {
          message(crayon::yellow("Please enter Y for YES or N for NO.  This is not case sensitive."))
          answer_flag <- FALSE
        }
      }
    } else if (!dir.exists(full_path)) {
      message(glue::glue(crayon::green("Creating a new directory: {full_path}")))
      dir.create(full_path, recursive = TRUE)
    }
  }
  return(full_path)
}

#' @title Transposing Tidy Data
#' @description This function transposes tables containing numeric and categorical data using the
#' tidyr package.
#' @param .data A matrix/data_frame/tibble for transposing.
#' @param ids The column to transpose by.  Default: The first column.
#' @param header_name A name for the numeric data that will be transposed.
#' @param preserved_categories A logical denoting weather categorical data should be conserved.  A
#' value of FALSE will cause all categorical data except the \emph{ids} to be dropped.  A value of
#' TRUE will cause the categorical data to preserved by \emph{tidyr::unite}ing these columns.  Default: TRUE
#' @param separated_categories A vector containing ordered column names to use in a previously transposed
#' and categorically preserved table.  Retransposing with this set should yield an exact replicate of
#' the original data.  Default: NULL
#' @return A transposed data table as a tibble.
#' @pretty_print TRUE
#' @details Transposing can help with preforming operations on the rows of your tibbles.
#' @examples
#' \dontrun{
#' if(interactive()){
#' # This example uses data that are no longer available in the MicrobiomeR package,
#' # however, they can be easily generated with \code{\link{MicrobiomeR}{as_basic_format}}.
#'  library(MicrobiomeR)
#'  basic_silva <- as_MicrobiomeR_format(MicrobiomeR::raw_silva_2, "basic_format")
#'  data <- basic_silva$data$taxa_abundance
#'  trans_data <- data %>%
#'    transposer(ids = "taxon_id", header_name = "samples")
#'  retrans_data <- trans_data %>%
#'    transposer(ids="samples", header_name="taxon_id")
#'  }
#' }
#' @export
#' @family Data Manipulators
#' @rdname transposer
#' @seealso
#'  \code{\link[tibble]{is_tibble}}
#'
#'  \code{\link[dplyr]{select_all}},  \code{\link[dplyr]{select}},  \code{\link[dplyr]{reexports}},  \code{\link[dplyr:tidyeval]{sym}},  \code{\link[dplyr:reexports]{one_of}}
#'
#'  \code{\link[tidyr]{gather}},  \code{\link[tidyr]{unite}},  \code{\link[tidyr]{spread}},  \code{\link[tidyr]{separate}}
#'
#'  \code{\link[stringr]{str_detect}},  \code{\link[stringr]{str_count}}
#' @importFrom tibble is.tibble
#' @importFrom dplyr select_if select as_tibble sym one_of
#' @importFrom tidyr gather unite spread separate
#' @importFrom stringr str_detect str_count
#' @importFrom crayon red yellow silver
transposer <- function(.data, ids = NULL, header_name, preserved_categories = TRUE, separated_categories = NULL) {
  # Verify format
  if (!(is.matrix(.data) | is.data.frame(.data) | tibble::is.tibble(.data))) {
    stop(crayon::red("Data not transposable."))
  }
  input <- .data
  # Get ids if none are given, defaults to the first column
  if (is.null(ids)) {
    ids <- input[1] %>% colnames()
    message(crayon::yellow(sprintf("There were no ids given.  Defaulting to the first column: %s", ids)))
  }
  # Get numeric data (columns)
  num_cols <- input %>% dplyr::select_if(is.numeric) %>% purrr::keep(!names(.) %in% ids) %>% colnames()

  # Transform
  if (preserved_categories == TRUE) { # All categorical data is preserved
    message(crayon::yellow("Categorical data will be united as a string, which can be tidyr::separated after re-transposing."))
    preserved_categories <- input %>% dplyr::select(-dplyr::one_of(c(num_cols))) %>% colnames()
    trans_data <- input %>%
      dplyr::as_tibble() %>%
      tidyr::gather(key = !!dplyr::sym(header_name), "_data", -c(preserved_categories)) %>% # Gather columns other that aren't preserved
      tidyr::unite("_categories", c(preserved_categories), sep = "<_>") %>% # Preserve the categorical data in 1 column
      tidyr::spread("_categories", "_data") # Spread categorical data over the numerical data
  } else if (preserved_categories == FALSE) { # Only the ids are preserved
    trans_data <- input %>% dplyr::select(c(ids, num_cols)) %>%
      dplyr::as_tibble() %>%
      tidyr::gather(key = !!dplyr::sym(header_name), "_data", -c(ids)) %>%
      tidyr::spread(!!dplyr::sym(ids), "_data")
  }
  # Look for previously transformed tibbles and separate any united colums
  if (all(suppressWarnings({stringr::str_detect(trans_data[header_name], "\\<\\_\\>")}))) {
    if (!is.null(separated_categories)) {
      trans_data <- trans_data %>% tidyr::separate(col = header_name, into = separated_categories, sep = "<_>")
    } else {
      message(crayon::yellow("Separated categories have not been supplied.  Columns will be named as \"category_#\"."))
      n_cats <- stringr::str_count(trans_data[[header_name]][1], pattern = "<_>") + 1
      trans_data <- trans_data %>% tidyr::separate(col = header_name, into = paste("category", seq(1:n_cats), sep = "_"), sep = "<_>")
    }
    message(crayon::silver("Re-Transposed Data."))
    return(trans_data)
  } else {
    message(crayon::silver("Transposed Data."))
    return(trans_data)
  }
}

#' @title Transforming Tidy Data
#' @description This function transforms a table by rows or by columns.
#' @param .data A matrix/data_frame/tibble for transforming.
#' @param func A function, which can be anonymous, that will be used to transform the data.  (e.g. proportions:
#' \emph{x/sum(x)}).
#' @param by This denotes how the data should be transformed (\strong{column/row}). Default: 'column'
#' @param ids The column to transpose by if \emph{by = 'row'}. Default: The first column.
#' @param header_name A name for the numeric data that will be transposed if \emph{by = 'row'}.
#' @param preserved_categories A logical denoting weather categorical data should be conserved if \emph{by = 'row'}.  A
#' value of FALSE will cause all categorical data except the \emph{ids} to be dropped.  A value of
#' TRUE will cause the categorical data to preserved by \emph{tidyr::unite}ing these columns.  Default: TRUE
#' @param separated_categories A vector containing ordered column names to use in a previously transposed
#' and categorically preserved table if \emph{by = 'row'}.  Retransposing with this set should yield an exact replicate of
#' the original data.  Default: NULL
#' @param ... Additional arguments passed on to \emph{func}.
#' @return A tibble that has been transformed.
#' @details This function transforms the supplied data using the \emph{func} parameter, which is used
#' in the purrr package.  The purr package allows the use of anonymous functions as described in the link below:
#'
#' \url{https://jennybc.github.io/purrr-tutorial/ls03_map-function-syntax.html#anonymous_function,_formula}
#' @pretty_print TRUE
#' @examples
#' \dontrun{
#' if(interactive()){
#' # This example uses data that are no longer available in the MicrobiomeR package,
#' # however, they can be easily generated with \code{\link{MicrobiomeR}{as_basic_format}}.
#'  library(MicrobiomeR)
#'  basic_silva <- as_MicrobiomeR_format(MicrobiomeR::raw_silva_2, "basic_format")
#'  data <- basic_silva$data$taxa_abundance
#'  # Get proportions using the anonymous functions
#'  tax_props <- data %>% transformer(~./sum(.))
#'  # Get proportions using explicit functions
#'  alt_tax_props <- data %>% transformer(function(x)x/sum(x))
#'  }
#' }
#' @export
#' @family Data Manipulators
#' @rdname transformer
#' @seealso
#'  \code{\link[MicrobiomeR]{transposer}}
#'  \code{\link[dplyr]{select_all}},  \code{\link[dplyr]{select}}
#'  \code{\link[purrr]{modify}}
#' @importFrom dplyr select_if select
#' @importFrom purrr modify_at
#' @importFrom crayon red silver
transformer <- function(.data, func, by = "column", ids = NULL, header_name = NULL, preserved_categories = TRUE, separated_categories = NULL, ...) {
  if (!(is.matrix(.data) | is.data.frame(.data) | tibble::is.tibble(.data))) {
    stop(crayon::red("Data not transformable."))
  }
  input <- .data
  if (by == "row") {
    # Transpose table and unite all categorical data into one column for row based transformations
    input <- input %>% transposer(ids = ids, header_name = header_name, preserved_categories = preserved_categories)
  }
  # Get numeric column names
  num_cols <- input %>% dplyr::select_if(is.numeric) %>% purrr::keep(!names(.) %in% ids) %>% colnames()
  # Get all other columsn as preserved columns
  preserved_categories <- input %>% dplyr::select(-dplyr::one_of(num_cols)) %>% colnames()
  # Transform data
  trans_data <- purrr::modify_at(input, num_cols, func, ...)
  if (by == "row") {
    # Retranspose the table and separate the categorical data for row based transformations
    trans_data <- trans_data %>% transposer(ids = header_name, header_name = ids, separated_categories = separated_categories, preserved_categories = FALSE)
  }
  message(crayon::silver("Transformed Data."))
  return(trans_data)
}

#' @title Mock Excel "VlookUp" Function
#' @description A function that mimics excels vlookup, but for R's dataframe.
#' @param lookup_vector A vector of items to look up.
#' @param df The dataframe to search.
#' @param match_var The column name to search in the dataframe.
#' @param return_var The column of data to return when matched.
#' @return A vector that contains the items of interest.
#' @pretty_print TRUE
#' @details A function that works like the VLOOKUP function in excel.  This function was
#' borrowed from \url{https://www.r-bloggers.com/an-r-vlookup-not-so-silly-idea/}.
#' @examples
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @export
#' @family Data Manipulators
#' @rdname vlookup
vlookup <- function(lookup_vector, df, match_var, return_var) {
  # TODO: Update the way this returns data.  Allow it to add data to a new column.
  m <- match(lookup_vector, df[[match_var]])
  df[[return_var]][m]
}

#' @title Create Publication Table
#'
#' @description Creates an image of a table from a dataframe.
#'
#' @param dataframe A dataframe object.
#' @return The text table as an object that can be saved to png/tiff/jpg/pdf.
#' @export
#' @examples
#' \dontrun{
#' if(interactive()){
#' # This example uses data that are no longer available in the MicrobiomeR package,
#' # however, they can be easily generated with \code{\link{MicrobiomeR}{as_basic_format}}.
#'  library(MicrobiomeR)
#'  data <- MicrobiomeR::basic_silva$data$taxa_abundance
#'  create_pub_table(data)
#'  }
#' }
#' @rdname create_pub_table
#'
#' @importFrom ggpubr ggtexttable ttheme colnames_style tbody_style
create_pub_table <- function(dataframe) {
  ggpubr::ggtexttable(dataframe,
                      theme = ggpubr::ttheme(
                        colnames.style = ggpubr::colnames_style(color = "black", fill = "white", linewidth = 0, linecolor = "white", face = "bold"),
                        tbody.style = ggpubr::tbody_style(
                          color = "black", face = "plain", size = 12,
                          fill = "white", linewidth = 0, linecolor = "white"
                        )
                      )
  )
}


####################### Private Package Variables #######################
pkg.private <- new.env(parent = emptyenv())

pkg.private$format_level_list <- list(unknown_format = -1, mixed_format = -1,phyloseq_format = 0,
                                      raw_format = 1, basic_format = 2, analyzed_format = 3)

pkg.private$format_table_list <- list(
  raw_format =  c("otu_abundance", "otu_annotations"),
  basic_format = c("otu_abundance", "otu_annotations",
                   "taxa_abundance", "otu_proportions", "taxa_proportions"),
  analyzed_format = c("otu_abundance", "otu_annotations",
                      "taxa_abundance", "otu_proportions", "taxa_proportions",
                      "statistical_data", "stats_tax_data"),
  phyloseq_format = c("otu_table", "tax_data", "sample_data", "phy_tree"),
  all_formats = c("otu_abundance", "otu_annotations",
                  "taxa_abundance", "otu_proportions", "taxa_proportions",
                  "statistical_data", "stats_tax_data",
                  "otu_table", "tax_data", "sample_data", "phy_tree"),
  expected_table_order = list(otu_abundance   = "otu_abundance",
                             otu_annotations  = "otu_annotations",
                             otu_proportions  = "otu_proportions",
                             sample_data      = "sample_data",
                             phy_tree         = "phy_tree",
                             taxa_abundance   = "taxa_abundance",
                             taxa_proportions = "taxa_proportions",
                             statistical_data = "statistical_data",
                             stats_tax_data   = "stats_tax_data"),
  otu_tables = c("otu_abundance", "otu_annotations", "otu_proportions"),
  taxa_tables = c("taxa_abundance", "taxa_proportions",
                  "statistical_data", "stats_tax_data")
)

pkg.private$ranks <- list(c("Kingdom"),
                          c("Phylum"),
                          c("Class"),
                          c("Order"),
                          c("Family"),
                          c("Genus"),
                          c("Species"))
pkg.private$rank_index <- list(Kingdom = 1,
                               Phylum = 2,
                               Class = 3,
                               Order = 4,
                               Family = 5,
                               Genus = 6,
                               Species = 7)
pkg.private$mc_df_rank_list <- list(Kingdom = 4,
                                    Phylum = 5,
                                    Class = 6,
                                    Order = 7,
                                    Family = 8,
                                    Genus = 9,
                                    Species = 10)
pkg.private$input_files = list(
  biom_files = list(
    silva = system.file("extdata", "silva_OTU.biom", package = "MicrobiomeR"),
    greengenes = system.file("extdata", "greengenes_OTU.biom", package = "MicrobiomeR")),
  tree_files = list(
    silva = system.file("extdata", "silva.tre", package = "MicrobiomeR"),
    greengenes = system.file("extdata", "greengenes.tre", package = "MicrobiomeR")),
  metadata = list(
    two_groups = system.file("extdata", "nephele_metadata2.txt", package = "MicrobiomeR"),
    three_groups = system.file("extdata", "nephele_metadata3.txt", package = "MicrobiomeR"))
)