R/helper_functions.R

Defines functions arrange_categories relab_sample_weighter time_weights S_checker relab_checker relab_phyloseq

Documented in relab_phyloseq time_weights

# relab_phyloseq ---------------------------------------------------------------
#' Generate a relative abundance matrix with sample metadata and OTU abundances from a  phyloseq object.
#'
#' The R package phyloseq streamlines the storage and analysis of microbiome sequence data. This function takes a phyloseq object and extracts the OTU table and the sample metadata and combines them into one relative abundance matrix with rows corresponding to samples, metadata on the left-hand side, and OTU relative abundances on the right-hand side.
#'
#' @param phyloseq_object A phyloseq object containing both an OTU table (`otu_table`) and sample metadata (`sample_data`).
#' @returns A data frame with rows representing samples and columns representing sample data categories or OTU relative abundances.
#' OTU abundances are automatically normalized so that they sum to 1 for each sample, though a warning will be provided if a
#' renormalization was necessary.
#' @examples
#' if (requireNamespace("phyloseq", quietly = TRUE)) {
#'   data(GlobalPatterns, package = "phyloseq")
#'
#' # Make a small phyloseq object for demonstration
#' phyloseq_subset = phyloseq::subset_taxa(phyloseq::subset_samples(GlobalPatterns,
#'                                                                  X.SampleID %in%
#'                                                                  c("CL3", "CC1")),
#'                                         Order == "Cenarchaeales")
#'   otu_table = relab_phyloseq(phyloseq_subset)
#'   otu_table[, 1:10]
#' }
#' @export
relab_phyloseq <- function(phyloseq_object){
  if(is.null(phyloseq_object@sam_data)){
    warning("phyloseq_object does not have sample_data")
    if(phyloseq::taxa_are_rows(phyloseq_object)){
      return(t(phyloseq::otu_table(phyloseq_object)))
    }else{
      return(phyloseq::otu_table(phyloseq_object))
    }
  }
  if(is.null(phyloseq_object@otu_table)){
    stop("phyloseq_object does not include an otu_table")
  }

  if(phyloseq::taxa_are_rows(phyloseq_object)){
    output = cbind(phyloseq::sample_data(phyloseq_object),
          t(phyloseq::otu_table(phyloseq_object)))
  }else{
    output = cbind(phyloseq::sample_data(phyloseq_object),
          (phyloseq::otu_table(phyloseq_object)))
  }

  meta_cols = ncol(phyloseq::sample_data(phyloseq_object))
  data_cols = (meta_cols+1):ncol(output)

  relab_sums = rowSums(output[,data_cols])

  if(any(relab_sums == 0)){
    warning(paste0("The following samples summed to 0 and were excluded: ", paste0(names(which(relab_sums==0)), collapse = ", ")))
    output = output[-which(relab_sums==0),]
    relab_sums = rowSums(output[,data_cols])
  }

  if(!all(round(relab_sums, 7)==1)){
    output[,data_cols] = output[,data_cols]/relab_sums
    warning("Some of the sample abundances do not sum to exactly 1. Rounding the sum of each sample to 1 by dividing all entries by the sum of the sample.")
  }

  return(output)

}



# relab_checker ------------------------------------------------------------------
# An internal function to check if relab matrices are up to spec and fix any issues automatically.
# relab - a relative abundance matrix provided as input to another function
# K - Optional; the number of ancestral clusters
# rep - an optional parameter used if relab matrix is one of a list, in order to provide more useful warning messages
# group - Optional; the group that each row of the relative matrix abundance corresponds to
# time - Optional; the timepoint that each row of the relative abundance matrix corresponds to
relab_checker <- function(relab, K = NULL, rep = NULL, group = NULL, time = NULL) {
  original_relab = relab

  if(is.null(K)){
      K = ncol(relab) - (!is.null(group)) - (!is.null(time))
  }

  # Check if relab matrix is within a list, and extract if needed
  if (is.list(relab) && !is.data.frame(relab) && !is.array(relab)) {
    relab <- relab[[1]]
  }
  # If the matrix contains meta information, extract the last K columns.
  if (ncol(relab) > K) {
    relab <- relab[, (ncol(relab) - K + 1):ncol(relab)]
  }

  # Give warning if any of the grouping variables are numeric
  if(any(group%in% as.character(1:1000))){
    warning("At least one of your group names is a number. Avoid numeric group names in order to avoid errors.")
  }

  # convert relab matrix entries to numbers
  if(any(!apply(relab, MARGIN = c(1,2), is.numeric))){
    relab <- apply(relab, MARGIN = c(1,2), as.numeric)
    warning("Some of your relative abundances are not numeric. I am converting all abundances to numbers with `as.numeric()`. Please double check your entries for `K` and `group`, and review the structure of your `relab_matrix` with `str()`.")
  }

  # Name relab matrix columns q1, q2, ..., qK
  # colnames(relab) <- paste0("q",1:K)

  # Check if relab matrix has any missing values, and give warning if necessary
  if(any(is.na(relab))){
    # Identify location of missing entries
    na.pos <- sapply(which(is.na(relab)),
                     function(index) c(index %% nrow(relab), ceiling(index/nrow(relab)))) %>%
      t()
    # Format missing entries as a string
    na.pos.format <- list()
    for(row in 1:nrow(na.pos)){
      na.pos.format[row] <- paste0("(", na.pos[row,1], ", ", na.pos[row,2], ")")
    }
    na.pos.format.string <- as.character(na.pos.format) %>% paste(collapse = ", ")

    stop(paste0("There is at least one NA value in your relab matrix. The missing entries are found in the following positions: ",
                na.pos.format.string))
  }

  # check if matrix rows sum to 1, and give useful warnings if rounding is necessary
  sums <- rowSums(relab) %>% round(5)
  if (any(sums != 1)) {
    if (is.null(rep)) {

      if(any(sums==0)){
        stop(paste0("The following rows of your relative abundance matrix sum to 0: ",
                    paste(as.character(which(sums==0)), collapse = ", "),
                    " Every row must have at least one non-zero abundance."))
      }

      warning("Your relative abundance matrix has rows which do not sum to exactly 1. Rounding the sum of each row to 1 by dividing all entries by the sum of the row.")
    } else {
      if(any(sums==0)){
        stop(paste0("At least one of the rows of your relative abundance matrix sums to 0 Every row must have at least one non-zero abundance."))
      }

      warning(paste0(
        "At least one of the rows of relab matrix number ", rep,
        " (restricted to the last K columns) does not sum to 1. Rounding the sum of each row to 1 by dividing all entries by the sum of the row."
      ))
    }
    # Normalize each row of the matrix by dividing by the rowsums
    relab <- relab / sums
  }
    if(is.null(group)){
      group_return = NULL
    } else{
      group_return = unlist(original_relab[,group], use.names = FALSE)
    }

    if(is.null(time)){
      time_return = NULL
    } else{
      time_return = unlist(original_relab[,time])
    }


    return(list("relab_matrix" = relab,
                "group" = group_return,
                "time" = time_return))
}

# relab_checker(relab = xue_microbiome_sample, group = "subject", time = "timepoint")






# FUNCTIONS FOR WEIGHTINGS

# S_checker ------------------------------------------------------------------
# An internal function to check if similarity matrices are up to spec and fix any issues automatically.
# S - a similarity matrix
# K - the number of taxa
S_checker <- function(S, K, relab_matrix = NULL) {

  if(!is.null(relab_matrix) & !is.null(colnames(relab_matrix)) & !is.null(colnames(S))){

    taxa_names = colnames(relab_matrix)[(ncol(relab_matrix)-K + 1):ncol(relab_matrix)]

    if(length(taxa_names) != ncol(S) & !all(taxa_names %in% colnames(S))){
      stop("Not all of the taxa in relab_matrix are included in S. Your similarity matrix must describe every taxon included in relab_matrix.")
    }

    if(length(taxa_names) < ncol(S)){
      S = S[taxa_names, taxa_names]
      warning("S describes more taxa than are included in relab_matrix.\nI am subsetting the rows and columns of S so that they match the names of the K categories in relab_matrix.")
    }

    else if(any(taxa_names!=colnames(S)) || any(taxa_names!=rownames(S))){
      S = S[taxa_names, taxa_names]
      warning("The row and/or column names of the similarity matrix S do not match the names of the K categories in relab_matrix.\nI am re-ordering the rows and columns of S so that they match the names of the K categories in relab_matrix.")
    }

  }


  if(!isSymmetric(S)){
    warning("S is not symmetric.")
  }
  if(any(round(diag(S),8)!=1)){
    stop("All diagonal elements of S must be equal to 1.")
  }
  if(nrow(S) != K | ncol(S) != K){
    stop("S must have K rows and K columns.")
  }
  if(any(S>1)){
    stop("All elements of S must be less than or equal to 1.")
  }
  if(any(S<0)){
    stop("All elements of S must be positive.")
  }

  return(S)
}


# time_weights -----------------------------------------------------------------
#' Compute a normalized weighting vector based on a vector of sampling times.
#'
#' This function takes a vector of sampling times, \eqn{t = (t_1, t_2, \ldots, t_I)}{latex}
#' and computes a normalized vector which can be used to weight each sample based on
#' the time between the subsequent and the preceding samples. The weighting vector \eqn{w}
#' is defined such that each entry, \eqn{w_i = d_i / 2T}, where \eqn{T=t_I - t_1} and
#' \eqn{d_i = t_{i+1} - t_{i-1}} for \eqn{i} not equal to 1 or I. \eqn{d_1 = t_2-t_1} and \eqn{d_I = t_I-t_{I-1}}.
#'
#' @param times A numeric vector of sampling times. Each entry must be
#' greater than the previous entry.
#' @param group Optional; a character vector specifying the group identity of each
#' sampling time. Use if there are samples from multiple replicates or subjects
#' in one dataset.
#' @returns A numeric vector. Each entry provides a weight for each entry in the
#' provided `times` vector. If `group` is not specified, the vector sums to 1. If
#' `group` is specified, the vector sums to the number of distinct groups.
#' @examples

#' time_vector = c(1, 8, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
#'                 32, 33, 34, 35, 36, 37, 38, 39, 44, 50, 57, 64)
#'
#' time_weights(times = time_vector)
#' @export
time_weights <- function(times, group = NULL){

  if(!is.null(group)){
    if(length(group) != length(times)){
      stop(paste0("The times and group vectors must have the same length. times has length ",
           length(times), ". group has length ", length(group), "."))
    }
  }

  # Create an ID for each sample to ensure the returned sample weights match the
  # order that the samples were given in.
  ID = paste0(group, times)


  if(is.null(group)){
    I = length(times)
    T = times[I] - times[1]

    if(I<2){
      stop("times must have length greater than 1.")
    }
    if(any(sapply(2:I, function(i) times[i] - times[i-1]) <= 0)){
      stop("times must be increasing. Each entry must be greater than the previous entry.")
    }

    di <- c(times[2] - times[1])
    if(I>2){
      for(i in 2:(I-1)){
        di[i] = times[i+1] - times[i-1]
      }
    }
    di[I] = times[I] - times[I-1]
    return(di/(2*T))
  }else{


    wi = c()
    for(name in unique(group)){
      time_name = times[which(group == name)]

      I = length(time_name)
      T = time_name[I] - time_name[1]

      if(I<2){
        stop("Within each group, times must have length greater than 1.")
      }
      if(any(sapply(2:I, function(i) time_name[i] - time_name[i-1]) <= 0)){
        stop("Within each group, times must be increasing. Each entry must be greater than the previous entry.")
      }

      wi_addition <- c(name1 = (time_name[2] - time_name[1])/(2*T))
      if(I>2){
        for(i in 2:(I-1)){
          wi_addition = c(wi_addition, (time_name[i+1] - time_name[i-1])/(2*T))
        }
      }
      wi_addition = c(wi_addition, (time_name[I] - time_name[I-1])/(2*T))

      names(wi_addition) = paste0(name, time_name)

      wi = c(wi, wi_addition)
    }
    return(wi[ID])
  }

}








# Repeat rows based on time series data, to be used
# when generating boostrap replicates of matrices

# lcm_pair = function(x, y){
#   if(x>y)
#   {
#     greater=x
#   }
#   else
#   {
#     greater=y
#   }
#   while(TRUE)
#   {
#     if((greater%%x==0)&&(greater%%y==0))
#     {
#       lcm=greater
#       break
#     }
#     greater=greater+1
#   }
#   return(lcm)
# }
#
# lcm = function(integers){Reduce(lcm_pair, integers)}

relab_sample_weighter = function(relab, K = NULL, time = NULL, w = NULL, group = NULL){

  relab_matrix_clean = relab_checker(relab = relab, K = K, time = time, group = group)

  if(is.null(group)){

    if(is.null(w) & (!is.null(time))){
      t = relab_matrix_clean$time
      w = time_weights(t)
      T = max(t) - min(t)

      return(relab[rep(1:length(w), round(w*T*2)),])
    }else  if(is.null(time) & !is.null(w)){
      return(relab[rep(1:length(w), round(w*800)),])
    } else{
      # warning("Please provide either time or w to relab_sample_weighter function.")
      return(relab)
    }

  }else{
    groups = unique(relab_matrix_clean$group)
    df_list = vector(mode='list', length = length(groups))
    i = 1
    for(g in groups){
      group_sub = relab_matrix_clean$group == g

      if(is.null(w) & (!is.null(time))){

        t_sub = (relab_matrix_clean$time)[group_sub]
        w_sub = time_weights(t_sub)
        T = max(t_sub) - min(t_sub)

        df_list[[i]] = relab[group_sub,][rep(1:length(w_sub), round(w_sub*T*2)),]

      }else  if(is.null(time) & !is.null(w)){
        df_list[[i]] = relab[group_sub,][rep(1:length(w), round(w*800)),]
      }else{
        # warning("Please provide either time or w to relab_sample_weighter function.")
        return(relab)
      }
      i = i + 1
    }

    return(do.call(rbind, df_list))

  }

}


# helper function: arrange

arrange_categories <- function(relab_matrix, arrange, K = NULL, group = NULL, time = NULL){

  if(arrange == FALSE){
    relab_checker_out = relab_checker(relab = relab_matrix, K = K, group = group, time = time)
    relab_matrix_clean = relab_checker_out$relab_matrix
    relab_matrix_clean = relab_matrix_clean[,colSums(relab_matrix_clean) > 0]
  }else{

  relab_checker_out = relab_checker(relab = relab_matrix, K = K, group = group, time = time)

  K = ncol(relab_checker_out$relab_matrix)

  relab_matrix_clean = relab_checker_out$relab_matrix
  relab_matrix_clean = relab_matrix_clean[,colSums(relab_matrix_clean) > 0]

  if(is.null(colnames(relab_matrix_clean))){
    colnames(relab_matrix_clean) = paste0("cat_", 1:K)
  }


  if(any(colnames(relab_matrix_clean) %in% as.character(1:1000))){
    warning("relab_matrix has at least one numeric column name, which can cause errors when re-arranging the columns. The columns are being automatically renamed in order to avoid errors. If you wish to avoid this renaming, please rename the columns of relab_matrix.")
    colnames(relab_matrix_clean) = paste0("cat_", 1:K)
  }


    clustermeans <- colMeans(relab_matrix_clean) %>% sort() %>% rev
    ordernames <- c(names(clustermeans))

    if(sum(clustermeans != 0) != K){
      warning(paste0("Only plotting the ", sum(clustermeans != 0),
                     " categories with non-zero abundances. If you are manually changing the fill or color of the plot, you can provide ",
                     sum(clustermeans != 0), " colors, instead of ", K, "."))
    }

    if(arrange %in% c(TRUE, "both")){

      relab_matrix_clean <- data.frame(relab_matrix_clean) %>%
        dplyr::arrange(dplyr::across({{ ordernames }})) %>%
        dplyr::select(c(names(which(clustermeans != 0))))

    }else if(arrange == "vertical"){
      relab_matrix_clean <- data.frame(relab_matrix_clean) %>%
        # dplyr::arrange(dplyr::across({{ ordernames }})) %>%
        dplyr::select(c(names(which(clustermeans != 0))))


    }else if(arrange == "horizontal"){
      relab_matrix_clean <- data.frame(relab_matrix_clean) %>%
        dplyr::arrange(dplyr::across({{ ordernames }}))# %>%
      # dplyr::select(c("group", names(which(clustermeans != 0))))
    }else{
      stop("The options for arrange are FALSE, TRUE, vertical, horizontal, or both. TRUE and both are interchangeable.")
    }

  }

  if(!is.null(group)){
    relab_matrix_clean = relab_matrix_clean %>% data.frame %>% dplyr::mutate(group = relab_checker_out$group, .before = 1)
  }

  if(!is.null(time)){
    relab_matrix_clean = relab_matrix_clean %>% data.frame %>% dplyr::mutate(time = relab_checker_out$time, .before = 1)
  }

  return(relab_matrix_clean %>% data.frame)



}

Try the FAVA package in your browser

Any scripts or data that you put into this service are public.

FAVA documentation built on April 4, 2025, 4:47 a.m.