aftersl1p: Generate Summary Graphics and Basic Analysis of 16S Data

Documented in df_glom rank_abund subset_order

# rank_abund ---------------------------------------------------------------

#' Rank Taxa by Abundance
#'
#' `rank_abund()` generates a data frame that's ready to be used by
#' [plot_rank_ab()].
#'
#' Abundances are first totalled within each sample at the requested rank with
#' [df_glom()]. The taxon ordering is then computed by [subset_order()] on those
#' totals (restricted to the baseline samples when `gvars` and `bases` are
#' given) and applied as the factor levels of the `rank` column of the full,
#' un-subsetted data frame.
#'
#' @param phy_df A dataframe of a phyloseq object, like that generated by
#'   [phyloseq::psmelt()] or [make_phy_df()]
#' @param gvars (`NULL`) A character vector of grouping variables from which the
#'   baseline values are chosen to define the abundance ordering. If it is
#'   `NULL`, the ordering will be based on mean abundances in the whole data
#'   frame.
#' @param bases (`NULL`) A character vector of baseline values for the variables
#'   given in `gvars`. The ordering of the taxa will be given based only on the
#'   samples with these baseline values for these variables. Must be in the same
#'   order as `gvars`.
#' @param abunds (`'Abundance'`) The name of the abundance column.
#' @param rank (`'Genus'`) The rank to base the ordering on. Must be a column in
#'   `phy_df`
#' @param IDcol (`'X.SampleID'`) The column name of the sample IDs
#'
#' @return The data frame returned by [df_glom()] (all rows of `phy_df` plus a
#'   `TotalAbunds` column), in which the `rank` column is a factor whose levels
#'   are those of the ordered data frame returned by [subset_order()]. Values
#'   of `rank` that are not among those levels become `NA`.
rank_abund <- function(phy_df, gvars = NULL, bases = NULL, abunds = 'Abundance',
                       rank = 'Genus', IDcol = 'X.SampleID'){
    # Per-sample totals at the requested rank (adds the TotalAbunds column)
    rank_abs <- df_glom(phy_df, IDcol = IDcol, rank = rank, abunds = abunds)

    # Taxon ordering from the baseline subset (or the whole data set)
    ranked <- subset_order(rank_abs, gvars, bases, rank = rank)

    # Carry that ordering over to the full data frame as factor levels.
    # NOTE(review): relies on subset_order() returning `rank` as a factor;
    # levels() of a non-factor is NULL.
    lev_ord <- levels(ranked[[rank]])
    rank_abs[[rank]] <- factor(rank_abs[[rank]], levels = lev_ord)

    rank_abs
}

# subset_order -------------------------------------------------------------

#' Subset and generate taxon ordering
#'
#' `subset_order` generates a data frame whose taxon column given by
#' `rank` has been ranked according to its mean abundance in the
#' `abunds` column. Used internally by
#' [rank_abund()]
#'
#' If either `varbs` or `bases` is `NULL`, a warning is issued, no rows are
#' filtered out, and the means are taken per taxon over the whole data frame.
#'
#' @param phy_df A phyloseq data frame, as generated by
#'   [phyloseq::psmelt()], but probably generated by
#'   [df_glom()] or
#'   [make_phy_df()].
#' @param varbs (`NULL`) A character vector of grouping variables from
#'   which the baseline values are chosen to define the abundance ordering. If
#'   it is `NULL`, the ordering will be based on mean abundances in the
#'   whole data frame.
#' @param bases (`NULL`) A character vector of baseline values for the
#'   variables given in `varbs`. The ordering of the taxa will be given
#'   based only on the samples with these baseline values for these variables.
#'   Must be in the same order as `varbs`.
#' @param rank (`'Genus'`) The taxonomic rank to base the ordering on.
#' @param abunds (`'TotalAbunds'`) The name of the abundance column.
#'
#' @return The result of `order_taxa()` applied to a data frame with one row
#'   per combination of `varbs` and `rank` and a `MetaMean` column holding the
#'   mean of `abunds` for that combination.
subset_order <- function(phy_df, varbs = NULL, bases = NULL, rank = 'Genus',
                         abunds = 'TotalAbunds'){
    # Check inputs. Base warning() is used so that this check does not depend
    # on a `warn` helper being imported into the package namespace.
    if (is.null(varbs)){
        warning('No grouping variables given. Using whole data set.',
                call. = FALSE)
    } else if (is.null(bases)){
        warning(paste('No baseline values given for grouping variables.',
                      'Using whole data set.'),
                call. = FALSE)
        # Without baselines there is nothing to subset on; drop the grouping
        # variables so the means really are taken over the whole data set.
        varbs <- NULL
    } else if (length(varbs) != length(bases)){
        stop('varbs and bases must have the same length.')
    }

    # Keep only the samples at the baseline value of each grouping variable.
    # seq_along() makes this a no-op when varbs is NULL (1:length(NULL) would
    # iterate over c(1, 0)).
    ranked <- phy_df
    for (i in seq_along(varbs)){
        col_i <- sym(varbs[i])
        val_i <- bases[i]
        # Both sides are injected so that columns of phy_df named `bases` or
        # `i` cannot shadow the baseline value inside the data mask.
        ranked <- dplyr::filter(ranked, (!!col_i) == (!!val_i))
    }

    # Mean abundance per taxon (within the baseline subset), then order taxa
    # by decreasing mean.
    ranked <- dplyr::group_by_at(ranked, c(varbs, rank))
    ranked <- dplyr::summarize(ranked, MetaMean = mean(!!sym(abunds)))
    ranked <- data.frame(ranked)

    order_taxa(ranked, rank, 'MetaMean', decreasing = TRUE)
}

# df_glom ------------------------------------------------------------------

#' Like tax_glom, but for data frames
#'
#' [df_glom()] takes totals within sample at a given taxonomic rank. The rows
#' of `phy_df` are kept as they are; the total for each sample/taxon
#' combination is added as a new column, repeated on every row of that
#' combination.
#'
#' @param phy_df A phyloseq data frame, as generated by
#'   [phyloseq::psmelt()] or [make_phy_df()]
#' @param ranks A character vector with the taxon rank names. Currently unused;
#'   kept so that existing calls keep working.
#' @param IDcol (`'X.SampleID'`) The column name of the sample IDs
#' @param rank (`'Phylum'`) The taxonomic rank to glom at
#' @param abunds (`'Abundance'`) The column name of the abundances to sum
#' @param tots (`'TotalAbunds'`) The desired column name of the summed
#'   (glommed) abundances
#'
#' @return `phy_df` as a plain data frame with the additional `tots` column.
df_glom <- function(phy_df, ranks, IDcol = 'X.SampleID', rank = 'Phylum',
                    abunds = 'Abundance', tots = 'TotalAbunds'){

    # Group by sample and taxon. The column names are passed as a character
    # vector so they are always treated as names, even if phy_df happens to
    # have columns literally called `IDcol` or `rank`.
    glommed_df <- dplyr::group_by_at(phy_df, c(IDcol, rank))

    # mutate() rather than summarize(): every input row is kept and carries
    # its group's total.
    glommed_df <- dplyr::mutate(glommed_df, !!tots := sum(!!sym(abunds)))

    # Drop the grouping and tibble classes
    data.frame(glommed_df)
}