aftersl1p: Generate Summary Graphics and Basic Analysis of 16s Data

Documented in make_phy_df order_taxa remain

##### make_phy_df --------------------------------------------------------------

#' Generate a Data Frame for Taxon Bar Charts
#'
#' `make_phy_df` generates a data frame that is useful for generating taxon
#' bar charts.
#'
#' @section Details: This function takes a phyloseq object and generates a data
#'   frame that is useful for plotting taxon abundance information. By default
#'   it propagates taxon assignment information down the tree into unassigned
#'   leves, and aggregates all taxa below 0.1 percent into a single 'Other'
#'   category. This function expects the phyloseq object to be relative
#'   abundance, and weird things will happen if it is not.
#'
#' @section Value: A data frame similar in structure to that generated by
#'   [phyloseq::psmelt()], but with an 'Other' category added and
#'   taxon levels ordered for use in plotting.
#'
#' @param physeq A phyloseq object.
#' @param rank The rank at which to glom taxa. Must be one of 'OTU', 'Genus',
#'   'Family', 'Order', 'Class', 'Phylum'. Default is 'Genus'.
#' @param cutoff The abundance cutoff below which taxa are grouped into 'Other'.
#'   If you don't want anything grouped into 'Other', set this to 0. Default is
#'   0.001.
#' @param indic a flag to indicate if the taxon names have level indicators. If
#'   FALSE, they are added.
#' @param prop Specifies whether taxa need to be propogated down the taxonomy
#'   table (default is `TRUE`) or if this has already been done.
#' @param count If `FALSE` (default) the function will expect a relative
#'   abundance table and create an 'Other' category for taxa below the cutoff
#'   (and will raise an error if the table is not relative abundance). If TRUE,
#'   the function will not check for relative abundance and will not create an
#'   'Other' category.
#' @export
make_phy_df = function(physeq, rank = 'Genus', cutoff = 0.001, indic = FALSE,
					   prop = TRUE, count = FALSE){


    # Check that its relab
    if (!count & any(phyloseq::otu_table(physeq) > 1)){
        stop('physeq must be a relative abundance table. You have counts > 1.')
    }
    ranks = colnames(phyloseq::tax_table(physeq))
    if ((rank == 'OTU') & !('OTU' %in% ranks)) {
        ranks = c(ranks, rank)
    }

    # Propogate taxonomic assignments down the tree
    if (prop){
    	physeq = prop_tax_down(physeq, indic)
    }

	# Glom to the correct taxonomic rank
    if (rank == 'OTU'){
        phyl_glommed = physeq
    } else {
        phyl_glommed = phyloseq::tax_glom(physeq, taxrank = rank)
    }

	# Set all counts < the cutoff to zero
	phyloseq::otu_table(phyl_glommed)[phyloseq::otu_table(phyl_glommed) <
	                                      cutoff] = 0
    # Check for sample_data() and create it with sample names if it's missing
	if (is_null(physeq@sam_data)){
	    samdat = data.frame('Sample' = phyloseq::sample_names(physeq))
	    rownames(samdat) = phyloseq::sample_names(physeq)
	    phyloseq::sample_data(physeq) = samdat
	}

	# Melt and sort, then filter out taxa that are 0 in their sample
	abunds = (phyl_glommed
	    %>% phyloseq::psmelt()
	    %>% data.frame()
	    %>% dplyr::filter(Abundance > 0))

	# List all the metadata columns so that they are included in the data frame
	metacols = colnames(phyloseq::sample_data(physeq))
	# Make an 'Other' row for each sample
	others = (abunds
	    %>% dplyr::group_by(across(all_of(metacols)))
	    %>% dplyr::summarize(Abundance=remain(Abundance), .groups = 'drop'))

	# Add in the taxonomic data columns
	taxcols = names(abunds)[match(ranks[1],names(abunds)):ncol(abunds)]
	if (rank == 'OTU'){
	    taxcols = c(taxcols, 'OTU')
	}

	if (! count){
	    others[taxcols] = 'Other'

	    # Combine the 'Other' data frame with the original
	    if (length(metacols) > 1){
	        newdf = abunds[,metacols]
	    } else {
	        newdf = data.frame(abunds[,metacols])
	        colnames(newdf) = metacols
	    }
	    newdf$Abundance = abunds$Abundance
	    newdf[,taxcols] = abunds[,taxcols]
	    newdf = rbind(as.data.frame(others),
	                  as.data.frame(newdf))
	} else {
	    newdf = abunds
	}

	# Order the tax cols by mean abundance
	for (r in taxcols){
	    newdf = order_taxa(newdf, r)
	}
	return(newdf)

}



##### remain -------------------------------------------------------------------

#' Calculate the difference between included taxa and 100% for the 'Other'
#' section of a phyloseq data frame.
#'
#' @section Value: a numeric vector
#'
#' @param x A numeric vector
#' @param tot The desired total. Default is 1.
remain = function(x, tot = 1){
    tot-sum(x)
}

##### order_taxa ---------------------------------------------------------------

#' Order Taxon Name Factors
#'
#' `order_taxa()` reorders the taxon names in a taxon column (e.g. 'Class' or
#' 'Phylum') by the taxon's mean abundance (but always makes sure to put Other
#' first).
#'
#' @section Value: A data frame that is identical to the one given, but with the
#'   specified column re-ordered by its mean abundance
#'
#' @param phy_df A data frame of a phyloseq object, as produced by
#'   [phyloseq::psmelt()] or [make_phy_df()].
#' @param rank The name of the column to be re-ordered
#' @param abund The name of the abundances column. Defaults to 'Abundance'
#' @param decreasing Specifies whether the taxon order should be based on
#'   decreasing or increasing abundance. Defaults to FALSE.
#' @export
order_taxa = function(phy_df, rank, abund = 'Abundance', decreasing = FALSE){

    phy_df[,rank] = factor(phy_df[,rank])
    total_abunds = (phy_df
                    %>% dplyr::filter(.data[[rank]] != 'Other')
                    %>% dplyr::group_by(.data[[rank]])
                    %>%	dplyr::summarize(Tot = sum(.data[[abund]]))
                    %>%	data.frame())

	lev_ord = levels(droplevels(total_abunds[,rank]))
	if (decreasing){
	    lev_ord = lev_ord[order(-total_abunds$Tot)]
	} else{
	    lev_ord = lev_ord[order(total_abunds$Tot)]
	}

	phy_df[,rank] = factor(phy_df[,rank], levels = c('Other',lev_ord))

    return(phy_df)
}