phyloseq-extended:

#' Use information from Vetroský and Baldrian (2013) to correct for 16S rRNA copy number variation. 
#' Given a taxonomic level (Phylum or Class within Proteobacteria), returns the average copy number
#' (CN) within that level in the database constructed by the authors. 
#' 
#' @title sample_copy_number_internal
#' @param level Required. Character. Taxonomic level for which the CN is required.
#' @param dispersion. Optional. Logical. Should the returned CN be the average within
#'                    the level or sampled according to the distribution within that level.
#'                    Distribution is approximated by a normal with mean and sd set to those
#'                    observed in the sample database. Defaults to FALSE.
#' @note The copy number is truncated to the [1, 15] range observed in the database. Levels
#'       not observed in the database have CN set to 4.2 or sample from \code{rnorm(1, 4.2, 2.7)}
#'       which correspond to the global mean and sd in the database.
#' @return A numeric value corresponding to imputed CN. 
#' @references Vetroský, T. and Baldrian, P.(2013). The variability of the 16S rRNA Gene in Bacterial
#'             Genomes and Its Consequences for Bacterial Community Analyses. _Plos One_ *8*(2):e57923
#' @seealso \code{\link{sample_copy_number}}
sample_copy_number_internal <- function(level, dispersion = FALSE) {
    known.levels <- c("Acidobacteria", "Actinobacteria", "Aquificae", "Bacteroidetes", "Caldiserica",
                      "Chlamydiae", "Chlorobi", "Chloroflexi", "Cyanobacteria", "Defferibacteres",
                      "Deinoccocus-Thermus", "Dyctioglomi", "Elusimicrobia", "Fibrobacteres",
                      "Firmicutes", "Fusobacteria", "Gemmatimonadetes", "Ignavibacteria",
                      "Nitrospirae", "Planctomycetes", "Alphaproteobacteria", "Betaproteobacteria",
                      "Deltaproteobacteria", "Epsilonproteobacteria", "Gammaproteobacteria",
                      "Spirochaetes", "Synergistetes", "Tenericutes", "Thermodelsufobacteria",
                      "Thermotogae", "Verrucomicrobia")
    if (level %in% known.levels) {
        if (dispersion) {
            copy.number <- switch(level,
                                  "Acidobacteria" = rnorm(1, 1, 0),
                                  "Actinobacteria" = rnorm(1, 3.1, 1.7),
                                  "Aquificae" = rnorm(1, 2, 0.6),
                                  "Bacteroidetes" = rnorm(1, 3.5, 1.5),
                                  "Caldiserica" = rnorm(1, 1, 0),
                                  "Chlamydiae" = rnorm(1, 1.4, 0.5),
                                  "Chlorobi" = rnorm(1, 1.7, 0.7),
                                  "Chloroflexi" = rnorm(1, 2.2, 1.2),
                                  "Cyanobacteria" = rnorm(1, 2.3, 1.2),
                                  "Defferibacteres" = rnorm(1, 2, 0),
                                  "Deinoccocus-Thermus" = rnorm(1, 2.7, 1),
                                  "Dyctioglomi" = rnorm(1, 2, 0),
                                  "Elusimicrobia" = rnorm(1, 1, 0),
                                  "Fibrobacteres" = rnorm(1, 3, 0),
                                  "Firmicutes" = rnorm(1, 5.8, 2.8),
                                  "Fusobacteria" = rnorm(1, 5, 0.7),
                                  "Gemmatimonadetes" = rnorm(1, 1, 0),
                                  "Ignavibacteria" = rnorm(1, 1, 0),
                                  "Nitrospirae" = rnorm(1, 2, 1.4),
                                  "Planctomycetes" = rnorm(1, 1.7, 0.8),
                                  "Alphaproteobacteria" = rnorm(1, 2.2, 1.3),
                                  "Betaproteobacteria" = rnorm(1, 3.3, 1.6),
                                  "Deltaproteobacteria" = rnorm(1, 2.7, 1.4),
                                  "Epsilonproteobacteria" = rnorm(1, 3, 1.1),
                                  "Gammaproteobacteria" = rnorm(1, 5.8, 2.8),
                                  "Spirochaetes" = rnorm(1, 2.4, 1),
                                  "Synergistetes" = rnorm(1, 2.5, 1),
                                  "Tenericutes" = rnorm(1, 1.6, 0.5),
                                  "Thermodelsufobacteria" = rnorm(1, 2, 0),
                                  "Thermotogae" = rnorm(1, 1.8, 1),
                                  "Verrucomicrobia" = rnorm(1, 1.8, 1)
                                  )
        } else {
            copy.number <- switch(level,
                                  "Acidobacteria" = 1,
                                  "Actinobacteria" = 3.1,
                                  "Aquificae" = 2,
                                  "Bacteroidetes" = 3.5,
                                  "Caldiserica" = 1,
                                  "Chlamydiae" = 1.4,
                                  "Chlorobi" = 1.7,
                                  "Chloroflexi" = 2.2,
                                  "Cyanobacteria" = 2.3,
                                  "Defferibacteres" = 2,
                                  "Deinoccocus-Thermus" = 2.7,
                                  "Dyctioglomi" = 2,
                                  "Elusimicrobia" = 1,
                                  "Fibrobacteres" = 3,
                                  "Firmicutes" = 5.8,
                                  "Fusobacteria" = 5,
                                  "Gemmatimonadetes" = 1,
                                  "Ignavibacteria" = 1,
                                  "Nitrospirae" = 2,
                                  "Planctomycetes" = 1.7,
                                  "Alphaproteobacteria" = 2.2,
                                  "Betaproteobacteria" = 3.3,
                                  "Deltaproteobacteria" = 2.7,
                                  "Epsilonproteobacteria" = 3,
                                  "Gammaproteobacteria" = 5.8,
                                  "Spirochaetes" = 2.4,
                                  "Synergistetes" = 2.5,
                                  "Tenericutes" = 1.6,
                                  "Thermodelsufobacteria" = 2,
                                  "Thermotogae" = 1.8,
                                  "Verrucomicrobia" = 1.8
                                  )

        }
    } else {
        copy.number <- ifelse(dispersion, rnorm(1, 4.2, 2.7), 4.2)
    }
    copy.number <- min(15, copy.number)
    copy.number <- max(1, copy.number)
    return(copy.number)
}

#' Use information from Vetroský and Baldrian (2013) to correct for 16S rRNA copy number variation. 
#' Given a vector of taxonomic level (Phylum or Class within Proteobacteria), returns the average copy
#' number (CN) for each level. The average are taken from the database reconstructed by the authors. 
#' 
#' @title sample_copy_number
#' @param level Required. Character. Taxonomic level for which the CN is required.
#' @param dispersion. Optional. Logical. Should the returned CN be the average within
#'                    the level or sampled according to the distribution within that level.
#'                    Distribution is approximated by a normal with mean and sd set to those
#'                    observed in the sample database. Defaults to FALSE. 
#' @note The copy number is truncated to the [1, 15] range observed in the database. Levels
#'       not observed in the database have CN set to 4.2 or sample from \code{rnorm(1, 4.2, 2.7)}
#'       which correspond to the global mean and sd in the database.
#' @return A numeric vector corresponding to imputed CN. 
#' @references Vetroský, T. and Baldrian, P.(2013). The variability of the 16S rRNA Gene in Bacterial
#'             Genomes and Its Consequences for Bacterial Community Analyses. _Plos One_ *8*(2):e57923
#' @seealso \code{\link{sample_copy_number_internal}}, \code{\link{extract_level}}
sample_copy_number <- Vectorize(sample_copy_number_internal, "level")


#' Extract taxonomic level from a \code{\link{phyloseq}} class object for use in
#' \code{\link{sample_copy_number}}. The required levels are "Phylum" or "Class" within
#' Proteobacteria
#'
#' @title extract_level
#' @param physeq Required. \code{\link{phyloseq}} class object with \code{tax_table} slot.
#' @return A level vector, for use in \code{\link{sample_copy_number}}
#' @seealso \code{\link{sample_copy_number}}
extract_level <- function(physeq) {
    res <- tax_table(physeq)[, "Phylum"]
    res[ res %in% "Proteobacteria" ] <- tax_table(physeq)[res %in% "Proteobacteria", "Class"]
    return(res)
}

#' Scale sample counts from a \code{\link{phyloseq}} class object according to a
#' scaling vector or using the Phylum (or Class within Proteobacteria) average copy number.
#' Average copy number are taken from Vetroský and Baldrian (2013).
#'
#' @title scale_sample_counts
#' @param physeq Required. \code{\link{phyloseq}} class object
#' @param scaling Optional. Either a numeric scaling factor or any
#'                of "Phylum" or "Abundance". If "Phylum", The scaling is reconstructed
#'                as the average copy number within "Phylum" or "Class" for Proteobacteria
#'                using \code{\link{sample_copy_number}}. If "Abundance", counts are moderated
#'                within each sample to shrink them towards evenness. See Details. 
#' @param dispersion Optional. Logical. Should the returned CN be the average within
#'                   the level or sampled according to the distribution within that level.
#'                   Distribution is approximated by a normal with mean and sd set to those
#'                   observed in the sample database. Defaults to FALSE. 
#' @return a \code{\link{phyloseq}} class objet with scaled \code{otu_table} slot.
#' @note The copy number inferred under "Phylum" scaling is truncated to the [1, 15] range observed
#'       in the database. Levels not observed in the database have CN set to 4.2 or sample from
#'       \code{rnorm(1, 4.2, 2.7)} which correspond to the global mean and sd in the database.
#'       Under "Abundance" scaling, counts are moderated within each sample by assuming that the most
#'       abundant taxa has CN of 15, the least abundant has CN of 1 and linear interpolation between
#'       these extremes. This effectively evens bacterial counts (as opposed to 16S rRNA gene counts). 
#' @references Vetroský, T. and Baldrian, P.(2013). The variability of the 16S rRNA Gene in Bacterial
#'             Genomes and Its Consequences for Bacterial Community Analyses. _Plos One_ *8*(2):e57923
#' @seealso \code{\link{sample_copy_number}}, \code{\link{extract_level}}
scale_sample_counts <- function(physeq, scaling = "Phylum", dispersion = FALSE) {
    ## Test scaling and reconstruct it if needed
    if (length(scaling) == 1 && scaling == "Abundance") {
        scaling_function <- function(x) {
            index <- (x > 0)
            x[index] <- x[index] / rescale(x[index], to = c(1, 15))
            return(x)
        }
    } else {
        if (length(scaling) == 1 && scaling == "Phylum") {
            scaling <- sample_copy_number(extract_level(physeq), dispersion = dispersion)
        } else {
            stopifnot(is.numeric(scaling), length(scaling) == ntaxa(physeq))
        }
        scaling_function <- function(x, x.scaling = scaling) {
            return(x / x.scaling[1:length(x)])
        }
    }
    return(transform_sample_counts(physeq, scaling_function))
}