ansimo: ANalysis and SIMulation of Omics data

Documented in feat.summary.stats

#'@title Calculates summary statistics for each feature
#'@description Reshapes the input to long format, nests the rows of each
#'     feature into a list-column and calculates, per feature, the dynamic
#'     range, the prevalence (proportion of nonzero samples), the mean
#'     abundance, the skewness, the kurtosis and the standard deviation
#'@param df a data frame with a \code{SampleID} column and one column per
#'     feature (every column other than \code{SampleID} is treated as a
#'     feature)
#'@return a nested data frame with one row per feature: the list-column
#'     \code{data} holds the long-format rows of that feature, followed by the
#'     columns \code{dynamic_range}, \code{prevalence}, \code{mean_ab},
#'     \code{skewness}, \code{kurtosis} and \code{sd}
#'@importFrom dplyr group_by filter select summarize summarize_all n mutate
#'@importFrom tidyr nest gather
#'@importFrom purrr map map_dbl
#'@importFrom moments skewness kurtosis
#'@importFrom magrittr %>%
#'@export
feat.summary.stats <- function(df) {

  df %>%
    # long format: one row per (SampleID, feature) pair
    tidyr::gather(key = 'feature', value = 'abundance', -SampleID) %>%
    dplyr::group_by(feature) %>%
    tidyr::nest() %>%
    # as_tibble is re-exported by dplyr; qualify it so the call does not
    # depend on tibble being attached
    dplyr::mutate(data = purrr::map(data, dplyr::as_tibble)) %>%
    dplyr::mutate(
      # dyn.rng is called directly (not via ansimo::) so that it works
      # whether or not the helper is exported
      dynamic_range = purrr::map_dbl(data, ~ dyn.rng(.x$abundance)),
      # share of rows with a nonzero abundance; NA abundances are not counted
      # as nonzero but still contribute to the denominator
      prevalence = purrr::map_dbl(
        data,
        ~ sum(.x$abundance != 0, na.rm = TRUE) / nrow(.x)
      ),
      mean_ab = purrr::map_dbl(data, ~ mean(.x$abundance)),
      skewness = purrr::map_dbl(data, ~ moments::skewness(.x$abundance)),
      kurtosis = purrr::map_dbl(data, ~ moments::kurtosis(.x$abundance)),
      sd = purrr::map_dbl(data, ~ stats::sd(.x$abundance))
    )

}

#'@title Calculates summary statistics for already nested feature data
#'@description for every row of a nested data frame, calculates the dynamic
#'     range, the prevalence, the mean abundance, the standard error of the
#'     mean, the skewness, the kurtosis and the standard deviation of the
#'     \code{abundance} column stored in the \code{data} list-column
#'@param df.nest a nested data frame with a list-column \code{data} whose
#'     elements each contain an \code{abundance} column
#'@return \code{df.nest} with the additional columns \code{dyn.rng},
#'     \code{prev}, \code{mean.ab}, \code{sem}, \code{skew}, \code{kurt} and
#'     \code{sd}
feat.stratified.stats <- function(df.nest) {

  dplyr::mutate(
    df.nest,
    dyn.rng = purrr::map_dbl(data, ~ dyn.rng(.x$abundance)),
    prev = purrr::map_dbl(data, ~ calc.single.prevalence(.x$abundance)),
    mean.ab = purrr::map_dbl(data, ~ mean(.x$abundance)),
    # standard error of the mean is sd / sqrt(n), not sd / n
    sem = purrr::map_dbl(
      data,
      ~ stats::sd(.x$abundance) / sqrt(length(.x$abundance))
    ),
    skew = purrr::map_dbl(data, ~ moments::skewness(.x$abundance)),
    kurt = purrr::map_dbl(data, ~ moments::kurtosis(.x$abundance)),
    sd = purrr::map_dbl(data, ~ stats::sd(.x$abundance))
  )
}


# takes the log10 of the ratio max:min of the nonzero values
# (found in publication)
# other possible ways to calculate include difference, base2 difference,
# and a raw ratio of min and max values
#'@title Calculates the dynamic range of a given feature
#'@description the dynamic range may be defined as the log of the ratio
#'     of the highest and lowest values a feature may take; zeros are
#'     excluded before the ratio is taken
#'@param x a vector of relative abundances
#'@return a single numeric value; \code{NaN} if \code{x} contains no nonzero
#'     values, \code{NA} if \code{x} contains missing values
dyn.rng <- function(x) {
  nonzero <- x[x != 0]

  # max()/min() on an empty vector warn and yield -Inf/Inf, whose ratio is
  # NaN; return that result directly instead of going through the warnings
  if (length(nonzero) == 0L) {
    return(NaN)
  }

  extremes <- range(nonzero)
  log10(extremes[2] / extremes[1])
}