ansimo: ANalysis and SIMulation of Omics data

Documented in remove.zero.feat remove.zero.samples

#'@title Remove zero features
#'@description iterates over a data.frame and removes features where all values = 0
#'@param df a data frame of abundances
#'@return df with all-zero columns removed
#'@importFrom dplyr summarize_if select_if select one_of
#'@export
remove.zero.feat <- function(df) {

  cols <- df %>%
    dplyr::summarize_if(is.numeric, sum) %>%
    tidyr::gather('feat','sums') %>%
    dplyr::filter(sums == 0) %>%
    magrittr::use_series(feat)

  if (length(cols) > 0) dplyr::select(df, -dplyr::one_of(cols))
  else df
}

#'@title Remove zero samples
#'@description iterates over a data.frame and removes samples where all values = 0
#'@param df a data frame of abundances
#'@return df with all-zero rows removed
#'@importFrom dplyr mutate filter select one_of
#'@export
remove.zero.samples <- function(df) {

  IDs <- dplyr::select(df, SampleID)

  df %>%
    dplyr::select_if(is.numeric) %>%
    dplyr::mutate(rowsums = rowSums(.)) %>%
    tibble::add_column(SampleID = IDs$SampleID, .before = 1) %>%
    dplyr::filter(rowsums != 0) %>%
    dplyr::select(-rowsums)
}


#'@title Join metadata to data
#'@description joins selected metadata variables to the data list-column
#'     of a feature data frame
#'@param df.stats the data frame of feature names, raw data, and summary stats
#'@param df.meta the data frame of metadata
#'@return an altered df.stats in which the data list-column now has metadata appended
#'@importFrom purrr map
#'@importFrom dplyr inner_join
join.selected.metadata <- function(df.stats, df.meta) {

  df.stats %>%
    mutate(data = purrr::map(data, ~ dplyr::inner_join(., df.meta, by = 'SampleID')))
}


# if an unclassified column named '-1' exists, this function removes that column
# after subtracting it from the row (feature) sum total and re-normalizing the other
# feature values; motivated by the weird abundance values in the genus data
rescale.abundances <- function(df) {

    if (exists('-1', df)) {
    row.sums <- df %>%
      select_if(is.numeric) %>%
      rowSums()

    unclassified <- df %>%
      select(one_of('-1')) %>%
      as.matrix() %>% as.numeric()

    scale.factor <- row.sums - unclassified

    df %>%
      select(-one_of('-1')) %>%
      mutate_if(is.numeric, ~ ./scale.factor)
  }

  else return(df)
}

#'@title Transform counts to relative abundances
#'@description Divides counts by row sums
#'@param df a data frame of abundances
#'@return df
#'@importFrom dplyr select_if mutate_all
#'@importFrom tibble add_column
#'@export
count.to.abundances <- function(df) {

  labels <- select_if(df, is.character)
  rowsums <- rowSums(select_if(df, is.double))
  df %>%
    select_if(is.numeric) %>%
    mutate_all(~ ./rowsums) %>%
    add_column(SampleID = labels$SampleID, .before = 1)
}


### SHORT UTILITY FUNCTIONS


adjust.repeats <- function(bool, x, y) {ifelse(bool == TRUE, x, y)}
remove.zero.rows <- function(df) {df[which(rowSums(df) != 0),]}
check.nonzero <- function(x) {!all(x == 0)}
check.rnorm.sum <- function(x) {sum(x) > 0}
count.zeros <- function(x) {sum(x == 0)}
pct.zeros <- function(x) {sum(x == 0)/length(x)*100}
mean.na.rm <- function(x) {mean(!is.na(x))}



#'@title Calculates alpha diversity metrics for each sample in a dataset
#'@description currently computes sample richness and diverity (Shannon index)
#'@param df a data frame of features x samples
#'@return a nested data frame
#'@importFrom dplyr group_by filter select summarize_all full_join
#'@importFrom tidyr nest unnest gather spread
#'@importFrom vegan diversity
#'@export
sample.summary.stats <- function(df) {

  if (exists('Timepoint', df)) {
    df <- df %>%
      group_by(Timepoint) %>%
      nest() %>%
      filter(Timepoint == 'M0') %>%
      unnest() %>%
      select(-one_of('Timepoint')) %>%
      gather(key = 'feature', value = 'value', 2:ncol(.)) %>%
      spread(1, value)
  }

  else {
    df <- df %>%
      gather(key = 'feature', value = 'value', 2:ncol(.)) %>%
      spread(1, value)
  }

  temp <- data.frame('SampleID' = colnames(select(df, -one_of('feature'))), stringsAsFactors = FALSE)

  # Richness
  temp <- df %>%
    summarize_all(n_distinct) %>%
    gather(key = 'SampleID', value = 'Richness', 2:ncol(.)) %>%
    select(-one_of('feature')) %>%
    full_join(temp, by = 'SampleID')

  # Shannon index
  temp <- df %>%
    summarize_if(is.double, ~ diversity(., 'shannon')) %>%
    gather(key = 'SampleID', value = 'Diversity') %>%
    full_join(temp, by = 'SampleID')

}