R/utils_metadata_manip.R

Defines functions determine.meta.var.classes prune.metadata.vars get.metadata.vars correct.metadata

Documented in correct.metadata get.metadata.vars

#'@title Check and correct types of metadata table
#'@description TBD
#'@param df.meta a tibble of the metadata with `SampleID` column
#'@param meta.sub default NULL or a character vector containing a subset of variables
#'@param exceptions default NULL or a character vector containing variables with
#'     more than 5 factor levels that should not be typecast as a character
#'@param analysis default NULL or a string triggering specific actions (e.g. 'metacardis')
#'@return a tibble with `meta.sub` columns and all variable types updated
#'@importFrom dplyr select
#'@importFrom purrr imap_dfc map_chr
#'@importFrom stringr str_split
#'@export
correct.metadata <- function(df.meta, meta.sub = NULL,
                             exceptions = NULL, analysis = NULL) {

  if (is.null(meta.sub)) meta.vars <- colnames(df.meta)
  else if (!('SampleID' %in% meta.sub)) meta.vars <- c('SampleID', meta.sub)
  else meta.vars <- meta.sub #check validity later

  out <- df.meta %>%
    dplyr::select(dplyr::one_of(meta.vars)) %>%
    purrr::imap_dfc(function(.x, .y) {
      browser()

      if ((!is.null(exceptions)) & (.y %in% exceptions)) as.factor(.x)
      else if (is.factor(.x) || is.double(.x)) .x
      else if (length(unique(.x)) == 1) as.character(.x)
      else if (length(unique(.x)) == 2) {
        # yes/no values with or without NAs
        if ('yes' %in% tolower(.x)) {
          if (any(is.na(.x))) factor(tolower(.x),
                                     levels = c('yes','no', NA),
                                     labels = c(1,0,NA))
          else factor(tolower(.x),
                      levels = c('yes','no'),
                      labels = c(1,0))}
        # binary values with NAs
        else if (any(is.na(.x))) factor(.x,
                                        levels = c(unique(.x), NA),
                                        labels = c(seq(length(unique(.x))), NA))
        # binary values without NAs
        else factor(.x, levels = c(unique(.x)), labels = c(1,2))}
      else if (length(unique(.x)) == 3) { #might be binary +NAs
        if (NA %in% unique(.x)) {factor(tolower(.x),
                                       levels = c('yes','no',NA), #assumption its yes/no+NA...
                                       labels = c(1,0)) %>%
            forcats::fct_explicit_na(na_level = NA)}
        else factor(.x, levels = c(unique(.x)), labels = c(seq(length(unique(.x)))))}
      else if (length(unique(.x)) < 6) {
        if (any(is.na(.x))) forcats::fct_explicit_na(.x, na_level = NA)
        else factor(.x,
                    levels = unique(.x),
                    labels = seq(length(unique(.x))))}
      else as.character(.x)

    })

  if (analysis == 'metacardis') {
    vars.update <- meta.vars %>%
      stringr::str_split('_') %>%
      purrr::map_chr(~ head(., n=1))
    colnames(out) <- vars.update
  }

  return(out)

}


#'@title Get name and type of metadata variables available in data
#'@description TBD
#'@param df.stats a tibble with feature rows + nested data column
#'@return a character vector with names = variables and content = variable type
#'@importFrom magrittr use_series extract2
get.metadata.vars <- function(df.stats) {
  df.stats %>%
    magrittr::use_series(data) %>%
    magrittr::extract2(1) %>%
    ansimo::determine.meta.var.classes()
}

prune.metadata.vars <- function(meta.vars) {
  vars <- list()
  vars$keep <- meta.vars[which(meta.vars == 'factor')]
  vars$discard <- meta.vars[which(meta.vars != 'factor')]
  return(vars)
}

determine.meta.var.classes <- function(df.single.feat) {

  df.single.feat %>%
    dplyr::select(-c('SampleID','abundance')) %>%
    purrr::map_chr(~ class(.))
}

extract.metadata.abundances <- function(df.stats, s.var, label) {
  #browser()
  df.stats %>%
    dplyr::select(c('feature','data')) %>%
    dplyr::mutate(data = purrr::map(data, ~ dplyr::select(., c('abundance', s.var, label))))
}

#'@title Represent a metadata table as a single string
#'@description TBD
#'@param df.meta a metadata tibble
#'@return a character vector of length 1 (a single string)
#'@importFrom purrr imap
#'@importFrom stringr str_replace_na str_flatten
characterize.metadata <- function(df.meta) {

  data <- df.meta %>%
    purrr::imap(function(.x, .y) {
      vals <- .x %>%
        stringr::str_replace_na(replacement = 'NA') %>%
        stringr::str_flatten(collapse = '.')
      paste0('_', .y, '_', vals)}) %>%
    unlist(use.names = FALSE) %>%
    stringr::str_flatten()
}

#'@title Reconstruct a metadata table from string representation
#'@description TBD
#'@param meta.str metadata represented as a string with '_' flanking
#'     variable/column names and '.' separating individual values
#'@return a tibble
#'@importFrom purrr imap pluck map flatten
#'@importFrom stringr str_split
#'@importFrom magrittr extract set_names
#'@importFrom dplyr bind_cols mutate_if
reconstruct.metadata <- function(meta.str) {

  meta.list <- meta.str %>%
    stringr::str_split('[_]') %>%
    purrr::pluck(1) %>%
    magrittr::extract(-1) %>%
    as.list()

  cols <- meta.list[seq(1, length(meta.list), 2)]
  raw.data <- meta.list[seq(2, length(meta.list), 2)]

  raw.data %>%
    magrittr::set_names(cols) %>%
    purrr::map(~ stringr::str_split(., '[.]')) %>%
    purrr::flatten() %>%
    dplyr::bind_cols() %>%
    dplyr::mutate_if(is.character, ~ as.factor(.))
}
sxmorgan/ansimo documentation built on June 26, 2020, 7:59 p.m.