R/metadata.R

Defines functions extract_metadata modify_metadata_values modify_metadata_names import_metadata

# Functions for optional metadata preprocessing
# Author: Mathias Kuhring


# Import and merge additional matedata
import_metadata <- function(
  data,
  metadata_file,
  column_duplicates = c("rename", "replace", "omit")[1]
){
  # Check parameters
  assertthat::assert_that(file.exists(metadata_file))
  assertthat::assert_that(length(column_duplicates) == 1)
  assertthat::assert_that(column_duplicates %in% c("rename", "replace", "omit"))

  # Load additional metadata/annotation
  encoding <- readr::guess_encoding(file = metadata_file)
  cat(paste0("Guessed encoding ", encoding[[1,1]], "."))
  if (get_file_extension(metadata_file) == "csv") {
    df_annotation <- suppressMessages(readr::read_csv(
      file = metadata_file,
      locale = readr::locale(encoding = encoding[[1,1]]),
      na = c("", "NA", "\xc2\xa0"), # remove non-breaking spaces
      trim_ws = TRUE
    ))
  } else if (get_file_extension(metadata_file) %in% c("tsv", "txt")) {
    df_annotation <- suppressMessages(readr::read_tsv(
      file = metadata_file,
      locale = readr::locale(encoding = encoding[[1,1]]),
      na = c("", "NA", "\xc2\xa0"), # remove non-breaking spaces
      trim_ws = TRUE
    ))
  } else {
    message(paste0(
      "Error: Unsupported text file format '",
      get_file_extension(metadata_file),
      "'. Use csv (comma-separated) or tsv/txt (tab-separated!"))
  }

  # Make sure that the sample identification column is available in the
  # annotation dataframe and that it is of charactere type
  to_rename <- which("Sample Identification" == names(df_annotation))
  if (length(to_rename) > 0) { names(df_annotation)[to_rename] <- "Sample.Identification"}
  assertthat::assert_that("Sample.Identification" %in% names(df_annotation))
  assertthat::assert_that(sum("Sample.Identification" == names(df_annotation)) == 1)
  df_annotation <- df_annotation %>%
    mutate(Sample.Identification = as.character(Sample.Identification)) %>%
    # Make sure ID is up front, otherwise potential renaming of duplicate
    # columns might overwrite wrong columns (as it is index based)
    select(Sample.Identification,  everything())

  # Handle duplicated columns
  data_cols <- names(data)
  meta_cols <- names(df_annotation %>% select(-Sample.Identification))
  col_dups <- intersect(data_cols, meta_cols)
  if (length(col_dups) > 0){
    if (column_duplicates == "rename") {
      renamed <- make.unique(c(data_cols, meta_cols))
      names(df_annotation)[(2):(length(meta_cols)+1)] <-
        renamed[(length(data_cols)+1):(length(renamed))]
    } else if (column_duplicates == "replace") {
      data <- data %>% select(-one_of(col_dups))
    } else { # "omit"
      df_annotation <- df_annotation %>% select(-one_of(col_dups))
    }
  }

  # Merge original data with metadata
  data <- left_join(data, df_annotation, by="Sample.Identification")

  return(data)
}


# Rename columns in the data
# Use a named vector:
# c(oldname1 = "newname1", oldname2 = "newname2", ...)
modify_metadata_names <- function(
  data,
  mapping
){
  assertthat::assert_that(length(mapping) > 0)
  assertthat::assert_that(class(mapping) == "character")
  assertthat::assert_that(!is.null(names(mapping)))
  assertthat::assert_that(!any(is.na(names(mapping))))

  # Make sure new names are unique
  renamed <- make.unique(c(names(data), mapping))
  old_names <- names(mapping)
  new_names <- renamed[(length(names(data))+1):(length(renamed))]
  mapping <- new_names
  names(mapping) <- old_names

  # Rename columns
  idx <- which(names(data) %in% names(mapping))
  names(data)[idx] <- mapping[names(data)[idx]]

  return(data)
}


# Change values in the data
# Use a named list with named vectors per columns, e.g.:
# list("columnX" = c("oldvalueA" = "newvalueA", "oldvalueB" = "newvalueB"),
#      "columnY" = c("1" = 5, "3" = 6, ...), ...)
modify_metadata_values <- function(
  data,
  mapping
){
  assertthat::assert_that(length(mapping) > 0)
  assertthat::assert_that(is.list(mapping))
  assertthat::assert_that(!is.null(names(mapping)))
  assertthat::assert_that(!any(is.na(names(mapping))))
  for (value_mods in mapping){
    assertthat::assert_that(length(value_mods) > 0)
    assertthat::assert_that(!is.null(names(value_mods)))
    assertthat::assert_that(!any(is.na(names(value_mods))))
    assertthat::assert_that(
      class(value_mods) %in%
        c("character", "complex", "integer", "logical", "numeric")
    )
  }

  # Remove mappings of unavailable columns
  mapping <- mapping[names(mapping) %in% names(data)]

  # Iterate mappings per column
  cols <- names(mapping)
  for (col in cols){
    # Recode column values
    data <- data %>%
      mutate(!!sym(col) := dplyr::recode(!!sym(col), !!!mapping[[col]]))
  }

  return(data)
}


# Extract additional metadata from a character column (e.g. Sample.Identifier)
extract_metadata <- function(
  data,
  output_column,
  split_number,
  input_column = "Sample.Identification",
  split_string = "-",
  sample_type = SAMPLE_TYPE_BIOLOGICAL
){
  data %>%
    mutate(
      UQ(sym(output_column)) := if_else(
        Sample.Type == sample_type,
        stringr::str_split_fixed(
          string = UQ(sym(input_column)), pattern = split_string,
          n = split_number)[,split_number],
        NA_character_)) %>%
    return()
}
bihealth/metaquac documentation built on Aug. 7, 2021, 8:40 a.m.