assayr2: Tools for analyzing assay data

Documented in get_ph_metadata

#' Import meta data from various PureHoney files.
#'
#' Reads meta data from sample databases on Box via `boxr::box_read()`
#' @md
#' @param study_id Character in the standard study name format (i.e. "RNO0101-1" or "RNO0101").
#' @param device Boolean, is this a device experiment? Only used for 'RNO' projects.
#' @param fully_parse Logical indicating whether to try and filter the database to a specific run; Set \code{FALSE} to troubleshoot.
#' @details Requires a column named `STUDY NAME` that contains the study ID in the
#' Excel file being read, in order to use the `fully_parse` option to work properly.
#' PAH experiments were originally recorded in box_file 101838632791 until November 2018.
#' In order to access these older projects adjust the study ID to be 'PAHO'.
#' @return A data frame containing the meta information from a given study or all the meta information from all studies in a program.
#' @examples
#' \dontrun{
#' ## Requires boxR credentials
#' # returns only meta data from study RNO0738
#' meta <- get_ph_metadata("RNO0738")
#' # returns all studies' meta data from RNO07 project
#' all_meta <- get_ph_metadata("RNO0738", fully_parse = F)
#' }
#' @importFrom stats setNames
#' @importFrom boxr box_read
#' @importFrom dplyr bind_rows filter
#' @importFrom purrr map_lgl
#' @export get_ph_metadata
get_ph_metadata <- function(study_id, device = FALSE, fully_parse = TRUE) {
  if (length(study_id) > 1) stop("get_ph_metadata() only accepts one study_id at a time.")
  
  # manually adjusted for new experiments
  box_files <- c(
    101838632791, 161176997524, 161176997524, 33980749645,
    185483159790, 71120788417, 243555362303, 101346468166,
    161176997524, 161176997524, 161176997524, 318145533469,
    318145533469, 318145533469, 325086451911, 409671319284,
    342600832787
  ) %>%
    setNames(c(
      "PAHO", "PAU", "PAG", "RNO",
      "HEM", "RSQ", "PAD", "PAF",
      "PAC", "PAB", "PAP", "PAN",
      "PAA", "PAM", "PYP", "PAHT",
      "PAH"
    ))
  # manually map for sheet names of interest
  if (device == FALSE) {
    sheet_names <- c(
      "PAH", "PAU_Huh7", "PAG_HepG2", "RNO07 Static",
      "RNO20 (PBMC)", "HEM01", "RSQ05 MF", "RSQ01 Primary Heps Static",
      "RNO21 (PAHTEE)", "RNO08 - fibroblasts", "PAD", "PAF",
      "PAC_Cardiomyocytes", "PAB_PBMCs", "RNO01 -cell lines", "PAP_Hap1",
      "PAN_Neurons", "PAM_skeletal muscle", "PAA_Astrocytes", "PYP",
      "PAHT", "PAH"
    ) %>%
      setNames(c(
        "PAHO", "PAU", "PAG", "RNO07",
        "RNO20", "HEM", "RSQ05", "RSQ01",
        "RNO21", "RNO08", "PAD", "PAF",
        "PAC", "PAB", "RNO01", "PAP",
        "PAN", "PAM", "PAA", "PYP",
        "PAHT", "PAH"
      ))
  }
  if (device == TRUE) {
    sheet_names <- c("RNO07 Device") %>%
      setNames("RNO07")
  }
  
  program <- gsub("^(\\D{3,4}).*", "\\1", study_id)
  if (program %in% c("RNO", "RSQ")) program <- gsub("^(\\D{3}\\d{2}).*", "\\1", study_id)
  
  meta <- boxr::box_read(box_files[program],
                         which = sheet_names[program],
                         col_names = T,
                         col_types = "text"
  )
  
  # reset "fix" for old/new PAH docs before filtering
  study_id <- gsub("PAHO", "PAH", study_id)
  
  if (fully_parse) result <- meta[meta$`STUDY NAME` %in% study_id, ]
  
  # auto-catch the pressence/absence of '-1' run suffix
  if (nrow(result) == 0) {
    if (!grepl("-\\d$", study_id)) {
      study_id2 <- paste0(study_id, "-1")
      message(glue::glue('No runs matchs{study_id}, trying {study_id2}...'))
      result <- meta[meta$`STUDY NAME` %in% study_id2, ]
    }
    if (grepl("-\\d$", study_id)) {
      study_id2 <- gsub("-\\d+$", "", study_id)
      message(glue::glue('No runs matchs{study_id}, trying {study_id2}...'))
      result <- meta[meta$`STUDY NAME` %in% study_id2, ]
    }
  }
  
  # drop empty columns
  allNA_ix <- purrr::map_lgl(result, ~ sum(is.na(.)) == nrow(result))
  result <- result[!allNA_ix]
  return(result)
}
#' Download raw data and layout files for a RNO experiment
#'
#' @md
#' @param plate_ids Character vector of plate identifiers (ex. 100000748)
#' @param outdir File path to local destination for downloaded file.
#' @param layout Boolean. Should the assay layout file be downloaded along with the PureHoney raw files?
#' @param overwrite Boolean. Should existing versions on Box be overwritten.
#' Default is `FALSE`, i.e. don't overwrite.
#' @details Reccommend to use the development version of `boxr`, use `devtools::install_github(brenden-r/boxr)`.
#' @examples
#' \dontrun{
#' box_dl_pure_honey("761")
#'  }
#' @importFrom boxr box_ls box_dl
#' @importFrom purrr map_df
#' @importFrom dplyr filter select slice bind_rows
#' @importFrom assertthat assert_that
#' @export
box_dl_pure_honey <- function (
  plate_ids = 2276:2283,
  outdir = getwd(), overwrite = FALSE, layout = TRUE) 
{
  plate_ids %<>% sprintf("%08d", .) %>% paste0("1", .)
  raw_data_box_path <- "All Files/PureHoney-HS Shared Folder/Raw Data"
  raw_data_files <- purrr::map_df(plate_ids, ~ boxr::box_search(.) %>%
                                    as.data.frame()) %>%
    dplyr::filter(grepl('\\.csv', name), path == raw_data_box_path) %>%
    dplyr::select(name, type, id, modified_at)
  
  possible_matches <- purrr::map_df(plate_ids, ~dplyr::filter(raw_data_files, 
                                                              grepl(., name))) %>% dplyr::select(name, type, id, modified_at)
  if (layout) {
    coa_box_dir <- 6203931357
    layout_files <- boxr::box_ls(dir_id = coa_box_dir) %>% 
      as.data.frame()
    possible_layout_dirs <- purrr::map_df(plate_ids, ~dplyr::filter(layout_files, 
                                                                    grepl(., name))) %>% unique()
    if (nrow(possible_layout_dirs) > 0) {
      layout_file <- boxr::box_ls(possible_layout_dirs$id) %>% 
        as.data.frame() %>%
        dplyr::filter(grepl("xlsx$", name)) %>%
        dplyr::select(name, type, id, modified_at) %>% 
        dplyr::slice(1)
      possible_matches %<>% dplyr::bind_rows(layout_file)
    } else {
      stop("No layout files found, consider running with `layout = FALSE`")
    }
  }
  assertthat::assert_that(nrow(possible_matches) > 0, msg = "No matching files :(")
  cat("The following files matched the specified plate identifiers: \\n")
  print(possible_matches)
  user_response <- readline(prompt = paste0("Download the files to ", 
                                            outdir, "? (y/N)"))
  if (trimws(user_response) == "y") {
    for (i in 1:length(possible_matches$id)) {
      cat("\\nDownloading ", possible_matches$name[i], 
          "...")
      try(boxr::box_dl(file_id = possible_matches$id[i], 
                       local_dir = outdir, overwrite = overwrite))
    }
  }
  if (layout) {
    as.numeric(possible_layout_dirs$id)
  }
  else {
    invisible(NULL)
  }
}


#' The color palette for isotopes in PureHoney experiments
#' Colors '13C' blue, '12C' grey and 'deuterated' green.
#' @md
#' @return Named character vector for use with `scale_color_manual()`
#' @importFrom stats setNames
#' @export
ph_color_pal <- function() {
  c("blue", "grey", "forestgreen") %>%
    setNames(c("13C", "12C", "deuterated"))
}

#' The fill palette for isotopes in PureHoney experiments
#' Colors '13C' blue, '12C' grey and 'deuterated' green.
#' @md
#' @return Named character vector for use with `scale_fill_manual()`
#' @importFrom stats setNames
#' @export
ph_fill_pal <- function() {
  c("navyblue", "grey20", "darkgreen") %>%
    setNames(c("13C", "12C", "deuterated"))
}

#' PureHoney helper function
#'
#' Tries to match standard curve analytes to the paired analytes without a standard curve of their own. i.e. matching 12C-Isobutyrl with D6-Isobutyryl
#'
#' @param targets Vector of targets present in the analysis.
#' @param standard_prefix Regex expression to identify curves that have standard curves.
#' @param drop_acetyl Boolean. Should Acetyl related curve pairs be removed.
#' Usually both standards for 12C-Acetyl-CoA and 13C-Acetyl-CoA and in that case your don't want to curve match them.
#' @examples
#' \dontrun{
#' # purehoney data
#' curve_matcher(ph_data$targ, "D")
#' curve_matcher(ph_data$targ, "13C")
#' }
#' 
#' @export
curve_matcher <- function(targets, standard_prefix = "D\\d", drop_acetyl = TRUE) {
  unique_targs <- unique(targets)
  curve_targs <- unique_targs[grepl(paste0("^(", standard_prefix, ")"), unique_targs)]
  non_curve_targs <- setdiff(unique_targs, curve_targs)
  matched <- curve_targs[match(gsub("^[^-]*-", "\\1", non_curve_targs),
                               gsub("^[^-]*-", "\\1", curve_targs))]
  curve_pairs <- matched %>% setNames(non_curve_targs)

  if (drop_acetyl) {
    curve_pairs %<>% .[!grepl("Acetyl", ., ignore.case = TRUE)]
  }

  curve_pairs[!is.na(curve_pairs)]
}

#' PureHoney helper function
#'
#' Applies a named vector of curve matches (ususally the output of assayr2::curve_matcher()) to data to add the relevant curve id columns
#'
#' @md
#' @param data Date being anlyzed, typically from `assayr2::read_ph_raw()`.
#' @param curve_pairs Named chr vector, with curves analytes as values and matched non-curve analytes as names
#' @param plate_id A numeric or character representing the plate_id of the standards.
#' Useful if standards are read on one plate, but used for multiple sample plates.
#' @param prun A character representing the plate run of the standards.
#' Useful if standard are read on one read, but used for multiple sample reads.
#' @examples
#' \dontrun{
#' # if no special matching is required (all plate reads have dedicated curves)
#' ph_data %>% apply_curve_pairs(curve_pairs)
#' 
#' # if targets need to be matched to curves on another read on the same plate
#' ph_data %>% apply_curve_pairs(curve_pairs, prun = "Std_read")
#' 
#' # if targets need to be matched to curves on another plate
#' ph_data %>% apply_curve_pairs(curve_pairs,  plate_id = 2222)
#' }
#' @importFrom dplyr mutate
#' @export
apply_curve_matches <- function(data, curve_pairs, plate_id = NULL, prun = NULL) {
  data %<>% dplyr::mutate(curve_id = ifelse(target %in% names(curve_pairs),
    curve_pairs[target], target
  ))
  data$curve_id %<>% ifelse(is.na(data$curve_id), data$target, .)

  # make curve_id unique for all plate runs
  if (is.null(plate_id)) {plate_id <- data$plate_id}
  if (is.null(prun)) {prun <- data$prun}
  data$plate_run <- paste(plate_id, prun, sep = "_")
  data$curve_id <- paste(data$plate_run, data$curve_id, sep = "_")
  data
}

#' PureHoney helper function
#'
#' Joins a data frame of nucleus counts to a PureHoney data frame and performs
#' standard nuclear normalization calculations to determine intracellular concentraion.
#'
#' @param data PureHoney data frame, with the columns 'sample_id' and 'conc_corrected'.
#' @param nuc_data Nucleus count data frame, with the columns 'sample_id' and 'nuc_count'.
#' @examples
#' \dontrun{
#' data %<>% nuc_join(nuc_data)
#' }
#' @importFrom dplyr inner_join mutate
#' @export
nuc_join <- function(data, nuc_data) {
  dplyr::inner_join(data, nuc_data) %>%
    dplyr::mutate(
      cell_volume_well_L = nuc_well * (3.4 * 10^-12),
      conc_nmols = conc_corrected * (.0001 + cell_volume_well_L), # conc_corrected in nM want to get at # of mols in 100uL reaction volume (reaction buffer + cell volume)
      conc_incell_nM = conc_nmols / cell_volume_well_L,
      conc_incell_uM = conc_incell_nM / 1000
    )
}

#' A wrapper for saving PureHoney data for used in the Shiny app, HST-Explorer
#'
#' Saves RDSs in the required format for use in `rds_checker.R`.
#' @md
#' @param data A data.frame with analyzed PureHoney data, ready to be QC'd for HemoShine.
#' @param path A valid file path to a directory for RDS outputs.
#' @param cols_split These columns will be combined to create `tx_run`, the unique ID in HST Explorer.
#' Deviating from the default may cause unexpected behavior.
#' @importFrom dplyr mutate
#' @importFrom purrr map2
#' @importFrom tidyr unite
#' @export
save_explorer_rds <- function(data, path = getwd(), cols_split = c("donor", "tx_cmpd", "tx_challenge", "run")) {
  # add new column, tx_challenge_abstract, used for selection widget in HST-Explorer
  data %<>% dplyr::mutate(
    tx_challenge_abstract = gsub("(_|,)? ?\\d+(u|m)M", "", tx_challenge) %>%
      gsub("\\+H00.*", "", .) %>%
      gsub(" ", "", .),
    # Add columns required for QC check
    qc = FALSE,
    qc_warning = ""
  ) %>%
    tidyr::unite(tx_run, !!cols_split, remove = FALSE) # build unique split var

  lst <- data %>% split(.$tx_run)
  purrr::map2(lst, names(lst), ~saveRDS(., paste0(path, "/", .y, ".RDS")))
}

#' Compute the C3/C2 ratio on Carntine data from PureHoney
#' 
#' Check a data frame for required information and return the orginal + newly
#' computed values.
#' @param data A data frame of PureHoney data. Must contain the columns:
#' "plate_id", "prun", "row", "column", "sample_id", "target", and "conc_corrected".
#' @param propionyl_species A character, either '12C' (default) or '13C' denoting the
#' species of Propionyl-Carnitine to use in calculating the C3/C2 ratio.
#' @examples
#' \dontrun{
#' data %<>% make_c3c2()
#' }
#' @importFrom dplyr ungroup group_by count filter select mutate distinct inner_join bind_rows
#' @importFrom tidyselect starts_with matches
#' @importFrom tidyr spread
#' @importFrom rlang ensym
#' @importFrom magrittr %<>%
#' @export
make_c3c2 <- function(data, propionyl_species = "12C") {
  
  # check propionyl_species is valid
  if (!propionyl_species %in% c("12C", "13C")) {
    stop("Only '12C' or '13C' is allowed as 'propionyl_species'.")
  }
  propionyl <- paste0(propionyl_species, "-Propionyl-Car")
  # check required fields are present
  req_cols <- c("plate_id", "prun", "row", "column", "sample_id", "target", "conc_corrected")
  if (!all(req_cols %in% names(data))) {
    missing_cols <- req_cols[!req_cols %in% names(data)]
    stop(paste0("Missing required columns: ", paste(missing_cols, collapse = ", ")))
  }
  # check required analytes are detected
  req_analytes <- c(propionyl, "12C-Acetyl-Car")
  if (!all(req_analytes %in% unique(data$target))) {
    missing_analytes <- req_analytes[!req_analytes %in% unique(data$target)]
    stop(paste0("Missing required analytes: ", paste(missing_analytes, collapse = ", ")))
  }
  # check sample_ids are valid
  if (any(is.na(data$sample_id))) {
    stop("NAs found for in data$sample_id. Double check that sample IDs are valid")
  }
  
  ratios <- data %>% 
    filter(target %in% c("12C-Acetyl-Car", propionyl)) %>%
    select(plate_id, prun, sample_id, row, column, target, conc_corrected)
  
  # check coordinates identify a unique sample; required for spreading later
  check <- ratios %>% 
    ungroup() %>% 
    group_by(plate_id, prun, row, column, sample_id) %>%
    count()
  if(!all(check$n == 2)) {
    stop("All combinations of columns, plate_id, row, column, and sample_id
         are not unique. Double check that each unique sample contains measurements
         for '12C-Acetyl-Car' and '", propionyl, "'.")
  }
  
  ratios %<>%
    spread(target, conc_corrected) %>% 
    mutate(conc_corrected = !!ensym(propionyl) / `12C-Acetyl-Car`) %>% 
    select(-starts_with("1")) %>% 
    mutate(target = paste0(propionyl_species, "-C3-C2-ratio-Car"),
           curve_plot = "C3-C2-ratio-Car",
           isotope = propionyl_species,
           c_bool = FALSE)
  
  # add new ratio values onto original data
  select(data, -target, -matches("^curve_"), -c_bool, -isotope, -raw, -matches("^conc"), -matches("^log"), - llod_conc) %>% 
    distinct() %>% 
    inner_join(ratios, ., by = c("plate_id", "prun", "sample_id", "row", "column")) %>% 
    fct_sync(data) %>% 
    bind_rows(data)
}


#' Estimate concentration in tissue
#' 
#' @param conc_nm Recovery-adjusted concentration from Pure Honey (nanomolar)
#' @param w_mg Weight of compound administered (milligrams)
#' @return Estimated concentration (micromolar) in tissue (numeric vector)
#' @export
get_tissue_conc <- function(conc_nm, w_mg) {
  3750 / w_mg * conc_nm / 1000
}