R/readers.R

Defines functions read_generic

Documented in read_generic

# Functions related to reading in data

#' Universal assayr2 function template to read in tables.
#'
#' Wrapper around file format specific read-in patterns.
#' @md
#' @param file `character` path to file or integer of Box folder ID
#' @param rows `integer` defining rows to be read. Only used for XLSX files.
#' @param cols `integer` or `character` defining colums to be read. Only used for XLSX files.
#' @param col_names `logical` should the first non-empty row be converted to column names.
#' @param dropNA `logical` should columns with all NA value be dropped automatically, if `TRUE`
#' (default) will emit at warning in the console.
#' @param snake_names `logical` should column names be converted to snake_case
#' @param ... Arguments passed on to `read.table()`, `openxlsx::read.xlsx()` or `boxr::box_read()`.
#' @return A dataframe without factors.
#' @importFrom utils read.table count.fields
#' @importFrom boxr box_read
#' @importFrom openxlsx read.xlsx
#' @importFrom glue glue
#' @importFrom dplyr mutate_all
#' @export
read_generic <- function(file, rows = NULL, cols = NULL, col_names = FALSE,
                         dropNA = TRUE, snake_names = TRUE, ...) {
  if (is.numeric(file)) {
    out <- boxr::box_read(file, ...)
  }

  else {
    file_type <- gsub(".*\\.(.*)$", "\\1", file) %>% tolower()

    if (file_type == "txt") {
      out_ncol_max <- max(count.fields(file = file, sep = "\t"))
      out <- read.table(
        file = file, header = col_names, sep = "\t", fill = TRUE,
        col.names = 1:out_ncol_max,
        stringsAsFactors = FALSE, ...
      )
    }

    else if (file_type == "csv") {
      out_ncol_max <- max(count.fields(file = file, sep = ","), na.rm = T)
      out <- read.table(
        file = file, header = col_names, sep = ",", fill = TRUE,
        col.names = 1:out_ncol_max,
        stringsAsFactors = FALSE, ...
      )
    }
    else if (file_type == "xlsx") {
      if (is.character(cols)) {
        col_idx <- chr_to_int(cols)
      }
      else {
        col_idx <- cols
      }
      out <- openxlsx::read.xlsx(file,
        colNames = col_names, cols = col_idx,
        rows = rows, ...
      )
    }
    else {
      stop("Invalid file type: is not supported. Only TXT, CSV, XLSX are currently,
              open an issue on github.com/hemoshear/assayr2.")
    }
  }
  if (dropNA) {
    na_indexs <- names(out)[sapply(out, function(x) all(is.na(x)))]
    if (length(na_indexs) > 0) {
      message(glue::glue('The columns: {paste(na_indexs, collapse= ", ")} contained only NA values and were dropped.'))
      out <- out[!names(out) %in% na_indexs]
    }
  }
  # temp fix for dplyr #4094 related to tibble #566
  names(out) %<>% gsub("^\\.+", "X", .)
  
  if (snake_names) {
    names(out) %<>% to_snake_case()
  }
  # dplyr::mutate_all(out, as.character)
  out
}

#' Parser for Molecular Devices scope output
#'
#' Extracts meta information from table headers and combines individual scope
#' aquisition tables into a single unified data frame.
#' @md
#' @param file A valid path to a .txt or .xlsx file with Molecular Devices scope output.
#' @param measurements Default is NULL, which will auto-detect measurements
#' present and messsage the user in terminal. Alternativly, provide a character
#' vector of regex ready (special characters must be escaped) patterns and only
#' matching measurements will be returned.
#' @param verbose Boolean, should information about parsed columns be printed to the console?
#' @param plate_capture Character, regex capture expression to grab plate identifier
#' from the field "Acquisition Name \[Plate Info\]". Default grabs digits followed by optional "b".
#' The `gsub()` call this feeds into is case insensitive.
#' @param ... Arguments passed to `read_generic()`
#' @return A data frame with columns for run, plate, well, site and measurments.
#' @section Warning:
#' This function will not work with a `file = $BoxFileID`, due to `rio::import()`'s
#' behavior which is baked into `boxr`. So `file` must be a local path.
#' @examples
#' moldev_smaa <- system.file("extdata", "SMAA_Moldev_example.txt", package = "assayr2")
#' smaa <- read_moldev(moldev_smaa)
#' 
#' moldev_nilered <- system.file("extdata", "NileRed_Moldev_example.txt", package = "assayr2")
#' nilered <- read_moldev(moldev_nilered)
#' @importFrom dplyr mutate_all select everything mutate bind_rows
#' @importFrom purrr map
#' @export
read_moldev <- function (file, measurements = NULL, verbose = TRUE, plate_capture = ".*Plate ?(\\d*|\\d*b).*", ...) 
{
  output <- read_generic(file)
  output %<>% mutate_all(~gsub("ATF$", "", .))
  meta_info <- grep(".*Name \\[Plate Info\\].*", unlist(output[, 
                                                               1]), value = TRUE) %>% unname()
  run_names <- gsub(".*=([A-Z]{3}[0-9]{4}(-?|-\\d+)).*", "\\1", 
                    meta_info) %>% gsub("-$", "", .)
  plate_nums <- gsub(plate_capture, "\\1", meta_info, 
                     ignore.case = T)
  plate_starts <- grep("(Well Name|Plate ID)", unlist(output[, 
                                                             1])) + 1
  plate_stops <- grep("^\\d$", unlist(output[, 1])) - 1
  plate_stops <- plate_stops[-1]
  plate_stops <- c(plate_stops, nrow(output))
  id_row <- droplevels(output[(plate_starts[1] - 1), ]) %>% 
    unlist() %>% na.omit()
  names(id_row) <- 1:length(id_row)
  always_ids <- c("Plate ID", "Well Name", "Site ID", "MEASUREMENT SET ID")
  if (is.null(measurements)) {
    measure_ids <- id_row[!id_row %in% always_ids]
    measurement_decoder <- c(`Nuclear Count (Transfluor)` = "nuc_count", 
                             `Vesicle Integrated Intensity (Transfluor)` = "nile_intensity", 
                             `Cell: smaa area sum (Custom Module)` = "smaa_area", 
                             `Cell: integrated int sum (Custom Module)` = "smaa_intensity", 
                             `Vesicle Count (Transfluor)` = "mac_count", `Cell: Average_Nuc_Size (Custom Module)` = "avg_nuc_size", 
                             `Cell: Live_Nuc_Count (Custom Module)` = "live_nuc_count", 
                             `Cell: Dead_Nuc_Count (Custom Module)` = "dead_nuc_count", 
                             `Cell: Total_Nuc_Count (Custom Module)` = "total_nuc_count", 
                             `Cell: stell  nuc count (Custom Module)` = "stellate_count", 
                             `Cell: mac nuc count (Custom Module)` = "mac_count", 
                             `Cell: total nuc count (Custom Module)` = "total_nuc_count")
    if (verbose) {
      message("Parsing measurements: ", paste(ifelse(measure_ids %in% 
                                                       names(measurement_decoder), measurement_decoder[measure_ids], 
                                                     measure_ids), collapse = ", "))
    }
    measures <- names(measure_ids) %>% as.numeric()
  }
  else {
    measures <- purrr::map(measurements, ~grep(., id_row, 
                                               ignore.case = TRUE)) %>% unlist() %>% unique() %>% 
      as.numeric()
  }
  well_name <- grep("Well Name", id_row)
  site_id <- grep("Site ID", id_row)
  columns_we_want <- c(well_name, site_id, measures) %>% as.numeric()
  c_names <- id_row[columns_we_want] %>% ifelse(. %in% names(measurement_decoder), 
                                                measurement_decoder[.], .) %>% gsub(" .*", "", .) %>% 
    to_snake_case()
  plates <- list()
  for (p in 1:length(plate_starts)) {
    p_df <- output[plate_starts[p]:plate_stops[p], ]
    p_df <- p_df[, columns_we_want]
    colnames(p_df) <- c_names
    p_df$plate <- plate_nums[p]
    p_df$run <- run_names[p]
    p_df %<>% dplyr::select(run, plate, dplyr::everything())
    if (p_df[1, 3] != "6") {
      plates[[p]] <- p_df
    }
  }
  plates %>% bind_rows() %>% mutate_at(measures, as.numeric) %>% 
    mutate(plate = gsub("^0+", "", plate))
}

#' Extract sample identifiers from a file with discards from imaging.
#'
#' Currently only workds with CSVs, could be extended to XLSX
#' @md
#' @param file A path to a CSV file with discards.
#' @param separate_uid Boolean. Should the UID from the scope be split into 3 separate
#' columns (plate, well, site).
#' @return A data frame.
#' @param ... Arguments passed to `read_generic()`
#' @importFrom stats setNames
#' @importFrom tidyr separate
#' @importFrom dplyr mutate
#' @examples
#' \dontrun{
#' # scope data
#' scope_discards <- read_discards(367357642382) # a file on Box
#' }
#' @export
read_discards <- function(file, separate_uid = TRUE, ...) {
  out <- read_generic(file, col_names = TRUE, ...)[2] %>%
    setNames(c("uid")) %>%
    separate(uid, c("run", "uid"), sep = "late") %>%
    dplyr::mutate(
      run = gsub("([A-Z]{3}[0-9]{4}(-?|-\\d+)).*", "\\1", run),
      uid = gsub("s(\\d+)\\.tif", "\\1", uid)
    )

  if (separate_uid) {
    out %<>% separate(uid, c("plate", "well", "site")) %>%
      mutate(plate = gsub("^0+", "", plate))
  }
  out
}

#' Generic parser for data from the Omega FLUOStar plate analyzer
#'
#' Versatile parser that orders standard columns (ie raw, blank_raw, machine_fit, etc)
#' automatically for \code{rbind()}-ing. Verbose with warnings, returns a tibble.
#' @param file Path to a BMG FLUOstar Omega plate-reader output xlsx.
#' @param plate_capture A regex pattern with a single capture for extracting plate id from file name.
#' @param std_concs A numeric vector of concentrations in descending order. Must be same length as standards
#' in the FLUOstar output file.
#' @param measurements A character specifying the measrements to be extracted from
#' the FLUOstar output file. If `NULL` (the default), assumes all columns that are not
#' 'Well Row', 'Well Col' and 'Content' are to be parsed as measurments.
#' @param verbose Boolean, should information about parsed columns be printed to the console?
#' @param ... Arguments passed to `read_generic()` 
#' @examples
#' \dontrun{
#' # example data in assayr2
#' fluostar_file <- system.file("extdata", "FLUOstar_ALbumin_example.xlsx", package = "assayr2")
#' fluostar_data <- read_fluostar(fluostar_file, ".*_(.*)\\..*")
#' }
#' @importFrom stats setNames
#' @importFrom purrr map
#' @importFrom glue glue
#' @importFrom dplyr filter full_join mutate mutate_at select rename
#' @export
read_fluostar <- function(file, plate_capture = "", std_concs = NULL,
                          measurements = NULL, verbose = TRUE, ...) {
  df <- read_generic(file, ...)

  # Parse columns of interest and their names
  id_row_pos <- grep("^Well", df[, 1])
  id_names <- unlist(df[id_row_pos, ]) %>%
    setNames(1:length(.)) %>%
    to_snake_case() %>%
    gsub("well_", "", .)
  # could add more gsub()s here to clean up colnames more
  
  # flag values that are always present
  always_ids <- c("row", "col", "content")
  # assume other values are measuements if measurements = NULL
  if (is.null(measurements)) {
    measure_ids <- id_names[!id_names %in% always_ids]
    # could add key-value pairs here to parse names to specified alternatives
    if (verbose) {
      message("Parsing measurements: ", paste(measure_ids, collapse = ", "))
    }
    measures_col_pos <- names(measure_ids) %>% as.numeric()
  } else {
    measurements <- to_snake_case(measurements)
    measures_col_pos <- purrr::map(measurements, ~grep(., id_names, ignore.case = TRUE))
    
    # handle missing matches with a message to user
    measures_missing_a_match <- sapply(measures_col_pos, function(x) length(x) == 0)
    if (any(measures_missing_a_match)) {
      message(glue::glue("The measurements:\n{paste(measurements[measures_missing_a_match], collapse = '\n')}\nDon't match any names in FLUOstar output :("))
    }
    measures_col_pos <- measures_col_pos %>%
      unlist() %>%
      unique()
  }
  
  # set up 
  columns_we_want <- c(names(id_names[id_names %in% always_ids]), measures_col_pos) %>%
    as.numeric()
  
  column_names <- id_names[columns_we_want]
    
  df <- df[(id_row_pos + 1):nrow(df), columns_we_want] %>%
    setNames(column_names)

  # handle cases on missing std_concentations
  if (nrow(dplyr::filter(df, grepl("Standard", content))) > 0 &&
      sum(grepl("standard_concentrations", names(df))) == 0 &&
      is.null(std_concs)) {
    
    exp_concs <- df[grepl("Standard", df$content), ]$content %>%
      unique() %>%
      length()
    
    stop(paste0("No standard concentrations detected. Please provide a numeric vector (length ", exp_concs, ") to the argument 'std_concs'"))
  }

  if (nrow(dplyr::filter(df, grepl("Standard", content))) > 0 &&
      sum(grepl("standard_concentrations", names(df))) == 0 &&
      !is.null(std_concs)) {
    
    missing <- df[grepl("Standard", df$content), ]$content %>%
      unique()
    
    df <- dplyr::full_join(df, data.frame(content = missing, std_conc = std_concs, stringsAsFactors = FALSE))
  }
  
  df %<>% dplyr::mutate_at(.vars = measures_col_pos, .funs = as.numeric)
  
  # warn if saturated measurements are detected
  main_measure_id <- measure_ids[which.max(grepl("raw", measure_ids))]
  if (any(df[[main_measure_id]] == 3.5)) {
    warning("Saturated values detected in: ", main_measure_id)
  }
  
  df %>%
    dplyr::mutate(plate = gsub(plate_capture, "\\1", file, ignore.case = TRUE)) %>% 
    dplyr::rename(column = col) %>% 
    dplyr::select(plate, dplyr::everything())
}

#' Generic parser for Biorad CFX thermocycler
#'
#' Reads in the defualt output csv from CFX Manager 3.1.1621.0826 as a tibble.
#' @md
#' @param file Path to a BioRad CFX output CSV
#' @param plate_capture A regex pattern with a single capture for extracting plate id from file name.
#' @return A data frame with 3 columns (plate, well, cq)
#' @examples
#' # example data in assayr2
#' pcr_file <- system.file("extdata", "CFX_example.csv", package = "assayr2")
#' pcr_data <- read_cfx(pcr_file)
#' @importFrom dplyr mutate select
#' @export
read_cfx <- function(file, plate_capture = "") {
  read_generic(file) %>%
    .[, 1:2] %>%
    setNames(c("well", "cq")) %>%
    .[(grep("Well$", .$well) + 1):nrow(.), ] %>%
    dplyr::mutate(
      plate = gsub(plate_capture, "\\1", file),
      cq = as.numeric(cq),
      well = gsub("(\\D)0+", "\\1", well)
    ) %>%
    dplyr::select(plate, well, cq) # re-order columns
}
#' Generic parser for PureHoney Assay layouts
#'
#' High level wrapper function that that imports the assay layout for PureHoney as a tibble.
#' Uses fixed offset from document markers to capture specific regions, prone to errors if assay document template is altered.
#' @md
#' @param file Path to .xlsx file with PureHoney assay layout.
#' @param plate_ids Optional character vector with plate numbers to extract.
#' Defaults to NULL, which will extract all plate ids present.
#' @param meta_cols Numeric. Defines the columns to extract from metadata table.
#' Default is `NULL` for all columns.
#' @param meta_names Character. Column names, length must match `legnth(meta_cols)`.`
#' @param skip Number of rows to ignore when looking for plates. Useful for avoiding standard curve references in the header.
#' @param ... Arguments passed to `read_generic()`
#' @examples
#' ph_meta <- system.file("extdata", "PH_Assay_example.xlsx", package = "assayr2")
#' meta <- read_ph_assay(ph_meta)
#' @importFrom purrr map_chr map_dfr map2_dfr
#' @importFrom dplyr cumall filter mutate full_join
#' @importFrom tidyr replace_na
#' @importFrom tibble enframe
#' @export
read_ph_assay <- function(file, plate_ids = NULL, meta_cols = NULL,
                          meta_names = NULL, skip = 15, ...) {
  read <- read_generic(file, ...) %>%
    .[-c(1:skip), ]

  plates <- grep("^1\\d{8}$", read[, 1], value = T) %>%
    gsub("10*([1-9]\\d+)($| .*)", "\\1", .) %>%
    unique()

  lays <- grep("^A$", read[, 1]) %>% setNames(., plates[1:length(.)])

  metas <- grep("^Sample", read[, 1]) %>% setNames(., plates[1:length(.)])

  # catch dilution factor (if it is in the specific cell)
  dils <- map_chr(metas, ~read[(. - 1), 2]) %>%
    as.numeric() %>%
    replace_na(1) %>%
    setNames(., plates[1:length(.)])

  if (!is.null(plate_ids)) {
    lays %<>% .[names(lays) %in% plate_ids]
    metas %<>% .[names(metas) %in% plate_ids]
    dils %<>% .[names(dils) %in% plate_ids]
  }

  layout <- purrr::map_dfr(lays, ~melt_plate(read[c(.:(. + 7)), 2:13]), .id = "plate_id")

  if (is.null(meta_cols)) {
    meta_names <- read[metas[1], ] %>%
      unlist() %>%
      tolower() %>%
      unname() %>%
      gsub(" ", "_", .) %>%
      gsub("sample_#", "content", .) %>%
      replace_na("drop")

    meta <- purrr::map_dfr(metas, ~read[c((. + 1):(. + 85)), ] %>%
      filter(dplyr::cumall(!is.na(.$x_1))), .id = "plate_id") %>%
      setNames(c("plate_id", meta_names))
  } else {
    meta <- map_dfr(metas, ~read[c((. + 1):(. + 85)), meta_cols] %>%
      filter(cumall(!is.na(.$x_1))), .id = "plate_id") %>%
      setNames(c("plate_id", meta_names))
  }


  # drop columns if all values are NAs
  allNA_ix <- purrr::map_lgl(meta, ~sum(is.na(.)) == nrow(meta))
  meta <- meta[!allNA_ix]

  dil <- tibble::enframe(dils) %>%
    setNames(c("plate_id", "dilution_factor"))

  meta %<>% full_join(layout, by = c("plate_id", "content")) %>%
    full_join(dil, by = "plate_id") %>%
    filter(!is.na(row))

  # little prep tweaks
  meta %>% mutate(content = gsub("BLANK", "blank", content, ignore.case = TRUE))
}


#' PureHoney Assay Result Reader
#'
#' High level wrapper function that that imports the raw data output from PureHoney as a tibble.
#' @md
#' @param file Path to .csv file with PureHoney raw output.
#' @param plate_capture A single regex capture expression extracting plate id from file name.
#' @param prun_capture A single regex capture expression for plate run information.
#' @return A long data frame containing all of the analyte reads in the plate.
#' @param ... Arguments passed to `read_generic()`.
#' @examples
#' ph_raw <- system.file("extdata", "PH_raw_example.csv", package = "assayr2")
#' ph_data <- read_ph_raw(ph_raw)
#' @importFrom dplyr rename mutate bind_rows
#' @export
read_ph_raw <- function(file, plate_capture = ".*10+([1-9]\\d{3,5}).*", prun_capture = ".*[[:alpha:]]\\.csv", ...) {
  output <- read_generic(file, ...)

  species_ids <- grep("XIC", output[, 1], value = T) %>% gsub("XIC = xic-", "", .)
  plate_starts <- grep("XIC", output[, 1]) + 2
  plate_stops <- plate_starts + 7

  data <- list()

  for (c in 1:length(species_ids)) {
    data[[c]] <- output[c(plate_starts[c]:plate_stops[c]), 2:13] %>%
      melt_plate() %>%
      dplyr::mutate(
        target = species_ids[c],
        plate_id = file
      ) %>%
      dplyr::rename(raw = content)
  }

  data %>%
    bind_rows() %>%
    mutate(
      raw = as.numeric(raw),
      log10_raw = log10(raw),
      prun = gsub(prun_capture, "\\1", plate_id),
      plate_id = gsub(plate_capture, "\\1", plate_id)
    )
}

#' Parser for MagPix csv outputs
#'
#' Convenience wrapper for importing a MagPix xPONENT analysis file as a tibble.
#' @md
#' @param file A valid path to a .csv file with MagPix xPONENT analysis.
#' @param value A character vector specifying the value to extract.
#' Must be one of: Median, Net MFI, Count, Result, Avg Net MFI, Avg Result.
#' @param ... Arguments passed to `read_generic()`.
#' @return A list with two elements: one data frame with sample values and one data
#' frame with standard values.
#' @examples
#' magpix_raw <- system.file("extdata", "Magpix_example.csv", package = "assayr2")
#' magpix <- read_magpix(magpix_raw)
#' @importFrom dplyr select rename full_join group_by summarise filter
#' @importFrom stats setNames
#' @importFrom tidyr gather
#' @export
read_magpix <- function(file, value = "Net MFI", ...) {
  
  f <- read_generic(file, ...)
  
  # positional anchors
  anchors <- grep("^(DataType|-- CRC --)", f[,1]) %>% stats::setNames(., f[,2][.])

  # value
  start <- anchors[value] + 1
  stop <- anchors[which((anchors - start) > 0)][1] - 1 # next anchors value
  vals <- f[start:stop, ] %>% setNames(., .[1, ]) %>% .[-1, ] # first row to col_names
  vals %<>% .[, colSums(vals != "") != 0] # drop empty columns
  vals %<>% select(-`Total Events`) # drop `Total Events`
  vals$Location %<>% gsub(".*,(.*)\\)", "\\1", .) # clean to well_id
  vals %<>% gather("target", "concentration", -(Location:Sample)) %>% # go long
    rename(content = Sample, well = Location) # clean up

  colnames(vals)[4] <- "intensity"
  suppressWarnings(vals[, 4] %<>% as.numeric())

  # warnings
  start <- anchors["Warnings/Errors"] + 1
  stop <- anchors[which((anchors - start) > 0)][1] - 1
  if (start - stop > 0) {
    wrngs <- f[start:stop, ] %>% magrittr::set_names(., .[1, ]) %>% .[-1, ]
    wrngs %<>% .[, colSums(wrngs != "") != 0]
    wrngs$Location %<>% gsub(".*,", "", .)
    names(wrngs) %<>% tolower()
    wrngs %<>% dplyr::rename(well = location)

    if (nrow(wrngs) > 0) { # join if warning are present
      vals %<>% dplyr::full_join(wrngs)
    }
  }

  # std_conc
  start <- anchors["Standard Expected Concentration"] + 1
  stop <- anchors[which((anchors - start) > 0)][1] - 1
  stds <- f[start:stop, ] %>% setNames(., .[1, ]) %>% .[-1, ]
  stds %<>% .[, colSums(stds != "") != 0]
  stds %<>% gather("target", "concentration", -Reagent) %>%
    rename(content = Reagent)
  suppressWarnings(stds[, 3] %<>% as.numeric())
  validate_std_curve_conc <- function(d) {
    f <- group_by(d, target) %>%
      summarise(all_equal_conc = all(diff(concentration) == 0)) %>%
      filter(all_equal_conc)
    # return original dataframe if standard concentrations are ok
    if (nrow(f) == 0) {
      return(d)
    } else {
      warning(paste0(
        "Invalid standard concentrations for ",
        f$target,
        ", add known concentrations to data frame"
      ))
      return(d)
    }
  }

  stds %<>% validate_std_curve_conc()
  suppressWarnings(
    vals$log10_intensity <- log10(vals$intensity)
  )
  # return a named list
  list(vals, stds) %>%
    setNames(paste0(c("values_", "stds_"), file))
}

#' Melt a data frame from plate format to long format
#' @md
#' @param plt A data frame representing an assay plate.
#' @param .id The name of the well content column.
#' @return A melted (long) data frame.
#' @examples
#' plate <- as.data.frame(matrix(rnorm(96), nrow = 8))
#' melt_plate(plate)
#' @importFrom dplyr mutate
#' @importFrom tidyr gather
#' @importFrom stats setNames
#' @export
melt_plate <- function(plt, .id = "content") {
  if (is.null(.id) || is.na(.id)) .id <- "content"
  plt %>%
    setNames(1:ncol(.)) %>%
    mutate(row = LETTERS[1:nrow(.)]) %>%
    gather(key = "column", value = !!.id, -row)
}

#' Read xlsx data in plate format and convert to long format
#'
#' @description Read in plate format data from an xlsx file and return a melted
#' data frame with the contents and well coordinates.
#' @md
#' @param file A valid file path for the xlsx file.
#' @param rows A numeric vector of the rows to read
#' @param cols A vector of columns to be read. Can be integers or characters, see examples
#' @param col_names Boolean. Should the first non-empty row be converted to column names.
#' @param sheet The name or index of the sheet.
#' @param ... Arguments passed to `read_generic()`.
#' @param .id The name of the well content column.
#' @return A melted data frame.
#' @details A special case is when the first row of contents is empty.
#' The suggested pattern then is to include an extra row with column names from the Excel Sheet and set `col_names = TRUE`.
#' So for a a 8-row plate you would include a 9-row range, see examples. This is because of behavior in
#' the underlying function `openxlsx::read.xlsx()` that skips empty rows at the start
#' of a read-in region if the contents are blank.
#' @examples
#' \dontrun{
#' # generic assay with layouts in 96 well format
#' meta_file <- "data/plate_layouts.xlxs"
#'
#' # all equivilant
#' melt_plate_xlsx(meta_file, rows = 2:9, cols = 2:13, sheet = 1)
#' melt_plate_xlsx(meta_file, rows = 2:9, cols = LETTERS[2:13], sheet = 1)
#' melt_plate_xlsx(meta_file, rows = 2:9, cols = "B:M", sheet = 1)
#' 
#' # if the first row was empty
#' melt_plate_xlsx(meta_file, rows = 1:9, cols = "B:M", col_names = T, sheet = 1)
#' }
#' @export
melt_plate_xlsx <- function (file, rows, cols, sheet = 1, col_names = FALSE,  ..., .id = "content") {
  read_generic(file, rows = rows, cols = cols, sheet = sheet, col_names = col_names,
               dropNA = FALSE, skipEmptyRows = FALSE, ...) %>% 
    melt_plate(.id = .id)
}

#' A parser for plate maps
#'
#' Useful for reading in an Excel sheet full of plate layouts, originally built
#' for PCR layouts. Auto detects plate names, well contents, targets and
#' primer temperatures, if the layout is typical :)
#' @md
#' @param file A valid file path to layout of interest. Currently only supports
#' 'xlsx' files.
#' @param sheet Numeric. The sheet of interest in `pcr_file`.
#' @param skip Numeric. Number of rows to skip, always try to skip header meta
#' material, start at first plate.
#' @param arrange Character vector with elements being either "rows" or "cols"
#' (default), that describe how primers are layed out within plates. If
#' `length(arrange) == length(plate_layouts)`, then each layout will be handled
#' according to the matching `arrange` value of plate layouts.The `arrange` arg is not a magic bullet.
#' @param times Integer. Number of times to replicate targets.
#' @param ... Arguments passed to `read_generic()` 
#' @details Relies on finding a single plate identifier starting with the 'Plate' for each layout.
#' This function will handle standard layouts well, but abnormal layouts will require manual TLC, until a
#' color matching solution is ready (so like never).
#' @return A names list for each plate. Useful for fixing a-typical layouts manually.
#' @examples
#' \dontrun{
#' layout_file <- "path/to/my_layout.xlsx"
#' lay <- read_layouts(meta_file, sheet = 1) # returns list
#' # perhaps fix a specific abnormal plate by hand
#' lay %<>% bind_rows() # once every plate is ready
#' }
#' @importFrom purrr map map_lgl keep discard
#' @importFrom dplyr arrange mutate select
#' @importFrom tidyr unite
#' @export
read_layouts <- function(file, sheet = 1, skip = 1, arrange = "cols", times = NULL, ...) {
  f <- read_generic(file, sheet = sheet, startRow = skip, ...)

  # locate critical anchor columns ------
  all_cols <- purrr::map(f, unlist)
  # left edge of plates
  # start_col <- which(map_lgl(all_cols, ~ na.omit(.)[1] == 'a' | na.omit(.)[1] == 'A'))
  start_col <- which(map_lgl(all_cols, ~all(LETTERS[1:8] %in% .)))
  # meta-info:plate_name, temperature, targets (usually to the right of column12)
  plt_col <- which.max(map_lgl(all_cols, ~grepl("plate", na.omit(.)[1], ignore.case = TRUE)))

  # establish offsets ----
  # parser requires constant format (spacing) over plates
  row_offset <- which.max(all_cols[[start_col]] == "A")
  plt_offset <- which.max(!is.na(all_cols[[plt_col]]))
  shift <- abs(row_offset - plt_offset) # the $$$ piece

  plate_starts <- grep("^A$", f[, start_col], ignore.case = TRUE)

  res <- list()
  for (p in plate_starts) { # each plate_layout

    if (length(arrange) == length(plate_starts)) {
      names(arrange) <- plate_starts
      p_arrange <- dplyr::arrange[match(p, names(arrange))]
    } else {
      p_arrange <- arrange[1]
    }

    # check row for multiple plate_names within a plate_layout
    name <- grep("Plate", f[p - shift, ], value = TRUE, ignore.case = TRUE)
    names(name) <- NULL
    name_pos <- grep("Plate", f[p - shift, ], ignore.case = TRUE)
    num_plts <- length(name)

    # grab temperatures if they exist
    temps <- purrr::map(name_pos, ~f[ (p - (shift - 1)), .] %>%
      purrr::keep(~grepl("(\u00B0|\u00BA)", .))) %>%
      stats::setNames(name)

    # grab targets as everything thats not temps
    targs <- purrr::map(name_pos, ~f[ (p - (shift - 1)):(p + 7), .] %>%
      na.omit() %>%
      purrr::discard(~grepl("(\u00B0|\u00BA)", .))) %>%
      stats::setNames(name)

    # melt to long form
    plt_lay <- melt_plate(f[p:(p + 7), (start_col + 1):(start_col + 12)])

    for (n in name) { # each plate name within a plate_layout
      plt_lay$plate <- gsub("plate ?", "", n, ignore.case = TRUE)
      number_samples <- length(na.omit(unique(plt_lay$content)))

      # add targets by plate -----
      tmp <- unlist(targs[[n]])
      if (length(tmp) > 0) { # only try to add targets if present
        temp <- unlist(temps[[n]])
        if (length(temp) > 0) plt_lay$temp <- temp # add temperature
        # 5 targs fuck things up
        if (length(tmp) == 5) tmp %<>% c("") # add 6th empty target

        # + row wise ----
        if (p_arrange == "rows" | ((length(tmp) == 4) && (number_samples == 12))) {
          times <- 24 * ceiling(number_samples / 12)

          plt_lay %<>%
            dplyr::arrange(row, as.numeric(column)) %>%
            dplyr::mutate(target = rep(tmp, each = times, length.out = nrow(plt_lay)))
        }
        # + column wise ----
        else {
          times <- 16 * floor(number_samples / 8)
          if (length(tmp) == 2) times <- 48
          if (length(tmp) == 6) times <- 16
          if (times == 0) times <- 2

          plt_lay %<>%
            dplyr::arrange(as.numeric(column), row) %>%
            dplyr::mutate(target = rep(tmp, each = times, length.out = nrow(.)))
        }
      }
      # mild formatting
      res[[n]] <- plt_lay %>%
        dplyr::select(plate, tidyselect::everything()) %>%
        tidyr::unite(well, row, column, sep = "", remove = FALSE)
    }
  }
  res
}

#' Read in a device layout from Box
#'
#' Trim out the rectangle of critial device information, by using column names and NA positions
#' @md
#' @param file Integer or character. If integer must be a valid Box file ID. If
#' character must be a valid local path for a XLSX file.
#' @param sheet Integer for sheet to read in XLSX file.
#' @param ... Arguments passed to `read_generic()`.
#' @examples
#' \dontrun{
#' library(boxr)
#' box_auth()
#' device_finder("HEM0401-2") %>% read_device_layout()
#' }
#' @importFrom dplyr mutate
#' @importFrom purrr map_lgl
#' @export
read_device_layout <- function(file, sheet = 1, ...) {

  read <- read_generic(file, sheet = sheet, ...)

  # capture names if they exist
  run_name_pos <- grep("Project", read[[1]], ignore.case = TRUE)
  
  if (length(run_name_pos) > 0) {
    run_name <- gsub("Project:?", "", read[[1]][run_name_pos]) %>% trimws()
  }
  else if (!is.null(names(file))) {
    run_name <- names(file)
  }
  else {
    warning("No project codes detected, assigning project = NA")
    run_name <- NA
  }

  start_pos <- grep("Device", read[[1]], ignore.case = TRUE) # eventual colnames

  if (length(start_pos) == 0) {
    stop(paste(file, "doesn't look like device layout"))
  } else {
    stop_pos <- which.max(is.na(read[[1]])) - 1
    if (stop_pos == 0) {
      stop_pos <- nrow(read)
    }

    read %<>% .[(start_pos + 1):stop_pos, ] %>%
      setNames(read[start_pos, ]) %>%
      .[!is.na(names(.))]

    # to chop off bad columns on the right
    end_pos <- grep("Touching", names(read), ignore.case = TRUE)
    
    # to drop empty header rows
    first_column_mask <- !is.na(read[, 1])

    read %<>% .[1:end_pos] %>%
      .[first_column_mask, ] %>%
      dplyr::mutate(run = run_name)

    # drop columns where all values are NA
    allNA_idx <- purrr::map_lgl(read, ~sum(is.na(.)) == nrow(read))
    read[!allNA_idx]
  }
}

#' Read in SMAD quantification data post ImageJ analysis
#'
#' @md
#' @param file A txt file containing ImageJ analysis of SMAD staining.
#' @param uid_capture Regex capture expression for extracting UID (plate_well_site) from image names.
#' @param keys Character vector with names of measuements in analysis output. Default is `NULL`,
#' which will guess whether to use NPC or HEP keys based on number of measurements.
#' @param return_all Boolean, should all intermediate value be returned, useful for debugging.
#' @param ... Arguments passed to `read_generic()`.
#' @return A long data frame, with values returned for multiple measurements in the cytoplasm and nucleus.
#' @examples
#' library(assayr2)
#' \dontrun{
#' smad_raw <- system.file("extdata", "SMAD_Quant_example.txt", package = "assayr2")
#' smad <- read_smad(smad_raw)
#' }
#' @importFrom stats setNames
#' @importFrom glue glue
#' @importFrom dplyr mutate filter select
#' @importFrom tidyr fill spread separate
#' @export
read_smad <- function(file, uid_capture = ".*plate(.*)\\.tif",
                      keys = NULL, return_all = FALSE, ...) {
  output <- read_generic(file, ....) %>%
    stats::setNames("value")

  # guess keys if NULL
  if (is.null(keys)) {
    hep_keys <- c(
      "image_name", "nuclear.count", "nuclear.intensity", "nuclear.area",
      "dilated.count", "dilated.intensity", "dilated.area"
    )
    npc_keys <- hep_keys[-5] # doesn't have the secondary count

    num_measures <- grepl("^\\d", output[, 1]) %>% rle() %>% .$lengths %>% max() %>% add(1)

    if (num_measures == 6) {
      message(glue::glue("Using NPC measurement keys:{paste(c('', npc_keys), collapse = '\n')}"))
      keys <- npc_keys
    }
    else {
      message(glue::glue("Using HEP measurement keys:{paste(c('', hep_keys), collapse = '\n')}"))
      keys <- hep_keys
    }
    output$measure <- rep(keys, length.out = nrow(output))
  } else {
    output$measure <- rep(keys, length.out = nrow(output))
  }

  output <- dplyr::mutate(output,
    uid = ifelse(grepl("tif$", value),
      gsub(uid_capture, "\\1", value, ignore.case = TRUE),
      NA
    )
  ) %>%
    tidyr::fill(uid) %>%
    dplyr::mutate(plate_well = gsub("_s\\d$", "", uid)) %>%
    tidyr::separate(plate_well, c("plate", "well"), sep = "_", remove = TRUE, extra = "drop") %>%
    dplyr::filter(measure != "image_name") %>%
    dplyr::mutate(
      plate = gsub("^0+", "", plate),
      value = as.numeric(value)
    ) %>%
    tidyr::spread(measure, value) %>%
    # building "peri" organelle values
    dplyr::mutate(
      nuclear.intensity = nuclear.intensity * nuclear.count,
      nuclear.area = nuclear.area * nuclear.count,
      # NPC and HEP macros differ for dilated area measurements, NPC is total, HEP is avg
      dilated.intensity = if ("dilated.count" %in% keys) dilated.intensity * dilated.count else dilated.intensity,
      dilated.area = if ("dilated.count" %in% keys) dilated.area * dilated.count else dilated.area,
      peri.intensity = dilated.intensity - nuclear.intensity,
      peri.area = dilated.area - nuclear.area,
      nuclear.mean_intensity = nuclear.intensity / nuclear.area,
      peri.mean_intensity = peri.intensity / peri.area,
      nuclear.log_ratio = log2(nuclear.mean_intensity / peri.mean_intensity)
    )

  if (!return_all) output <- dplyr::select(output, uid:well, nuclear.count, nuclear.mean_intensity:nuclear.log_ratio)

  output
}
hemoshear/assayr2 documentation built on Nov. 8, 2019, 6:13 p.m.