R/sample.R

Defines functions sample

Documented in sample

utils::globalVariables(c("Assay_Plate_Barcode", "replicate_id"))
#' Sample selects replicates across specified plates.
#'
#' \code{sample} replicates across specified plates, and aggregates the
#' result in a single file. It assumes the following files exist:
#' \itemize{
#'   \item \code{<workspace_dir>/metadata/barcode_platemap.csv} which maps the
#'   plate barcodes to the corresponding platemaps,
#'   \item For each plate barcode, a CSV file
#' \code{<workspace_dir>/backend/<batch_id>/<file-matching-specified-pattern>}
#' which will be among the files included in the sample.
#' }
#'
#' @param batch_id        Batch ID.
#' @param pattern         Regular expression specifying the filenames containing the profiles from which to sample. Only CSVs are allowed.
#' @param output          Output file. The extension should be either csv, rds, or feather.
#' @param replicates      Number of replicates to select per plate map. Selects all replicates if \code{NULL}. default: \code{NULL}.
#' @param workspace_dir   Root directory containing backend and metadata subdirectories. Can be relative or absolute. default: \code{"."}.
#' @importFrom magrittr %<>%
#' @importFrom magrittr %>%
#' @importFrom magrittr extract2
#' @importFrom rlang .data
#' @importFrom utils head
#' @importFrom utils tail
#' @export
sample <- function(batch_id, pattern, output,
                   replicates = NULL,
                   workspace_dir = ".") {
  backend_dir <- file.path(workspace_dir, "backend", batch_id)

  metadata_dir <- file.path(workspace_dir, "metadata", batch_id)

  file_list <- list.files(
    backend_dir,
    pattern = pattern,
    recursive = T, full.names = T
  )

  if (!is.null(replicates)) {
    # get the list of plates that retrieved using the pattern
    plate_list_retrieved <-
      dplyr::tibble(
        Assay_Plate_Barcode = lapply(file_list, function(file) {
          head(tail(stringr::str_split(file, "/")[[1]], 2), 1)
        }) %>%
          unlist()
      )

    replicates <- as.integer(replicates)

    # create a plate_list based on number of replicates to be selected
    plate_list <-
      suppressMessages(
        readr::read_csv(
          file.path(metadata_dir, "barcode_platemap.csv"),
          col_types = readr::cols(
            Assay_Plate_Barcode = readr::col_character(),
            Plate_Map_Name = readr::col_character()
          )
        )
      ) %>%
      dplyr::select("Assay_Plate_Barcode", "Plate_Map_Name") %>%
      dplyr::inner_join(plate_list_retrieved, by = "Assay_Plate_Barcode") %>%
      dplyr::group_by("Plate_Map_Name") %>%
      dplyr::arrange(Assay_Plate_Barcode) %>%
      dplyr::mutate(
        replicate_id = dplyr::dense_rank(.data$Assay_Plate_Barcode)
      ) %>%
      dplyr::filter(replicate_id %in% seq(replicates)) %>%
      dplyr::ungroup() %>%
      dplyr::select("Assay_Plate_Barcode") %>%
      magrittr::extract2("Assay_Plate_Barcode")

    # filter file_list based on plate_list
    file_list <-
      lapply(file_list, function(file) {
        if (length(unlist(lapply(
          plate_list,
          function(plate) grep(plate, file)
        )))) {
          file
        }
      }) %>%
      unlist()
  }

  futile.logger::flog.info(
    sprintf(
      "Reading %d files...:\n%s", length(file_list),
      paste(file_list, collapse = "\n")
    )
  )

  df <- file_list %>%
    lapply(function(x) suppressMessages(readr::read_csv(x))) %>%
    dplyr::bind_rows()

  futile.logger::flog.info(sprintf("Output contains %d rows.", nrow(df)))

  if (tools::file_ext(output) == "rds") {
    saveRDS(df, output)
  } else if (tools::file_ext(output) == "csv") {
    readr::write_csv(df, output)
  } else if (tools::file_ext(output) == "feather") {
    feather::write_feather(df, output)
  } else {
    stop(paste0("Unsupported file extension: ", tools::file_ext(output)))
  }
}
cytomining/cytotools documentation built on Nov. 19, 2020, 10:23 p.m.