R/data_handling.R

Defines functions import_QS_files

Documented in import_QS_files

#' Import data
#' @description Imports .csv-files generated by QuantaSoft.
#' @param paths Character vector. Specifies which files and/or directories to
#'   load. For directories all \code{.csv} files within are loaded. A file
#'   that is given both directly and through its directory is loaded once.
#' @param Ch1_is_mutation Logical. Control if Ch1 is used to measure presence of
#'   mutant DNA (and Ch2 for wild type DNA). If this is FALSE Ch2 is used
#'   instead (and Ch1 for wild type).
#' @param annotations Named list or \code{data.frame}. Annotations to be added
#'   to all samples.
#' @param sample_annotations \code{data.frame}. Similar to \code{annotations},
#'   but annotations are specific to each sample. This needs to include a
#'   column "Sample", which will be used for joining.
#' @param merge_wells String. Controls if wells from the same sample
#'   ("Sample") should be merged within a dataset (file). There are 4 options:
#'   \itemize{
#'     \item \code{merge_wells="yes"}: Merge wells. Discards QS merged wells
#'       (e.g. "M01").
#'     \item \code{merge_wells="no"}: Do \strong{not} merge wells. Discards QS
#'       merged wells (e.g. "M01").
#'     \item \code{merge_wells="qs"}: The merged wells from QuantaSoft
#'       (e.g. "M01") are used if these are present.
#'     \item \code{merge_wells="none"}: No merging is done.
#'   }
#'   Default is "none".
#' @param merge_files Logical. If this is TRUE and \code{merge_wells="yes"},
#'   samples across files are also merged. Default is FALSE.
#'
#' @return A \code{data.frame} with the columns:
#'   \item{FileName:}{Name of the file imported.}
#'   \item{Well:}{Well identifier.}
#'   \item{Sample:}{Sample name from QuantaSoft.}
#'   \item{Ch1TargetType:}{The target type for channel 1.}
#'   \item{Ch2TargetType:}{The target type for channel 2.}
#'   \item{Target:}{Target name from QuantaSoft.}
#'   \item{MutantOnlyDroplets:}{The count of mutant only droplets.}
#'   \item{WildtypeOnlyDroplets:}{The count of wild type only droplets.}
#'   \item{DoubleNegativeDroplets:}{The count of double negative droplets.}
#'   \item{DoublePositiveDroplets:}{The count of double positive droplets.}
#'   \item{TotalDroplets:}{The total number of droplets.}
#'   \item{MergedWells:}{For wells merged with \code{merge_wells="yes"} this is
#'      a string of the wells merged. Otherwise this is the value exported by
#'      QuantaSoft.}
#'   \item{NumberOfMergedWells:}{The number of wells merged.}
#'
#' @seealso
#' \itemize{
#'   \item \code{\link{train_simple_ddpcr_model}}, \code{\link{test_tumor_sample_simple}}
#'   \item \code{\link{train_integrated_ddpcr_model}}, \code{\link{test_tumor_sample_integrated}}
#' }
#'
#' @export
#'
#' @importFrom readr read_csv cols
#' @importFrom utils file_test
#' @import dplyr stringr
import_QS_files <- function(paths,
                            Ch1_is_mutation = TRUE,
                            annotations = NULL,
                            sample_annotations = NULL,
                            merge_wells = "none",
                            merge_files = FALSE) {

  # ---- Argument validation (done before any file is read) ----

  # Check existence of files/folders
  file_exists <- file.exists(paths)
  if (!all(file_exists)) {
    stop(
      "The path(s) ",
      paste0("'", paths[!file_exists], "'", collapse = ", "),
      " does/do not exist.",
      call. = FALSE
    )
  }

  # Check if sample annotations has Sample column
  if (!is.null(sample_annotations) &&
      !("Sample" %in% colnames(sample_annotations))) {
    stop("'sample_annotations' does not include a column 'Sample'.",
         call. = FALSE)
  }

  # Reject unknown merge modes early instead of after all files are processed
  if (!is.character(merge_wells) || length(merge_wells) != 1L ||
      !(merge_wells %in% c("none", "yes", "no", "qs"))) {
    stop("merge_wells should be 'yes', 'no', 'qs' or 'none'", call. = FALSE)
  }

  # ---- Resolve which files to load ----

  # Paths are normalised so that the same file reached through different
  # spellings (e.g. "dir/a.csv" and "./dir/a.csv") is only loaded once.
  file_paths <- unique(normalizePath(paths[file_test("-f", paths)]))

  # The pattern is a regular expression: escape the dot and anchor at the end
  # so that only names ending in ".csv" are picked up.
  dir_file_paths <- list.files(
    paths[file_test("-d", paths)],
    pattern = "\\.csv$",
    full.names = TRUE
  )
  # list.files() also returns sub-directories matching the pattern; drop them
  dir_file_paths <- dir_file_paths[file_test("-f", dir_file_paths)]
  dir_file_paths <- setdiff(unique(normalizePath(dir_file_paths)), file_paths)

  if (length(file_paths) + length(dir_file_paths) == 0L) {
    stop("No .csv files were found in the supplied path(s).", call. = FALSE)
  }

  # ---- Read the files ----

  # Reads a set of QuantaSoft exports into one tibble; the originating path is
  # stored in "FilePath" and "MergedWells" is always read as character.
  read_qs_csv <- function(csv_paths) {
    suppressWarnings(
      read_csv(
        csv_paths, id = "FilePath", show_col_types = FALSE,
        col_types = cols(.default = "?", MergedWells = "c")
      )
    )
  }

  # Directly named files and directory contents are read separately (as two
  # calls) and then row-bound; empty groups are skipped.
  path_groups <- Filter(
    function(p) length(p) > 0L,
    list(file_paths, dir_file_paths)
  )
  df <- bind_rows(lapply(path_groups, read_qs_csv))

  # ---- Combine channel 1 and channel 2 rows into one row per well ----

  join_keys <- c("FilePath", "Well", "ExptType", "Experiment", "Sample")

  # Channel 1 data (carries all measurement columns)
  ch1_df <- df %>%
    filter(grepl("Ch1", .data$TargetType)) %>%
    mutate(Ch1TargetType = str_remove(.data$TargetType, pattern = "Ch1")) %>%
    select(-all_of("TargetType"))

  # Channel 2 data (only the target type is needed)
  ch2_df <- df %>%
    filter(grepl("Ch2", .data$TargetType)) %>%
    mutate(Ch2TargetType = str_remove(.data$TargetType, pattern = "Ch2")) %>%
    select(all_of(c(join_keys, "Ch2TargetType")))

  df <- full_join(ch1_df, ch2_df, by = join_keys)

  # ---- Clean up data ----

  # Which droplet cluster counts as "mutant only" depends on the channel setup
  mutant_col <- if (Ch1_is_mutation) "Ch1+Ch2-" else "Ch1-Ch2+"
  wildtype_col <- if (Ch1_is_mutation) "Ch1-Ch2+" else "Ch1+Ch2-"

  df <- df %>%
    mutate(
      FileName = basename(.data$FilePath)
    ) %>%
    select(
      "FileName",
      "Well", "Sample", "Ch1TargetType", "Ch2TargetType", "Target",
      "Ch1+Ch2-", "Ch1-Ch2+", "Ch1-Ch2-", "Ch1+Ch2+",
      "AcceptedDroplets", "MergedWells"
    ) %>%
    rename(
      MutantOnlyDroplets = all_of(mutant_col),
      WildtypeOnlyDroplets = all_of(wildtype_col),
      DoubleNegativeDroplets = "Ch1-Ch2-",
      DoublePositiveDroplets = "Ch1+Ch2+",
      TotalDroplets = "AcceptedDroplets"
    ) %>%
    mutate(
      # Number of comma separated wells in MergedWells; 1 when it is NA
      NumberOfMergedWells = str_count(
        ifelse(is.na(.data$MergedWells), "", .data$MergedWells),
        pattern = ","
      ) + 1
    )

  # ---- Merging wells ----
  # Rows whose Well contains "M" are treated as QuantaSoft merged wells.
  # merge_wells == "none" leaves the data untouched.

  if (merge_wells == "yes") {
    # Drop QS merged wells and group the remaining wells per sample
    # (and per file unless merging across files was requested)
    group_cols <- if (merge_files) "Sample" else c("Sample", "FileName")
    grouped_df <- df %>%
      filter(!grepl("M", .data$Well)) %>%
      group_by(across(all_of(group_cols)))

    # Samples measured in a single well are kept as they are
    single_df <- grouped_df %>%
      filter(n() == 1) %>%
      ungroup()

    # Samples measured in several wells are collapsed into one row each
    merged_df <- grouped_df %>%
      filter(n() > 1) %>%
      summarise(
        FileName = paste0(unique(.data$FileName), collapse = ","),
        Target = paste0(unique(.data$Target), collapse = ","),
        Ch1TargetType = paste0(unique(.data$Ch1TargetType), collapse = ","),
        Ch2TargetType = paste0(unique(.data$Ch2TargetType), collapse = ","),
        WildtypeOnlyDroplets = sum(.data$WildtypeOnlyDroplets),
        MutantOnlyDroplets = sum(.data$MutantOnlyDroplets),
        DoubleNegativeDroplets = sum(.data$DoubleNegativeDroplets),
        DoublePositiveDroplets = sum(.data$DoublePositiveDroplets),
        TotalDroplets = sum(.data$TotalDroplets),
        NumberOfMergedWells = n(),
        MergedWells = paste0("(", paste0(.data$Well, collapse = ","), ")"),
        .groups = "drop"
      ) %>%
      mutate(
        # Merged rows get a synthetic well id: M01, M02, ...
        Well = sprintf("M%02d", row_number())
      )

    df <- bind_rows(merged_df, single_df)
  } else if (merge_wells == "no") {
    # Remove QS merged samples
    df <- df %>%
      filter(!grepl("M", .data$Well))
  } else if (merge_wells == "qs") {
    # Samples for which QuantaSoft exported a merged well. Samples are matched
    # by name across all imported files.
    qs_merged_samples <- df %>%
      filter(grepl("M", .data$Well)) %>%
      pull("Sample") %>%
      unique()

    # If sample is merged: keep only the merged row(s)
    # If sample is NOT merged: keep the original well(s)
    df <- df %>%
      filter(
        grepl("M", .data$Well) | !(.data$Sample %in% qs_merged_samples)
      )
  }

  # ---- Annotations ----

  # Annotations shared by all samples are appended as extra columns
  if (!is.null(annotations)) {
    df <- df %>% bind_cols(annotations)
  }

  # Sample specific annotations are joined on the sample name
  if (!is.null(sample_annotations)) {
    df <- df %>% left_join(sample_annotations, by = "Sample")
  }

  # Arrange columns: standard columns first, annotation columns after
  df %>%
    relocate(
      "FileName", "Sample",
      "Well", "Ch1TargetType", "Ch2TargetType", "Target",
      "DoubleNegativeDroplets", "WildtypeOnlyDroplets", "MutantOnlyDroplets",
      "DoublePositiveDroplets",
      "TotalDroplets", "NumberOfMergedWells", "MergedWells"
    )
}
simondrue/castle documentation built on Jan. 29, 2022, 3:04 a.m.