R/01_prepare_data.R

Defines functions convert_flowset compile_fcs

Documented in compile_fcs convert_flowset

#### Compile FCS files ----


#' Compile all .fcs files in a directory to a flowset
#'
#' A simple function to compile a directory of FCS files into a flowSet object
#'  using flowCore::read.flowSet().
#'  Use the pattern argument to select only a subset of FCS files.
#'
#' @param data_dir Directory containing the .fcs files
#' @param pattern The pattern to use to find the files in the folder
#' @inheritParams flowCore::read.FCS
#' @family dataprep
#' @examples
#' \dontrun{
#' fcs <- compile_fcs(data_dir = "_data/raw", pattern = "\\.fcs")
#' }
#' @export
compile_fcs <- function(
    data_dir,
    pattern = "\\.fcs",
    column.pattern = NULL,
    invert.pattern = FALSE
    ) {

  # Error checking
  if (data_dir %>% endsWith("/")) {
    data_dir <- stringr::str_sub(data_dir, end = -2)
  }

  # Specifying files to use
  files <- list.files(data_dir,
                      pattern = pattern,
                      recursive = FALSE,
                      full.names = TRUE) %>%
    sort()
  if (length(files) == 0) stop("No files found in folder \"", data_dir, "\"")

  # Read the data files
  message(paste("Reading", length(files), "files to a flowSet.."))
  fcs_raw <- files %>%
    flowCore::read.flowSet(transformation = FALSE,
                           truncate_max_range = FALSE,
                           emptyValue = FALSE,
                           column.pattern = column.pattern,
                           invert.pattern = invert.pattern)

  return(fcs_raw)
}






#' Convert a flowSet into a tibble
#'
#' Use this function to convert a flowSet into the tibble
#'  object that the remaining functions in cyCombine relies on.
#'  A tibble is a Tidyverse implementation of a data.frame and can be treated as a such.
#'  The majority of arguments revolves adding relevant info from the metadata file/object.
#'  The panel argument is included to adjust the output column names using a panel with channel and antigen columns.
#'  Bear in mind the column names will be altered with the following:
#'  \code{stringr::str_remove_all("^\\d+\[A-Za-z\]+_") %>% stringr::str_remove_all("\[ _-\]")}.
#'
#'
#'
#' @param flowset The flowset to convert
#' @param metadata Optional: Can be either a filename or data.frame of the metadata file. Please give the full path from working directory to metadata file
#' @param sample_ids Optional: If a character, it should be the sample column in the metadata. If its a vector, it should have the same length as the total flowset. If NULL, sample ids will be the file names. If a single value, all rows will be assigned this value.
#' @param batch_ids Optional: If a character, it should be the column in the metadata containing the batch ids. If its a vector, it should have the same length as the total flowset. If a single value, all rows will be assigned this value.
#' @param filename_col Optional: The column in the metadata containing the fcs filenames. Needed if metadata is given, but sample_ids is not
#' @param condition Optional: The column in the metadata containing the condition. Will be used as the covariate in ComBat, but can be specified later. You may use this to add a different column of choice, in case you want to use a custom column in the ComBat model matrix.
#' @param anchor Experimental: The column in the metadata referencing the anchor samples (control references). Will be used as a covariate in ComBat, if specified. Please be aware that this column may be confounded with the condition column. You may use this to add a different column of choice, in case you want to use a custom column in the ComBat model matrix. You may use a custom column name, but it is good practice to add the name to the 'non_markers' object exported by cyCombine, to reduce the risk of unexpected errors.
#' @param down_sample If TRUE, the output will be down-sampled to size sample_size
#' @param sample_size The size to down-sample to. If a non-random sampling type is used and a group contains fewer cells than the sample_size, all cells of that group will be used.
#' @param sampling_type The type of down-sampling to use. "random" to randomly select cells across the entire dataset, "batch_ids" to sample evenly (sample_size) from each batch, or "sample_ids" sample evenly (sample_size) from each sample.
#' @param seed The seed to use for down-sampling
#' @param clean_colnames (Default: TRUE). A logical defining whether column names should be cleaned or not. Cleaning involves removing isotope tags, spaces, dashes, underscores, and all bracket types.
#' @param panel Optional: Panel as a filename or data.frame. Is used to define colnames from the panel_antigen column
#' @param panel_channel Optional: Only used if panel is given. It is the column name in the panel data.frame that contains the channel names
#' @param panel_antigen Optional: Only used if panel is given. It is the column name in the panel data.frame that contains the antigen names
#' @family dataprep
#' @examples
#' \dontrun{
#' df <- convert_flowset(flowset = flowset,
#'  metadata = file.path(data_dir, "metadata.csv"),
#'  filename_col = "FCS_files",
#'  sample_ids = "sample_id",
#'  batch_ids = "batch_ids",
#'  down_sample = FALSE)
#'  }
#' @export
convert_flowset <- function(flowset,
                            metadata = NULL,
                            filename_col = "filename",
                            sample_ids = NULL,
                            batch_ids = NULL,
                            condition = NULL,
                            anchor = NULL,
                            down_sample = TRUE,
                            sample_size = 500000,
                            sampling_type = "random",
                            seed = 473,
                            clean_colnames = TRUE,
                            panel = NULL,
                            panel_channel = "fcs_colname",
                            panel_antigen = "antigen"){
  # Extract information necessary both with and without included metadata
  ## FlowSet row numbers
  nrows <- flowCore::fsApply(flowset, nrow)

  ## File names from flowset
  files <- flowset@phenoData %>%
    rownames() %>%
    basename()
  # Add metadata information (long)
  if(!is.null(metadata)){
    # Error handling
    if(is.null(filename_col)){
      stop("Please specify a filename_col.")
    }
    if("character" %in% class(metadata)){
      if(!file.exists(file.path(metadata))){
        stop("File \"", file.path(metadata), "\" was not found")
      }

      # Get metadata
      if(endsWith(metadata, suffix = ".xlsx")){
        metadata <- suppressMessages(file.path(metadata) %>%
                                       readxl::read_xlsx())
      } else if(endsWith(metadata, suffix = ".csv")){
        metadata <- suppressMessages(file.path(metadata) %>%
                                       readr::read_csv())
      } else {
        stop("Sorry, the given metadata is not in a supported format. Please use a .xlsx or .csv file.\n",
                            "Alternatively, a data.frame of the metadata can be used.")
      }
    }

    # Check for errors in metadata columns
    md_cols <- colnames(metadata)
    cyCombine:::check_colname(md_cols, filename_col)


    # Extract info from metadata
    if(!endsWith(tolower(metadata[[filename_col]][1]), ".fcs")){
      metadata[[filename_col]] <- paste0(metadata[[filename_col]], ".fcs")
    }
    # Check that all metadata rows has a file
    if(any(metadata[[filename_col]] %!in% files)){
      missing_files <- metadata[[filename_col]][metadata[[filename_col]] %!in% files]
      warning("The following samples in the metadata were not found in the provided folder and will be ignored.\n", stringr::str_c(missing_files, collapse = ", "))
      # Remove files from metadata
      metadata <- metadata[metadata[[filename_col]] %in% files,]
    }

    # Check that all files are represented in metadata
    if(any(files %!in% metadata[[filename_col]])){
      missing_files <- files[files %!in% metadata[[filename_col]]]
      warning("The samples were not found in the metadata file and will be ignored.\n", stringr::str_c(missing_files, collapse = ", "))
      files <- files[files %in% metadata[[filename_col]]]
      nrows <- nrows[rownames(nrows) %in% files,] %>% as.matrix()
      flowset <- flowset[files]
    }

    # Get sample ids
    if (is.null(sample_ids)){
      sample_ids <- files %>%
        stringr::str_remove(".fcs") %>%
        stringr::str_remove(".FCS") %>%
        rep(nrows)
    } else if (length(sample_ids) == 1){
      cyCombine:::check_colname(md_cols, sample_ids)
      sample_ids <- metadata[[sample_ids]][match(files, metadata[[filename_col]])] %>%
        stringr::str_remove(".fcs") %>%
        stringr::str_remove(".FCS") %>%
        rep(nrows)
    }

    # Get batch ids
    if (!is.null(batch_ids)){
      if(length(batch_ids) == 1){
        cyCombine:::check_colname(md_cols, batch_ids)
        batch_ids <- metadata[[batch_ids]][match(files, metadata[[filename_col]])] %>%
          as.factor() %>%
          rep(nrows)
      }
    }
    # Get condition
    if(!is.null(condition)){
      if(length(condition == 1)){
        cyCombine:::check_colname(md_cols, condition)
        condition <- metadata[[condition]][match(files, metadata[[filename_col]])] %>%
          as.factor() %>%
          rep(nrows)
      }
    }
    # Get anchor
    if(!is.null(anchor)){
      if(length(anchor == 1)){
        cyCombine:::check_colname(md_cols, anchor)
        anchor <- metadata[[anchor]][match(files, metadata[[filename_col]])] %>%
          as.factor() %>%
          rep(nrows)
      }
    }


  } else{ # If no metadata given
    if(is.null(sample_ids)){# Get sample ids from filenames
      sample_ids <- files %>%
        stringr::str_remove(".fcs") %>%
        stringr::str_remove(".FCS") %>%
        rep(nrows)
    }
  }
  ## To down sample within fsApply
  tot_nrows <- sum(nrows)
  ids <- 1:tot_nrows
  # Down sampling setup
  if (down_sample){
    set.seed(seed)
    # Sorting here enables major resource savings when down-sampling
    # For non-random sampling, the dplyr::slice allows down-sampling to group size if there are less cells than sample_size.
    if (sampling_type == "random"){
      message("Down sampling to ", sample_size, " cells")
      sample <- sample(ids, sample_size) %>%
        sort()
    } else if ((sampling_type == "batch_ids") && !is.null(batch_ids)){ # even down-sampling from batches
      message(paste("Down sampling to", sample_size, "cells per batch"))
      sample <- tibble::tibble(batch_ids, ids) %>%
        dplyr::group_by(batch_ids) %>%
        dplyr::slice(sample(dplyr::n(), min(sample_size, dplyr::n()))) %>%
        dplyr::pull(ids) %>%
        sort()
      tiny_batches <- table(batch_ids) < sample_size
      if(any(tiny_batches)) message("Please be aware that batches the following batches contained less than ",
                                    sample_size, " cells:\n",
                                    stringr::str_c(names(tiny_batches), collapse = ", "))
    } else{ # Even down-sampling from samples
      message(paste("Down sampling to", sample_size, "cells per sample"))
      sample <- tibble::tibble(sample_ids, ids) %>%
        dplyr::group_by(sample_ids) %>%
        dplyr::slice(sample(dplyr::n(), min(sample_size, dplyr::n()))) %>%
        dplyr::pull(ids) %>%
        sort()
      tiny_samples <- table(sample_ids) < sample_size
      if(any(tiny_samples)) message("Please be aware that batches the following batches contained less than",
                                    sample_size, "cells:\n",
                                    stringr::str_c(names(tiny_samples), collapse = ", "))
    }


    # Down-sample metadata columns
    ids <- ids[sample]
    if(!is.null(sample_ids) && length(sample_ids) > 1) sample_ids <- sample_ids[sample]
    if(!is.null(batch_ids) && length(batch_ids) > 1) batch_ids <- batch_ids[sample]
    if(!is.null(condition) && length(condition) > 1) condition <- condition[sample]
    if(!is.null(anchor) && length(anchor) > 1) anchor <- anchor[sample]
  }

  message("Extracting expression data..")
  fcs_data <- flowset %>%
    purrr::when(down_sample ~ flowCore::fsApply(., cyCombine:::fcs_sample,
                                                sample = sample,
                                                nrows = nrows),
                ~ flowCore::fsApply(., Biobase::exprs)) %>%
    tibble::as_tibble() %>%
    dplyr::mutate(id = ids) %>%
    dplyr::select(id, dplyr::everything())

  # Clean column names
  if (!is.null(panel)){
    if("character" %in% class(panel)){
      if(endsWith(panel, suffix = ".xlsx")){
        panel <- suppressMessages(file.path(panel) %>%
                                       readxl::read_xlsx())
      } else if(endsWith(panel, suffix = ".csv")){
        panel <- suppressMessages(file.path(panel) %>%
                                       readr::read_csv())
      } else {
        stop("Sorry, the panel file is not in a supported format. Please use a .xlsx or .csv file.")
      }
    }
    cols <- match(colnames(fcs_data), panel[[panel_channel]]) %>%
      .[!is.na(.)]
    col_names <- panel[[panel_antigen]][cols]

    fcs_data <- fcs_data %>%
      dplyr::select(id, dplyr::all_of(panel[[panel_channel]][cols]))
  }else{
    col_names <- flowset[[1]] %>%
      flowCore::parameters() %>%
      Biobase::pData() %>%
      dplyr::pull(desc)
  }
  if(clean_colnames) {
    col_names <- col_names %>%
      stringr::str_remove_all("^\\d+[A-Za-z]+_") %>%
      stringr::str_remove_all("[- _\\[\\](){}]")
    }
  colnames(fcs_data) <- c("id", col_names)

  # Add optional columns
  if(!is.null(sample_ids)) fcs_data$sample <- sample_ids
  if(!is.null(batch_ids)) fcs_data$batch <- batch_ids
  if(!is.null(condition)) fcs_data$condition <- condition
  if(!is.null(anchor)) fcs_data$anchor <- anchor

  message("Your flowset is now converted into a dataframe.")
  return(fcs_data)
}




#' Extract from a flowset given a sample of indices
#' @noRd
fcs_sample <- function(flowframe, sample, nrows, seed = 473){

  # Determine which flowframe was given
  ff_name <- flowCore::keyword(flowframe)$FILENAME %>%
      basename()
  ff_number <- which(rownames(nrows) == ff_name)

  # Down-sample based on accumulated nrows (ensures the correct rows are extracted from each flowframe)
  nrows_acc <- c(0, nrows %>%
                   purrr::accumulate(`+`))
  ff_sample <- sample - nrows_acc[ff_number]
  ff_sample <- ff_sample[ff_sample > 0]
  ff_sample <- ff_sample[ff_sample <= nrows[ff_number]]

  # Extract expression data
  ff <- flowframe %>%
    Biobase::exprs()
  ff <- ff[ff_sample, ]

  return(ff)
}




#### Data transformation ----

#' Transform data using asinh
#'
#' Inverse sine transformation of a tibble.
#'  This function can also de-randomize data.
#'
#' @param df The tibble to transform
#' @param markers The markers to transform on
#' @param cofactor The cofactor to use when transforming
#' @param derand Derandomize. Should be TRUE for CyTOF data, otherwise FALSE.
#' @param .keep Keep all channels. If FALSE, channels that are not transformed are removed
#' @param reverse Reverses the asinh transformation if TRUE
#' @family dataprep
#' @examples
#' \dontrun{
#' uncorrected <- df %>%
#'   transform_asinh(markers = markers)
#'   }
#' @export
transform_asinh <- function(df,
                            markers = NULL,
                            cofactor = 5,
                            derand = TRUE,
                            .keep = FALSE,
                            reverse = FALSE){
  if(is.null(markers)){
    markers <- df %>%
      cyCombine::get_markers()
  }
  # Use global non_markers if available
  if(!is.null(.GlobalEnv$non_markers)) non_markers <- .GlobalEnv$non_markers

  if(any(markers %!in% colnames(df))){
    mes <- stringr::str_c("Not all given markers are in the data.\nCheck if the markers contain a _ or -:",
                          stringr::str_c(markers, collapse = ", "),
                          "Columns:",
                          stringr::str_c(colnames(df), collapse = ", "),
                          sep = "\n"
    )
    stop(mes)
  } else if(.keep & any(colnames(df) %!in% unique(colnames(df)))){
    stop("Your data contains non-unique column names. Please ensure they are unique. The column names are: ", stringr::str_c(colnames(df), collapse = ", "))
  }
  message("Transforming data using asinh with a cofactor of ", cofactor, "..")
  transformed <- df %>%
    purrr::when(.keep ~ .,
                ~ dplyr::select_if(., colnames(.) %in% c(markers, non_markers))) %>%
    # Transform all data on those markers
    dplyr::mutate(dplyr::across(dplyr::all_of(markers),
                     .fns = function(x){
                       if(derand & !reverse) x <- ceiling(x)
                       if(reverse) sinh(x)*cofactor else asinh(x/cofactor)
                     }))
  return(transformed)
}

#### Wrapper function ----



#' Prepare a directory of .fcs files
#'
#' This is a wrapper function that takes you from a directory of .fcs files or a flowset to a transformed tibble.
#'
#'
#' @param flowset Optional: Prepare a flowset instead of a directory of fcs files
#' @inheritParams compile_fcs
#' @inheritParams convert_flowset
#' @inheritParams transform_asinh
#' @param transform If TRUE, the data will be transformed; if FALSE, it will not.
#' @family dataprep
#' @examples
#' \dontrun{
#' uncorrected <- data_dir %>%
#'   prepare_data(metadata = "metadata.csv",
#'   markers = markers,
#'   filename_col = "FCS_name",
#'   batch_ids = "Batch",
#'   condition = "condition",
#'   down_sample = FALSE)
#'   }
#' @export
prepare_data <- function(data_dir = NULL,
                         flowset = NULL,
                         markers = NULL,
                         pattern = "\\.fcs",
                         metadata = NULL,
                         filename_col = "filename",
                         sample_ids = NULL,
                         batch_ids = NULL,
                         condition = NULL,
                         anchor = NULL,
                         down_sample = TRUE,
                         sample_size = 500000,
                         sampling_type = "random",
                         seed = 473,
                         panel = NULL,
                         panel_channel = "fcs_colname",
                         panel_antigen = "antigen",
                         transform = TRUE,
                         cofactor = 5,
                         derand = TRUE,
                         .keep = FALSE,
                         clean_colnames = TRUE){

  # Stop if no data is given
  if(is.null(data_dir) & is.null(flowset)) stop("No data given.")

  if(!is.null(data_dir)){
    # Remove slash at end of data_dir
    if(data_dir %>% endsWith("/")) data_dir <- data_dir %>% stringr::str_sub(end = -2)

    # Compile directory to flowset
    if(is.null(flowset)){
      flowset <- data_dir %>%
        cyCombine::compile_fcs(pattern = pattern)
    }

    # Look for metadata in data_dir
    if(!is.null(metadata)){
      if(!"data.frame" %in% class(metadata)){
        if(!file.exists(file.path(metadata)) & file.exists(file.path(data_dir, metadata))) metadata <- file.path(data_dir, metadata)
      }
    }
  }
  # Convert flowset to dataframe
  fcs_data <- flowset %>%
    cyCombine::convert_flowset(metadata = metadata,
                               filename_col = filename_col,
                               sample_ids = sample_ids,
                               batch_ids = batch_ids,
                               condition = condition,
                               anchor = anchor,
                               down_sample = down_sample,
                               sample_size = sample_size,
                               sampling_type = sampling_type,
                               seed = seed,
                               panel = panel,
                               panel_channel = panel_channel,
                               panel_antigen = panel_antigen,
                               clean_colnames = clean_colnames) %>%
    # Transform dataset with asinh
    purrr::when(transform ~ cyCombine::transform_asinh(., markers = markers,
                                                       cofactor = cofactor,
                                                       derand = derand,
                                                       .keep = .keep),
                ~ .)

  message("Done!")
  return(fcs_data)
}
biosurf/cyCombine documentation built on May 23, 2024, 4:07 a.m.