R/wsp_get_ff.R

Defines functions wsp_get_ff

Documented in wsp_get_ff

#' Get (subsetted) flowFrames from one or many flowjo workspaces
#'
#' From a flowjo workspace with gated populations and the respective fcs files flowframes are generated.
#' Under the hood CytoML::flowjo_to_gatingset applies the geometric gate definitions and filters
#' relevant rows (indices) of fcs files. The compensation matrix as defined in flowjo will be used
#' to compensate fluorescence intensities.
#'
#' If it is intended to pass flowframes to fcexpr::dr_to_fcs, it is recommended to have transformed
#' and untransformed expression values returned.
#'
#' @param wsp vector of paths to flowjo workspaces
#' @param FCS.file.folder path to folder(s) of FCS files; may be one path for all wsp or a vector of paths, one for each wsp;
#' if not provided fcs file paths are derived individually from the wsp (xml). If the workspace was generated and saved on
#' another computer you will have to provide the path to FCS files on the current computer.
#' @param groups vector or list of groups in flowjo to consider; if a list, each index corresponds to the same index in wsp;
#' if NULL samples from all groups are read
#' @param population which population (=node, =gate) to subset flowFrames on; use fcexpr::wsx_get_poppaths to get paths
#' @param invert_groups logical whether to invert group selection
#' @param samples vector or list of samples to select (names of FCS files), each index corresponds to the index in wsp;
#' if NULL all samples (from selected groups) are read
#' @param invert_samples logical whether to invert sample selection
#' @param downsample numeric, if < 0 then a fraction of events is sampled, if > 0 an absolute number of events is sampled; or set to "min"
#' which will lead to downsampling each flowframe to the number of events in the flowframe with lowest number of events; can be a single value to treat all
#' FCS files equally or can be a vector of same length as FCS files
#' @param remove_redundant_channels remove channels that are not part of the gating tree, mainly to reduce memory load
#' @param lapply_fun lapply function name, unquoted; lapply, pbapply::pblapply or parallel::mclapply are suggested
#' @param ... additional argument to the lapply function; mainly mc.cores when parallel::mclapply is chosen
#' @param return_untransformed logical; do return untransformed (inverse) data
#' @param return_transformed logical; do return transformed data (transformation as in flowjo?!)
#' @param seed set a seed to reproduce downsampling
#' @param channels channels to use for leverage score calculation; use wsx_get_keywords to retrieve channel names/descriptions
#' @param leverage_score_for_sampling logical whether to use leverage scores for downsampling
#'
#' @return a list of (subsetted) flowframes with events that are within the gated population only
#' @export
#'
#' @importFrom magrittr "%>%"
#'
#' @examples
#'\dontrun{
#' ff_list <- fcexpr::wsp_get_ff(wsp = "mypath/my.wsp", population = "CD8+")
#' # ff.list[[1]] may be passed to fcexpr::dr_to_fcs for instance
#'}
wsp_get_ff <- function(wsp,
                       FCS.file.folder = NULL,
                       groups = NULL,
                       population,
                       invert_groups = F,
                       samples = NULL,
                       invert_samples = F,
                       return_untransformed = T,
                       return_transformed = T,
                       downsample = 1,
                       remove_redundant_channels = F,
                       lapply_fun = lapply,
                       seed = 42,
                       channels = NULL,
                       leverage_score_for_sampling = F,
                       ...) {

  if (!requireNamespace("BiocManager", quietly = T)) {
    utils::install.packages("BiocManager")
  }
  if (!requireNamespace("CytoML", quietly = T)) {
    BiocManager::install("CytoML")
  }
  if (!requireNamespace("flowWorkspace", quietly = T)) {
    BiocManager::install("flowWorkspace")
  }
  lapply_fun <- match.fun(lapply_fun)

  checked_in <- check_in(wsp = wsp, groups = groups, samples = samples, FCS.file.folder = FCS.file.folder,
                         return_untransformed = return_untransformed,
                         return_transformed = return_transformed)
  groups <- checked_in[["groups"]]
  samples <- checked_in[["samples"]]
  FCS.file.folder <- checked_in[["FCS.file.folder"]]

  smpl <- get_smpl_df(wsp = wsp,
                      groups = groups,
                      invert_groups = invert_groups,
                      samples = samples,
                      invert_samples = invert_samples,
                      FCS.file.folder = FCS.file.folder)
  if (is.null(smpl)) {
    return(NULL)
  }

  if (is.numeric(downsample)) {
    ds <- downsample
  } else if (all(downsample == "min")) {
    ds <- 1
  } else {
    stop("downsample has to be numeric or 'min'. With min all flowframes will be downsampled to that flowframe with the lowest number of events.")
  }

  # check length of downsample equal to length of ind_mat or equal to 1
  if (length(ds) != 1 && length(ds) != nrow(smpl)) {
    stop("downsample has to have length 1 or length of ind_mat (one value for each FCS file).")
  }

  if (length(ds) != 1) {
    smpl$downsample <- 1
    for (x in 1:nrow(smpl)) {
      smpl$downsample[x] <- ds[x]
    }
  }

  if (any(table(smpl$FilePath) > 1)) {
    warning("Same FCS files found in multiple workspaces. This cannot be handled. Please provide the samples and/or groups argument or fix manually.")
    stop(smpl$FilePath[which(table(smpl$FilePath) > 1)])
  }


  pp <- do.call(rbind, lapply(wsp, function(x) {
    wsx_get_poppaths(x, collapse = F)
  }))
  pp <- pp[which(pp$FileName %in% smpl$FileName),]
  pp <-
    pp %>%
    dplyr::group_by(PopulationFullPath, Population, ws) %>%
    dplyr::summarise(FileName = list(FileName), .groups = "drop")
  pp <- as.data.frame(pp)
  pp_is <- unique(unlist(pp[unique(c(which(pp$PopulationFullPath == population), which(pp$Population == population))), "FileName"]))
  if (is.null(pp_is)) {
    stop("Population was not found for any sample.")
  }
  if (length(smpl$FileName[which(!smpl$FileName %in% pp_is)]) > 0) {
    message("For ", paste(smpl$FileName[which(!smpl$FileName %in% pp_is)], collapse = ", "), " population was not found.")
  }
  smpl <- smpl[which(smpl$FileName %in% pp_is),]

  ff.list <- lapply_fun(stats::setNames(split(smpl, 1:nrow(smpl)), smpl$FileName),
                        get_ff,
                        return_untransformed = return_untransformed,
                        return_transformed = return_transformed,
                        downsample = ds,
                        remove_redundant_channels = remove_redundant_channels,
                        population = population,
                        seed = seed,
                        channels = channels,
                        leverage_score_for_sampling = leverage_score_for_sampling,
                        ...)

  if (all(downsample == "min")) {
    min <- min(unlist(lapply(sapply(sapply(ff.list, "[", 1), "[", 1), nrow)))
    for (x in seq_along(ff.list)) {
      set.seed(seed)
      inds <- sample(c(rep(T, min), rep(F, nrow(ff.list[[x]][[1]][[1]])-min)))
      for (y in seq_along(ff.list[[x]][[1]])) {
        ff.list[[x]][[1]][[y]] <- subset(ff.list[[x]][[1]][[y]], inds)
      }
      ff.list[[x]][["ind_mat"]] <- ff.list[[x]][["ind_mat"]][inds,]
    }
  }

  return(list(flowframes = lapply(stats::setNames(seq_along(ff.list[[1]][[1]]), names(ff.list[[1]][[1]])),
                                  function(x) lapply(lapply(ff.list, "[[", 1), "[[", x)),
              indices = stats::setNames(sapply(ff.list, "[", 2), smpl$FileName)))
}
Close-your-eyes/fcexpr documentation built on Sept. 29, 2023, 12:27 a.m.