R/debarcoder.R

Defines functions debarcode debarcoderPlots debarcoderExportDebarcodedFcs debarcoderUnlabelEvents debarcoderLabelEvents debarcoderPrepareFcs debarcoderImportKey .verifyKey .verifyExprsList

Documented in debarcode debarcoderExportDebarcodedFcs debarcoderImportKey debarcoderLabelEvents debarcoderPlots debarcoderPrepareFcs debarcoderUnlabelEvents

.verifyExprsList <- function(exprs_list) {
  if (!is.list(exprs_list)) stop("exprs_list is not a list")
  if (is.null(exprs_list$exprs)) stop("exprs_list is missing exprs field")
  if (is.null(exprs_list$exprs_sorted)) {
    stop("exprs_list is missing exprs_sorted field")
  }
}

.verifyKey <- function(key) {
  if (!is.list(key)) stop("key is not a list")
  if (is.null(key$key)) stop("key is missing key field")
  if (is.null(key$channels)) stop("key is missing channels field")
  if (is.null(key$n_pos_channels)) stop("key is missing n_pos_channels field")
}

#' Import a barcoding key.
#'
#' Import a barcoding key from a CSV file. The CSV header should be of the
#' format:
#'
#' code,channel_name_1,channel_name_2,...
#'
#' Channel names should be explicit (such as Pd102Di). Channel values should be
#' denoted as 1 (channel positive for code) or 0 (channel negative for code).
#'
#' @param filename CSV file from which to import the key.
#' @return List with barcoding key, barcoding channel names, and number of
#' positive channels for each code.
#' @export
debarcoderImportKey <- function(filename) {
  key <- read.csv(filename, stringsAsFactors = FALSE)
  colnames(key)[1] <- tolower(colnames(key)[1])

  channels <- colnames(key)[2:ncol(key)]
  # Alert if degenerate channels exist.
  degenerate_channels <-
    names(which(apply(key[, channels], 2, function(v) !any(v))))
  if (length(degenerate_channels) > 0) {
    warning(paste0(
      "key includes degenerate channels which might impact performance: ",
      paste(degenerate_channels, collapse = ", ")
    ))
  }

  n <- nrow(key)
  channel_data <- key[, channels]

  if (colnames(key)[1] != "code") {
    stop("first column of key should be named \"code\"")
  }

  if (!all(channel_data == 0 | channel_data == 1)) {
    stop("channel values are not 0 or 1")
  }

  if (nrow(unique(channel_data)) != n) {
    stop("barcoding key cannot include duplicate code channel combinations")
  }

  if (length(unique(key$code)) != n) {
    stop("barcoding key cannot include duplicate code names")
  }


  n_pos_channels <- unique(rowSums(channel_data))
  if (length(n_pos_channels) > 1) {
    stop("codes have a variable number of positive channels")
  }

  # Convert channel values to boolean.
  key[, channels] <- channel_data == 1
  # Make sure code is always character.
  key$code <- as.character(key$code)

  list(
    key = key,
    channels = channels,
    degenerate_channels = degenerate_channels,
    n_pos_channels = n_pos_channels
  )
}

#' Prepare FCS data for debarcoding.
#'
#' Transform expression data using cofactor, scale to [0..1] range, match to
#' barcoding channels, and sort each event separately.
#'
#' @param fcs FCS flow frame.
#' @param key Barcoding key data frame.
#' @param cofactor Before scaling, data will be transformed using inverse
#' hyperbolic sin with given cofactor.
#' @return Expression data after transformation, scaling, and matching, and an
#' additional copy after sorting.
#' @export
debarcoderPrepareFcs <- function(fcs, key, cofactor = 10) {
  if (class(fcs) != "flowFrame") stop("fcs is not a flowFrame object")
  .verifyKey(key)

  # Verify FCS has all key channels.
  missing_channels <- setdiff(key$channels, fcs@parameters@data$name)
  if (length(missing_channels) > 0) {
    stop("FCS data is missing the following barcoding channels: ",
         paste(missing_channels, collapse = ", "))
  }

  # Transform and scale.
  exprs <- flowCore::exprs(fcs)
  exprs <- asinh(exprs / cofactor)
  exprs <-
    apply(exprs, 2, function(x) (x - min(x)) / (quantile(x, 0.99) - min(x)))
  # Match FCS channels to barcoding channels.
  exprs <- exprs[, key$channels]
  exprs_sorted <- t(apply(exprs, 1, sort, decreasing = TRUE))

  list(
    exprs = exprs,
    exprs_sorted = exprs_sorted
  )
}


#' Label events.
#'
#' For each event, label it with the barcoding code that matches that event's
#' top N highest intensity channels.
#'
#' @param exprs_list A list with expression and sorted expression matrices.
#' @param unlabeled_label Label for events that don't match any key codes.
#' @inheritParams debarcoderPrepareFcs
#' @return A data frame with a row for every event and a single Label column.
#' @export
debarcoderLabelEvents <- function(exprs_list,
                                  key,
                                  unlabeled_label = "unlabeled") {
  .verifyExprsList(exprs_list)
  .verifyKey(key)

  event_thresh <- exprs_list$exprs_sorted[, key$n_pos_channels]
  exprs_bool <- as.data.frame(exprs_list$exprs >= event_thresh)
  exprs_bool$Index <- seq(nrow(exprs_bool))
  exprs_bool <- merge(exprs_bool, key$key, all.x = TRUE)
  labels <- exprs_bool$code[order(exprs_bool$Index)]
  labels[is.na(labels)] <- unlabeled_label

  data.frame(
    Label = labels,
    stringsAsFactors = FALSE
  )
}

#' Unlabel events.
#'
#' Unlabel events if their Mahalanobis ratio is below the valley in the bimodal
#' Mahalanobis ratio distribution.
#'
#' @param labels Data frame with an event label column.
#' @inheritParams debarcoderLabelEvents
#' @inheritParams debarcode
#' @return A data frame with a row for every event. Columns are label, barcoding
#' separation distance, Mahalanobis ratio, and Mahalanobis distance.
#' @export
debarcoderUnlabelEvents <- function(exprs_list,
                                    labels,
                                    key,
                                    unlabeled_label = "unlabeled") {
  .verifyExprsList(exprs_list)
  if (!is.data.frame(labels)) stop("labels is not a data frame")
  if (is.null(labels$Label)) stop("labels is missing the Label column")
  if (!is.null(labels$MahalRatio)) {
    stop("labels has already been unlabeled")
  }

  # Calculate barcoding separation distance.
  n_pos <- key$n_pos_channels
  bc_separation_dist <-
    with(exprs_list, exprs_sorted[, n_pos] - exprs_sorted[, n_pos + 1])

  # Calculate Mahalanobis ratio.
  codes <- sort(setdiff(labels$Label, unlabeled_label))

  indices <- lapply(codes, function(code) which(labels$Label == code))
  names(indices) <- codes
  centers <- lapply(indices, function(idx) colMeans(exprs_list$exprs[idx, ]))
  covs <- lapply(indices, function(idx) cov(exprs_list$exprs[idx, ]))

  dists <- lapply(codes, function(code) {
    mahalanobis(exprs_list$exprs, centers[[code]], covs[[code]])
  })
  dists <- do.call(cbind, dists)
  dists_sorted <- t(apply(dists, 1, sort))
  mahal_ratio <- dists_sorted[, 2] / dists_sorted[, 1]
  # Ratio of unlabeled events should be set to null.
  mahal_ratio[labels$Label == unlabeled_label] <- NA
  log10_mahal_ratio <- log10(mahal_ratio)

  # Unlabel each code separately.
  for (code in codes) {
    code_indices <- which(labels$Label == code)

    # Find Mahalanobis ratio local minima and unlabel events below that value.
    dens <- density(log10_mahal_ratio[code_indices], n = 128, na.rm = TRUE)
    local_minima_indices <- which(diff(sign(diff(dens$y))) == 2) + 1
    local_minima <- dens$x[local_minima_indices]
    # Threshold is highest local minima which is lower than the median.
    thresh <-
      tail(local_minima[local_minima <
                          median(log10_mahal_ratio, na.rm = TRUE)], 1)
    if (length(thresh) == 0) {
      stop("unable to find Mahalanobis ratio local minima")
    }
    # Unlabel events below threshold.
    unlabel_indices <-
      intersect(code_indices, which(log10_mahal_ratio < thresh))
    labels$Label[unlabel_indices] <- unlabeled_label

    # Find BC separation distance local minima and unlabel events below it.
    dens <- density(bc_separation_dist[code_indices], n = 128, na.rm = TRUE)
    local_minima_indices <- which(diff(sign(diff(dens$y))) == 2) + 1
    local_minima <- dens$x[local_minima_indices]
    thresh <-
      tail(local_minima[local_minima <
                          median(bc_separation_dist, na.rm = TRUE)], 1)
    if (length(thresh) == 0) {
      stop("unable to find barcoding separation distance local minima")
    }
    unlabel_indices <-
      intersect(code_indices, which(bc_separation_dist < thresh))
    labels$Label[unlabel_indices] <- unlabeled_label
  }

  # Add barcoding separation distance, Mahalanobis ratio, and Mahalanobis
  # distance to labels.
  labels$BcSepDist <- bc_separation_dist
  labels$MahalRatio <- mahal_ratio
  labels$MahalDistance <- dists_sorted[, 1]

  labels
}

#' Export debarcoded FCS files.
#'
#' Given an FCS file, export a separate FCS file for each of the event labels
#' (including unlabeled). If any additional parameters were used in the labeling
#' (such as Mahalanobis Ratio) they will be included as columns in the FCS data.
#'
#' @param path_prefix Prefix to path where files will be exported. Each file will
#' be named [prefix_path].[label].fcs.
#' @inheritParams debarcoderPrepareFcs
#' @inheritParams debarcoderUnlabelEvents
#' @export
debarcoderExportDebarcodedFcs <- function(path_prefix, fcs, labels) {
  cols <- setdiff(colnames(labels), "Label")
  codes <- unique(labels$Label)
  orig_pdata <- flowCore::pData(flowCore::parameters(fcs))

  # Export each code in turn.
  for (code in codes) {
    code_indices <- which(labels$Label == code)
    code_fcs <- fcs[code_indices, ]

    # Add any columns that were involved in labeling.
    if (length(cols) > 0) {
      e <- flowCore::exprs(code_fcs)

      for (col in cols) {
        e <- cbind(e, labels[code_indices, col])
        colnames(e)[ncol(e)] <- col
      }

      code_fcs <- flowCore::flowFrame(e, description = code_fcs@description)

      # Do some trickery to make sure that the desc field is correct.
      params <- flowCore::parameters(code_fcs)
      pdata <- flowCore::pData(params)
      pdata$desc <- as.character(pdata$desc)
      pdata$desc[1:nrow(orig_pdata)] <- as.character(orig_pdata$desc)
      flowCore::pData(params) <- pdata
      flowCore::parameters(code_fcs) <- params
    }

    # Update description with correct filename.
    filename <- paste0(basename(path_prefix), ".", code, ".fcs")
    desc <- flowCore::description(code_fcs)
    desc[["$FIL"]] <- filename
    flowCore::description(code_fcs) <- desc

    flowCore::write.FCS(code_fcs, paste0(path_prefix, ".", code, ".fcs"))
  }
}

#' Debarcoder diagnostic plots.
#'
#' Given an FCS file, generate several debarcoder diagnostic plots. These
#' includea separate scatter plot of Mahalanobis ratio versus barcoding
#' separation distance, Mahalanobis ratio distribution, and barcoding separation
#' distance distribution.
#'
#' @param path_prefix Either NULL (do not export figures) or prefix to path
#' where files will be exported.
#' @inheritParams debarcoderPrepareFcs
#' @inheritParams debarcoderUnlabelEvents
#' @return A list of ggplot objects corresponding to the figures.
#' @import ggplot2
#' @export
debarcoderPlots <- function(path_prefix,
                            labels,
                            key,
                            exprs_list,
                            unlabeled_label = "unlabeled") {
  codes <- unique(labels$Label)

  figs <- list()

  if ("MahalRatio" %in% colnames(labels)) {
    xlim <- c(1, max(max(labels$MahalRatio, na.rm = TRUE), 10000))

    # Figures: Mahlanobis ratio versus barcoding separation distance, for each
    # code.
    for (code in codes) {
      code_indices <- which(labels$Label == code)

      figs[[code]] <-
        ggplot(labels[code_indices, ], aes(x = MahalRatio, y = BcSepDist)) +
        geom_point(size = 0, alpha = 0.2) +
        scale_x_log10(limits = xlim) +
        scale_y_continuous(limits = c(0, 1)) +
        labs(title = paste0("Debarcoding diagnostic for ", code),
             x = "log10(Mahalanobis Ratio)",
             y = "Barcoding Separation Distance") +
        theme(aspect.ratio = 1)
    }
    names(figs) <- codes

    # Figures: Mahalanobis ratio and barcoding separation distance
    # distributions.
    figs[["mahalanobis_ratio"]] <-
      ggplot(labels, aes(x = MahalRatio)) +
      geom_density(fill = "grey") +
      scale_x_log10(limits = xlim) +
      labs(title = "Mahalanobis ratio distribution",
           x = "log10(Mahalanobis Ratio)",
           y = "Density")

    figs[["barcoding_separation_distance"]] <-
      ggplot(labels, aes(x = BcSepDist)) +
      geom_density(fill = "grey") +
      scale_x_continuous(limits = c(0, 1)) +
      labs(title = "Barcoding separation distance distribution",
           x = "Barcoding Separation Distance",
           y = "Density")
  }

  # Figures: Channel intensities for each marker pair.
  biaxial_fig_names <- c()
  cells <- dplyr::left_join(labels, key$key, by = c("Label" = "code"))
  cells[is.na(cells)] <- FALSE

  for (ch_x_idx in seq(length(key$channels) - 1)) {
    ch_x <- key$channels[ch_x_idx]

    for (ch_y_idx in (ch_x_idx + 1):length(key$channels)) {
      ch_y <- key$channels[ch_y_idx]

      ch_cells <- cells
      ch_cells$Group <-
        paste0(ch_x, ifelse(ch_cells[[ch_x]], "+", "-"), " ",
               ch_y, ifelse(ch_cells[[ch_y]], "+", "-"))
      ch_cells[[ch_x]] <- exprs_list$exprs[, ch_x]
      ch_cells[[ch_y]] <- exprs_list$exprs[, ch_y]
      ch_cells <- dplyr::filter(ch_cells, Label != unlabeled_label)

      fig_name <- paste0(ch_x, "_vs_", ch_y)
      biaxial_fig_names <- c(biaxial_fig_names, fig_name)
      figs[[fig_name]] <-
        ggplot(ch_cells, aes_string(x = ch_x, y = ch_y)) +
        geom_point(size = 1, alpha = 0.5) +
        theme(aspect.ratio = 1) +
        facet_wrap(~ Group, ncol = 2)
    }
  }

  # Table: Code counts.
  code_counts <- labels %>%
    dplyr::count(Label) %>%
    dplyr::mutate(Freq = n / sum(n))
  figs[["code_counts"]] <- code_counts

  # Export figures as JPGs if path_prefix exists.
  if (!is.null(path_prefix)) {
    path <- paste0(path_prefix, ".debarcoding_figures")
    if (!file.exists(path)) dir.create(path)

    for (code in codes) {
      if (!is.null(figs[[code]])) {
        ggsave(file.path(path,
                         paste0("mahalanobis_versus_barcoding.", code, ".jpg")),
               figs[[code]], width = 4, height = 4)
      }
    }

    for (dist in c("mahalanobis_ratio", "barcoding_separation_distance")) {
      if (!is.null(figs[[dist]])) {
        ggsave(file.path(path, paste0(dist, ".jpg")),
               figs[[dist]], width = 4, height = 3)
      }
    }

    for (fig_name in biaxial_fig_names) {
      if (!is.null(figs[[fig_name]])) {
        ggsave(file.path(path, paste0("biaxial.", fig_name, ".jpg")),
               figs[[fig_name]], width = 8, height = 8)
      }
    }

    write.csv(code_counts,
              file.path(path, "code_counts.csv"),
              row.names = FALSE)
  }

  figs
}

#' Debarcode an FCS file.
#'
#' Import FCS file and barcoding key and run it through the debarcoding
#' workflow: label events using Zunder et al., unlabel events according to
#' barcoding separation distance and Mahalanobis ratio heuristics, and export
#' each label to a separate file along with an accompanying diagnostic plot.
#'
#' Please note that this function ends with a series of warnings due to
#' flowCore's write.FCS.
#'
#' @param fcs_file_path FCS file path.
#' @param key_file_path CSV barcoding key file path.
#' @param export_files If TRUE, the function will export debarcoded FCS files.
#' @param export_figures If TRUE, the function will export diagnostic plots.
#' @param verbose If TRUE, the function will message the console with updates.
#' @return A data frame with a row for every event. Columns are label, barcoding
#' separation distance, and Mahalanobis ratio.
#' @export
debarcode <- function(fcs_file_path,
                      key_file_path,
                      export_files = TRUE,
                      export_figures = TRUE,
                      verbose = TRUE) {
  if (!file.exists(fcs_file_path)) {
    stop(paste0("unable to find ", fcs_file_path))
  }
  if (!file.exists(key_file_path)) {
    stop(paste0("unable to find ", key_file_path))
  }

  key <- debarcoderImportKey(key_file_path)
  if (verbose) message("importing FCS file ...")
  fcs <- flowCore::read.FCS(fcs_file_path)
  if (verbose) message("debarcoding ...")
  exprs_list <- debarcoderPrepareFcs(fcs, key)
  labels <- debarcoderLabelEvents(exprs_list, key)
  if (verbose) message("unlabeling events using heuristics ...")
  labels <- debarcoderUnlabelEvents(exprs_list, labels, key)

  if (export_files) {
    if (verbose) message("exporting debarcoded FCS files ...")
    debarcoderExportDebarcodedFcs(fcs_file_path, fcs, labels)
  }

  if (export_figures) {
    if (verbose) message("exporting plots ...")
    debarcoderPlots(fcs_file_path, labels, key, exprs_list)
  }

  return(labels)
}
dtelad11/cytutils documentation built on Sept. 1, 2022, 2:45 p.m.