R/main.R
In Cleanet: Automated Doublet Detection and Classification for Cytometry Data

Documented in classify_doublets cleanet compare_doublets_exp_obs filter_debris_cytof filter_debris_flow

#' @title Detect doublets in a single cytometry sample
#' @description Augments data with simulated doublets, computes
#' nearest neighbors for augmented dataset, identifies doublets as those
#' events with a high share of simulated doublets among nearest neighbors.
#' @param df A data frame containing protein expression data.
#' @param cols Columns to use in analysis.
#' @param cofactor Parameter of arcsinh transformation, applied before
#' computing nearest neighbors. Recommended values are 5 for mass cytometry
#' and 500-1000 for flow cytometry.
#' @param thresh Among the 15 nearest neighbors, how many should be simulated
#' doublets in order for the event to be classified as doublet?
#' @param is_debris Optional, binary array with length matching
#' the number of rows in df. TRUE for debris events, FALSE for everything else.
#' This package includes helper functions to compute this for flow or
#' mass cytometry data.
#' @returns A list with multiple elements, among them the singlet/doublet
#' status of each event.
#' @examples
#' path <- system.file("extdata", "df_mdipa.csv", package="Cleanet")
#' df_mdipa <- read.csv(path, check.names=FALSE)
#' cols <- c("CD45", "CD123", "CD19", "CD11c", "CD16",
#'           "CD56", "CD294", "CD14", "CD3", "CD20",
#'           "CD66b", "CD38", "HLA-DR", "CD45RA",
#'           "DNA1", "DNA2")
#' cleanet_res <- cleanet(df_mdipa, cols, cofactor=5)
#' @export
cleanet <- function(df, cols, cofactor,
                    thresh=5, is_debris=NULL) {
  message("Starting Cleanet...")

  if (!is.null(is_debris)) {
    if (length(is_debris) != nrow(df))
      stop("Length mismatch between debris array and data frame!")

    idx_keep <- which(!is_debris)
    n_keep <- length(idx_keep)
    perc_keep <- round(100*n_keep/nrow(df),1)
    message(paste0("Doublet simulation using ", n_keep, " events, ", perc_keep, "% of total..."))
  } else {
    idx_keep <- seq(nrow(df))
    message(paste0("Doublet simulation using all ", nrow(df),  " events..."))
  }

  n_doub <- floor(length(idx_keep)/2)
  sel1 <- sample(idx_keep, n_doub)
  sel2 <- sample(idx_keep, n_doub)
  x_aug <- augment_data(df, cols, sel1, sel2, cofactor)

  all_knn <- hnsw_knn(x_aug, k=15, distance= 'l2', n_threads=1, M=48)
  n_nb_doub <- apply(all_knn$idx, 1, function(row) length(which(row<n_doub)))
  status <- rep("Filtered", nrow(df))
  status <- if_else(n_nb_doub[-seq(n_doub)] > thresh, "Doublet", "Singlet")

  n_found <- length(which(status=="Doublet"))
  perc_doublets <- round(100*n_found/nrow(df),1)
  message(paste0("Done! Found ", n_found, " doublets, ", perc_doublets, "% of total."))

  return(list(n_nb_doub = n_nb_doub[-seq(n_doub)],
              status = status,
              nbrs = all_knn$idx[-seq(n_doub),],
              sel1 = sel1,
              sel2 = sel2,
              n_doub = n_doub,
              thresh = thresh,
              n_nb_sim = n_nb_doub[seq(n_doub)],
              sensitivity = length(which(n_nb_doub[seq(n_doub)] > 5))/n_doub,
              is_debris=is_debris))
}


augment_data <- function(df, cols, sel1, sel2, cofactor) {
  x0 <- as.matrix(df[,cols])
  x_doub <- x0[sel1,] + x0[sel2,]
  x_aug <- rbind(x_doub, x0)
  x_aug <- asinh(x_aug/cofactor)
  return(x_aug)
}


#' @title Flag debris in mass cytometry data.
#' @description Detect events with low distance from 0 in protein space.
#' This function aims for high specificity, but not high sensitivity: for
#' Cleanet's purposes, it suffices to deplete debris, even if not all of
#' it is eliminated.
#' @param df A data frame containing protein expression data.
#' @param cols Columns to use in analysis. It is recommended to use the same
#' ones in the call to cleanet.
#' @param cols_plot Two columns that are used for visual feedback.
#' @param cofactor Parameter for arcsinh transformation used before computing
#' distances. 5 is a good default for mass cytometry data.
#' @param threshold Number between 0 and 1; distances are scaled between
#' 0 and 1 and events whose distance to the origin is smaller than the
#' threshold are flagged.
#' @returns A binary array with the same length as the number of rows in df.
#' TRUE for debris, FALSE for everything else.
#' @examples
#' path <- system.file("extdata", "df_mdipa.csv", package="Cleanet")
#' df_mdipa <- read.csv(path, check.names=FALSE)
#' cols <- c("CD45", "CD123", "CD19", "CD11c", "CD16",
#'           "CD56", "CD294", "CD14", "CD3", "CD20",
#'           "CD66b", "CD38", "HLA-DR", "CD45RA",
#'           "DNA1", "DNA2")
#' is_debris <- filter_debris_cytof(df_mdipa, cols)
#' @export
filter_debris_cytof <- function(df, cols, cols_plot=c("DNA1", "CD45"),
                                cofactor=5, threshold=0.3) {
  x0 <- asinh(as.matrix(df[cols])/cofactor)
  ncol <- length(cols)

  norm <- sqrt(apply(apply(x0, 2, function(col) col/max(col))^2, 1, sum))/sqrt(ncol)

  sub <- sample(nrow(df), min(nrow(df),1e4))
  df_sub <- df[sub,] %>%
    mutate(Debris = norm[sub]<threshold) %>%
    mutate(across(cols_plot, ~asinh(.x/cofactor)))
  p <- ggplot(df_sub, aes(x=.data[[cols_plot[1]]],
                          y=.data[[cols_plot[2]]],
                          color=.data[["Debris"]])) +
    geom_point(size=0.5) +
    theme_bw()
  show(p)

  return(norm < threshold)
}


#' @title Flag debris in flow cytometry data.
#' @description Detect events in the lower left corner of FSC-A/SSC-A plots.
#' This function aims for high specificity, but not high sensitivity: for
#' Cleanet's purposes, it suffices to deplete debris, even if not all of
#' it is eliminated.
#' @param df A data frame containing scattering channels and
#' protein expression data.
#' @param sum_max Numeric; events whose sum of FSC-A and SSC-A is smaller than
#' this value are flagged.
#' @param cols Names of columns to use. This function is intended for use
#' with the area channel of forward and side scatter.
#' @returns A binary array with the same length as the number of rows in df.
#' TRUE for debris, FALSE for everything else.
#' @export
filter_debris_flow <- function(df, sum_max=5e4, cols=c("FSC-A", "SSC-A")) {
  is_debris <- df[[cols[1]]] + df[[cols[2]]] < sum_max &
    df[[cols[1]]] > 0 & df[[cols[2]]] > 0

  pal <- setNames(c("#66A61E", "#666666"), c("FALSE", "TRUE"))

  sub <- sample(nrow(df), min(nrow(df),1e4))
  df_sub <- df[sub,] %>%
    mutate(Debris = is_debris[sub])
  p <- ggplot(df_sub, aes(x=.data[[cols[1]]], y=.data[[cols[2]]], color=.data[["Debris"]])) +
    geom_point(size=0.5) +
    scale_color_manual(values=pal) +
    guides(color=guide_legend(override.aes = list(size=4))) +
    theme_bw(base_size=16) +
    theme(legend.position = "top", legend.box.spacing = margin(0))
  show(p)

  return(is_debris)
}


#' @title Classify doublets (or multiplets) based on component singlets.
#' @description Extends a classification of singlets into a classification
#' of doublets.
#' @param cleanet_res The output of a call to the cleanet function.
#' @param singlet_clas An array giving a classification of the singlets, whose
#' length must match the number of singlet events returned in cleanet_res.
#' @param max_multi The highest cardinality of a multiplet to be considered.
#' @returns An array with the same length as the number of doublets found
#' in cleanet_res, specifying the composition of each doublet.
#' @examples
#' path <- system.file("extdata", "df_mdipa.csv", package="Cleanet")
#' df_mdipa <- read.csv(path, check.names=FALSE)
#' cols <- c("CD45", "CD123", "CD19", "CD11c", "CD16",
#'           "CD56", "CD294", "CD14", "CD3", "CD20",
#'           "CD66b", "CD38", "HLA-DR", "CD45RA",
#'           "DNA1", "DNA2")
#' cleanet_res <- cleanet(df_mdipa, cols, cofactor=5)
#' singlet_clas <- df_mdipa$label[which(cleanet_res$status!="Doublet")]
#' doublet_clas <- classify_doublets(cleanet_res, singlet_clas)
#' @export
classify_doublets <- function(cleanet_res, singlet_clas, max_multi=4) {

  sel1 <- cleanet_res[["sel1"]]
  sel2 <- cleanet_res[["sel2"]]
  nbrs <- cleanet_res[["nbrs"]]
  n_doub <- cleanet_res[["n_doub"]]
  thresh <- cleanet_res[["thresh"]]
  n_nb_doub <- cleanet_res[["n_nb_doub"]]

  lab_with_doub <- rep("Doublet", nrow(nbrs))
  lab_with_doub[which(cleanet_res$status!="Doublet")] <- singlet_clas

  for (iter in seq(max_multi)) {
    sim_lab <- if_else(lab_with_doub[sel1] < lab_with_doub[sel2],
                       paste0(lab_with_doub[sel1], "_", lab_with_doub[sel2]),
                       paste0(lab_with_doub[sel2], "_", lab_with_doub[sel1]))
    sim_lab_fac <- as.factor(sim_lab)

    doub_c <- doub_lab_c(nbrs, sim_lab_fac, n_doub, length(levels(sim_lab_fac)))
    doub_lab <- c("",levels(sim_lab_fac))[doub_c+1]

    to_replace <- which(lab_with_doub=="Doublet" & !grepl("Doublet", doub_lab))
    lab_with_doub[to_replace] <- doub_lab[to_replace]
  }

  return(lab_with_doub[which(n_nb_doub>thresh)])
}


#' @title Tabulate expected and observed proportions of doublet types.
#' @description Given compatible classifications of singlets and doublets,
#' this function computes expected proportions of doublets as the product
#' of the proportions of their components.
#' @param cleanet_res The output of a call to the cleanet function.
#' @param singlet_clas An array giving a classification of the singlets, whose
#' length must match the number of singlet events returned in cleanet_res.
#' @param doublet_clas An array giving a classification of the doublets, whose
#' length must match the number of doublet events returned in cleanet_res.
#' @returns A data frame tabulating expected and observed proportions for
#' each unique doublet type.
#' @examples
#' path <- system.file("extdata", "df_mdipa.csv", package="Cleanet")
#' df_mdipa <- read.csv(path, check.names=FALSE)
#' cols <- c("CD45", "CD123", "CD19", "CD11c", "CD16",
#'           "CD56", "CD294", "CD14", "CD3", "CD20",
#'           "CD66b", "CD38", "HLA-DR", "CD45RA",
#'           "DNA1", "DNA2")
#' cleanet_res <- cleanet(df_mdipa, cols, cofactor=5)
#' singlet_clas <- df_mdipa$label[which(cleanet_res$status!="Doublet")]
#' doublet_clas <- classify_doublets(cleanet_res, singlet_clas)
#' df_exp_obs <- compare_doublets_exp_obs(doublet_clas, singlet_clas, cleanet_res)
#' @export
compare_doublets_exp_obs <- function(doublet_clas, singlet_clas, cleanet_res) {
  if (is.null(cleanet_res$is_debris)) {
    tab_sing <- table(singlet_clas)
  } else {
    tab_sing <- table(singlet_clas[which(!cleanet_res$is_debris[which(cleanet_res$status!="Doublet")])])
  }

  out <- outer(tab_sing, tab_sing) / (sum(tab_sing))^2
  df_exp <- melt(out)
  names(df_exp) <- c("Var1", "Var2", "value")

  df_exp <- df_exp %>%
    as_tibble() %>%
    filter(as.character(.data[["Var1"]]) <= as.character(.data[["Var2"]])) %>%
    mutate(value = if_else(.data[["Var1"]] == .data[["Var2"]],
                           .data[["value"]],
                           2*.data[["value"]])) %>%
    mutate(Type = if_else(as.character(.data[["Var1"]]) < as.character(.data[["Var2"]]),
                          paste0(.data[["Var1"]], "_", .data[["Var2"]]),
                          paste0(.data[["Var2"]], "_", .data[["Var1"]])),
           Expected = .data[["value"]],
           .keep="unused")

  tab_doub <- table(doublet_clas)
  df_obs <- tibble(Type = names(tab_doub),
                   Observed = as.numeric(tab_doub) / length(doublet_clas))

  df_exp_obs <- full_join(df_exp, df_obs) %>%
    replace_na(list("Observed"=0, "Expected"=0))
  return(df_exp_obs)
}

Any scripts or data that you put into this service are public.

Cleanet documentation built on April 4, 2025, 12:17 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Cleanet
Automated Doublet Detection and Classification for Cytometry Data

R/main.R
In Cleanet: Automated Doublet Detection and Classification for Cytometry Data

Defines functions compare_doublets_exp_obs classify_doublets filter_debris_flow filter_debris_cytof augment_data cleanet

Documented in classify_doublets cleanet compare_doublets_exp_obs filter_debris_cytof filter_debris_flow

Try the Cleanet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Cleanet Automated Doublet Detection and Classification for Cytometry Data

R/main.R In Cleanet: Automated Doublet Detection and Classification for Cytometry Data

Defines functions compare_doublets_exp_obs classify_doublets filter_debris_flow filter_debris_cytof augment_data cleanet

Documented in classify_doublets cleanet compare_doublets_exp_obs filter_debris_cytof filter_debris_flow

Try the Cleanet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Cleanet
Automated Doublet Detection and Classification for Cytometry Data

R/main.R
In Cleanet: Automated Doublet Detection and Classification for Cytometry Data