knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

In this vignette, you learn how CytoNorm can be used to normalize a dataset towards a specific distribution.

Often, data is analyzed over a longer period of time, especially in clinical trials. Instead of waiting for all data to start the computational data analysis, one can already analyze a first part of the data and integrate the later data afterwards. A pitfall here is that the later data should not be normalized towards a median over the batches because otherwise the earlier data analysis, clustering for example, needs to be re-done. Rather, we would like to normalize towards the goal distribution of a previous model. In another experiment, it might be the case that manual gating is done on a single batch. In order to apply this gating strategy to a second batch, the second batch needs to be normalized towards the first batch. It is clear that some experimental settings or designs require the normalization algorithm to be flexible, specifically regarding the normalization goal. In most normalization algorithms, this is not the case.

By default, CytoNorm normalizes the data towards the median over the batches, but it is also convenient to specify the goal distribution. It is possible to assign one of the batches as “goal batch”, or a goal distribution from a previous CytoNorm model can be retrieved and passed along in the call when training a new CytoNorm model.

Prepare the CytoNorm normalization step

In the Use case 1: Run CytoNorm on a dataset without dedicated controls vignette,the two first cohort batches of our dataset were normalized towards each other.Now, we want to normalize the data of the second cohort (P1_C2 and P2_C2) towards the level of the first.

To do this, we extract the CytoNorm goal quantiles of the model that wasgenerated for the first use case and pass these along while training the CytoNorm model over here. This CytoNorm model will use the same FlowSOM model as the first model and will normalize the same (backbone) markers.

library(CytoNorm)
library(flowCore)
library(FlowSOM)
library(ggpubr)
dir_prepr <- "Data/Preprocessed"

# Organize preprocessed data in batches
files_P1_C1 <- list.files(path = dir_prepr,
                          pattern = "ID[1-4]_Panel1_TP[1-3].fcs",
                          full.names = TRUE)
files_P1_C2 <- list.files(path = dir_prepr,
                          pattern = "ID[5-8]_Panel1_TP[1-3].fcs",
                          full.names = TRUE)
files_P2_C1 <- list.files(path = dir_prepr,
                          pattern = "ID[1-4]_Panel2_TP[1-3].fcs",
                          full.names = TRUE)
files_P2_C2 <- list.files(path = dir_prepr,
                          pattern = "ID[5-8]_Panel2_TP[1-3].fcs",
                          full.names = TRUE)

# Read in 1 file per panel to get channel information
ff_P1 <- read.FCS(files_P1_C1[1])
ff_P2 <- read.FCS(files_P2_C1[1])

# Retrieve channel information
cellTypeMarkers <- c("CD27", "CD38", "IgD", "IgM") 
cellTypeChannels <- GetChannels(object = ff_P1, markers = cellTypeMarkers)
P1_cellStateMarkers <- c("CD5", "CD268", "CD24", "PDL1", "PD1", "CD86", "KI67")
P1_cellStateChannels <- GetChannels(object = ff_P1, markers = P1_cellStateMarkers)
P2_cellStateMarkers <- c("CD40", "CD21", "HLA-DR", "ICOSL", "PD1", "CD86", "KI67")
P2_cellStateChannels <- GetChannels(object = ff_P2, markers = P2_cellStateMarkers)

# Read in manualLabels object that was generated in the CytoNorm_without_controls vignette
manualLabels <- readRDS("Data/Preprocessed/attachments/manualLabels.rds")

# Normalized data
files_P1_C1_norm <- list.files(path = "Data/Normalized_cellType", 
                               pattern = "ID[1-4]_Panel1_TP..fcs", full.names = TRUE)
files_P2_C1_norm <- list.files(path = "Data/Normalized_cellType", 
                               pattern = "ID[1-4]_Panel2_TP..fcs", full.names = TRUE)

Train the CytoNorm model

Extract the goal distribution

previousModel <- readRDS("RDS/model1_withoutControl.rds")
goal_q <- getCytoNormQuantiles(previousModel)

Train the CytoNorm model

model2_towardsDistribution <- CytoNorm.train(files = c(files_P1_C2, files_P2_C2),
                                             labels = c(rep(x = "P1",times = length(files_P1_C2)),
                                                        rep(x = "P2",times = length(files_P2_C2))),
                                             channels = cellTypeChannels,
                                             transformList = NULL,
                                             seed = 1,
                                             plot = TRUE,
                                             verbose = TRUE,
                                             normParams = list("goal" = goal_q))
                                             #normParams = list("goal" = "P1")) #E.g. to normalize towards one of the batches
saveRDS(object = model2_towardsDistribution, file = "RDS/model2_towardsDistribution.rds")

The normParams argument can be used to pass parameters to the normalization method call, QuantileNorm.train by default. Here, you can specify for example nQ, the number of quantiles that will be computed and aligned, or goal, which are the goal quantiles. Note that no FlowSOM.params are specified, since CytoNorm will look in the outputDir (default = “./tmp”) for a previous FlowSOM object. If this is the case, it will throw a warning saying CytoNorm will reuse the saved FlowSOM result. Here, it will thus reuse the FlowSOM clustering from the previous CytoNorm model (use case 1). If this is not the desired behavior, one can either specify another outputDir or move the FlowSOM object in the file manager.

Apply the CytoNorm model

CytoNorm.normalize(model = model2_towardsDistribution,
                   files = c(files_P1_C2, files_P2_C2),
                   labels = c(rep(x = "P1",times = length(files_P1_C2)),
                              rep(x = "P2",times = length(files_P2_C2))),
                   transformList = NULL,
                   verbose = TRUE,
                   prefix = "",
                   transformList.reverse = NULL, 
                   outputDir = "Data/Normalized_cellType")

Evauate the quality

Density plots

# List files
files_P1_C2_norm <- list.files(path = "Data/Normalized_cellType", 
                               pattern = "ID[5-8]_Panel1_TP[1-3].fcs", full.names = TRUE)
files_P2_C2_norm <- list.files(path = "Data/Normalized_cellType", 
                               pattern = "ID[5-8]_Panel2_TP[1-3].fcs", full.names = TRUE)

# Make plots
p <- plotDensities(input = list("P1_C2" = files_P1_C2,
                                "P2_C2" = files_P2_C2,
                                "P1_C2_norm" = files_P1_C2_norm,
                                "P2_C2_norm" = files_P2_C2_norm),
                   channels = cellTypeChannels,
                   colors = c("P1_C1" = "#900002","P1_C2" = "#FF3437",
                              "P2_C1" = "#0046A1","P2_C2" = "#6FCFFF",
                              "C1_goal" = "#7209b7", "Goal distribution" = "#7209b7"),
                   model = model2_towardsDistribution,
                   show_goal = TRUE)

# Arrange figure
p_ <- ggarrange(ggarrange(plotlist = p[1:length(p)-1], 
                          ncol = (length(p)-1)/(2*length(cellTypeChannels)), 
                          nrow = 2*length(cellTypeChannels)),
                widths = c(1,3),
                p$legend, nrow = 2, heights = c(10,1))
print(p_)

Spline plots

plotSplines(model = model2_towardsDistribution, channels = cellTypeChannels, groupClusters = TRUE)

EMD and MAD

Make aggregates and get aggregate manual labels

dir.create("tmp/before")
dir.create("tmp/after")

aggregates <- list("agg_P1_C2" = files_P1_C2,
                   "agg_P2_C2" = files_P2_C2)

for (agg in names(aggregates)){
  writeLines(agg)
  files <- aggregates[[agg]]
  set.seed(2023)
  before <- AggregateFlowFrames(fileNames = files, silent = TRUE,
                                cTotal = length(files) * 10000)

  write.FCS(before, paste0("tmp/before/", agg, ".fcs"))  
  after <- before

  manual <- c()
  for (i_file in unique(before@exprs[,"File"])){
    ff <- read.FCS(paste0("Data/Normalized_cellType/", basename(aggregates[[agg]][i_file])))
    i_IDs <- before@exprs[before@exprs[,"File"] == i_file,"Original_ID2"]
    after@exprs[before@exprs[,"File"] == i_file,1:(ncol(after@exprs)-3)] <- ff@exprs[i_IDs, ]
    labels <- as.character(manualLabels[[basename(aggregates[[agg]][i_file])]])
    manual <- c(manual, labels[i_IDs])
  }    

  manualLabels[[paste0(agg, ".fcs")]] <- factor(manual, levels = levels(manualLabels[[1]]))
  write.FCS(after, paste0("tmp/after/", agg, ".fcs"))
}

Define cell types and files for model evaluation

cellTypes <- c("unlabeled",
               "All B cells",
               "Transitional B cells",
               "Naive B cells",
               "IgD+ memory B cells",
               "IgM+ memory B cells", 
               "Class switched memory B cells", 
               "Double negative B cells")

models <- list("Model2_before" = list("files" = c(files_P1_C1_norm, files_P2_C1_norm, 
                                                   "tmp/after/agg_P1_C1.fcs", "tmp/after/agg_P2_C1.fcs",
                                                   "tmp/before/agg_P1_C2.fcs", "tmp/before/agg_P2_C2.fcs", 
                                                   files_P1_C2, files_P2_C2),
                                       "channels" = cellTypeChannels),
                "Model2_after" = list("files" = c(files_P1_C1_norm, files_P2_C1_norm, 
                                                  "tmp/after/agg_P1_C1.fcs", "tmp/after/agg_P2_C1.fcs",
                                                  "tmp/after/agg_P1_C2.fcs", "tmp/after/agg_P2_C2.fcs", 
                                                  files_P1_C2_norm, files_P2_C2_norm),
                                       "channels" = cellTypeChannels))

Calculate EMD and MAD

EMDs <- list()
MADs <- list()

for (model in names(models)){
  print(model)
  files <- models[[model]][["files"]]
  channels <- models[[model]][["channels"]]

  EMDs[[model]] <- emdEvaluation(files = files,
                                 channels = channels,
                                 manual = manualLabels, return_all = TRUE, 
                                 manualThreshold = 20)
  MADs[[model]] <- CytoNorm:::madEvaluation(files = files[!startsWith(files, "tmp")], 
                                 channels = channels, return_all = TRUE,
                                 manual = manualLabels, transformList = NULL)
}

EMD_scores <- list()
for (subset in names(models)){
  dist <- EMDs[[subset]]$distances
  EMD_scores[[subset]] <- matrix(NA,
                                 nrow = length(dist), ncol = length(dist[[1]]),
                                 dimnames = list(names(dist),
                                                 names(dist[[1]])))
  for (cellType in names(dist)){
      for (marker in names(dist[[cellType]])){
        EMD_scores[[subset]][cellType, marker] <- max(dist[[cellType]][[marker]][25:26, 27:28], na.rm = T)
      }
  }
}

Make figures

markerlevels <- NULL
plotlist <- list()

# EMD plot
EMD_before <- EMD_scores[["Model2_before"]][-2,]
EMD_after <- EMD_scores[["Model2_after"]][-2,]

EMD_df <- data.frame("before" = c(EMD_before),
                     "after" = c(EMD_after),
                     "feature" = paste(rownames(EMD_before), 
                                       rep(colnames(EMD_before), 
                                           each = nrow(EMD_before)), sep = "_"))
if (is.null(markerlevels)){
  markerlevels <- unique(unique(sub(".*_", "", EMD_df$feature)))
}
EMD_df$marker <- factor(sub(".*_", "", EMD_df$feature), levels = markerlevels)
EMD_df$celltype <- factor(sub("_.*", "", EMD_df$feature), levels = c("AllCells", cellTypes))
max <- max(EMD_df[,1:2], na.rm = T)
p_emd <- ggplot(EMD_df, aes(x = after, y = before)) +
  xlim(0,max) + ylim(0,max) +
  geom_point(aes(color = marker, shape = celltype)) +
  geom_abline() +
  scale_shape_manual(values = c(16, 17, 15, 3, 7, 8, 6)) +
  ggtitle("EMD") +
  theme_minimal()

leg_emd <- get_legend(p_emd)

p_emd <- p_emd  +
    theme(legend.position = "none")
plotlist[[length(plotlist)+1]] <- p_emd    

names(markerlevels) <- sub(".*<(.*)>.*", "\\1", markerlevels)

# MAD plot
MAD_before <- MADs[["Model2_before"]][["comparison"]][-2,]
MAD_after <- MADs[["Model2_after"]][["comparison"]][-2,]
MAD_df <- data.frame("before" = c(MAD_before),
                     "after" = c(MAD_after),
                     "feature" = paste(rownames(MAD_before), 
                                       rep(colnames(MAD_before), 
                                           each = nrow(MAD_before)), sep = "_"))
MAD_df$marker <- factor(markerlevels[sub(".*_", "", MAD_df$feature)], levels = markerlevels)
MAD_df$celltype <- factor(sub("_.*", "", MAD_df$feature), c("AllCells", cellTypes))
max <- max(MAD_df[,1:2], na.rm = T)
p_mad <- ggplot(MAD_df, aes(x = after, y = before)) +
  xlim(0,max) + ylim(0,max) +
  geom_point(aes(color = marker, shape = celltype)) +
  geom_abline() +
  scale_shape_manual(values = c(16, 17, 15, 3, 7, 8, 6)) +
  ggtitle("MAD") +
  theme_minimal()

leg_mad <- get_legend(p_mad)

p_mad <- p_mad  +
    theme(legend.position = "none")

plotlist[[length(plotlist)+1]] <- p_mad

ggarrange(ggarrange(plotlist = plotlist),
          as_ggplot(leg_emd), ncol=2, widths = c(4,1.5))


saeyslab/CytoNorm documentation built on Nov. 2, 2024, 12:39 p.m.