knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Often, data is analyzed over a longer period of time, especially in clinical trials. Instead of waiting for all data to start the computational data analysis, one can already analyze a first part of the data and integrate the later data afterwards. A pitfall here is that the later data should not be normalized towards a median over the batches because otherwise the earlier data analysis, clustering for example, needs to be re-done. Rather, we would like to normalize towards the goal distribution of a previous model. In another experiment, it might be the case that manual gating is done on a single batch. In order to apply this gating strategy to a second batch, the second batch needs to be normalized towards the first batch. It is clear that some experimental settings or designs require the normalization algorithm to be flexible, specifically regarding the normalization goal. In most normalization algorithms, this is not the case.
By default, CytoNorm normalizes the data towards the median over the batches, but it is also convenient to specify the goal distribution. It is possible to assign one of the batches as “goal batch”, or a goal distribution from a previous CytoNorm model can be retrieved and passed along in the call when training a new CytoNorm model.
In the Use case 1: Run CytoNorm on a dataset without dedicated controls vignette,the two first cohort batches of our dataset were normalized towards each other.Now, we want to normalize the data of the second cohort (P1_C2 and P2_C2) towards the level of the first.
To do this, we extract the CytoNorm goal quantiles of the model that wasgenerated for the first use case and pass these along while training the CytoNorm model over here. This CytoNorm model will use the same FlowSOM model as the first model and will normalize the same (backbone) markers.
library(CytoNorm) library(flowCore) library(FlowSOM) library(ggpubr)
dir_prepr <- "Data/Preprocessed" # Organize preprocessed data in batches files_P1_C1 <- list.files(path = dir_prepr, pattern = "ID[1-4]_Panel1_TP[1-3].fcs", full.names = TRUE) files_P1_C2 <- list.files(path = dir_prepr, pattern = "ID[5-8]_Panel1_TP[1-3].fcs", full.names = TRUE) files_P2_C1 <- list.files(path = dir_prepr, pattern = "ID[1-4]_Panel2_TP[1-3].fcs", full.names = TRUE) files_P2_C2 <- list.files(path = dir_prepr, pattern = "ID[5-8]_Panel2_TP[1-3].fcs", full.names = TRUE) # Read in 1 file per panel to get channel information ff_P1 <- read.FCS(files_P1_C1[1]) ff_P2 <- read.FCS(files_P2_C1[1]) # Retrieve channel information cellTypeMarkers <- c("CD27", "CD38", "IgD", "IgM") cellTypeChannels <- GetChannels(object = ff_P1, markers = cellTypeMarkers) P1_cellStateMarkers <- c("CD5", "CD268", "CD24", "PDL1", "PD1", "CD86", "KI67") P1_cellStateChannels <- GetChannels(object = ff_P1, markers = P1_cellStateMarkers) P2_cellStateMarkers <- c("CD40", "CD21", "HLA-DR", "ICOSL", "PD1", "CD86", "KI67") P2_cellStateChannels <- GetChannels(object = ff_P2, markers = P2_cellStateMarkers) # Read in manualLabels object that was generated in the CytoNorm_without_controls vignette manualLabels <- readRDS("Data/Preprocessed/attachments/manualLabels.rds") # Normalized data files_P1_C1_norm <- list.files(path = "Data/Normalized_cellType", pattern = "ID[1-4]_Panel1_TP..fcs", full.names = TRUE) files_P2_C1_norm <- list.files(path = "Data/Normalized_cellType", pattern = "ID[1-4]_Panel2_TP..fcs", full.names = TRUE)
previousModel <- readRDS("RDS/model1_withoutControl.rds") goal_q <- getCytoNormQuantiles(previousModel)
model2_towardsDistribution <- CytoNorm.train(files = c(files_P1_C2, files_P2_C2), labels = c(rep(x = "P1",times = length(files_P1_C2)), rep(x = "P2",times = length(files_P2_C2))), channels = cellTypeChannels, transformList = NULL, seed = 1, plot = TRUE, verbose = TRUE, normParams = list("goal" = goal_q)) #normParams = list("goal" = "P1")) #E.g. to normalize towards one of the batches saveRDS(object = model2_towardsDistribution, file = "RDS/model2_towardsDistribution.rds")
The normParams
argument can be used to pass parameters to the normalization method call, QuantileNorm.train
by default. Here, you can specify for example nQ
, the number of quantiles that will be computed and aligned, or goal, which are the goal quantiles. Note that no FlowSOM.params
are specified, since CytoNorm will look in the outputDir
(default = “./tmp”) for a previous FlowSOM object. If this is the case, it will throw a warning saying CytoNorm will reuse the saved FlowSOM result. Here, it will thus reuse the FlowSOM clustering from the previous CytoNorm model (use case 1). If this is not the desired behavior, one can either specify another outputDir
or move the FlowSOM object in the file manager.
CytoNorm.normalize(model = model2_towardsDistribution, files = c(files_P1_C2, files_P2_C2), labels = c(rep(x = "P1",times = length(files_P1_C2)), rep(x = "P2",times = length(files_P2_C2))), transformList = NULL, verbose = TRUE, prefix = "", transformList.reverse = NULL, outputDir = "Data/Normalized_cellType")
# List files files_P1_C2_norm <- list.files(path = "Data/Normalized_cellType", pattern = "ID[5-8]_Panel1_TP[1-3].fcs", full.names = TRUE) files_P2_C2_norm <- list.files(path = "Data/Normalized_cellType", pattern = "ID[5-8]_Panel2_TP[1-3].fcs", full.names = TRUE) # Make plots p <- plotDensities(input = list("P1_C2" = files_P1_C2, "P2_C2" = files_P2_C2, "P1_C2_norm" = files_P1_C2_norm, "P2_C2_norm" = files_P2_C2_norm), channels = cellTypeChannels, colors = c("P1_C1" = "#900002","P1_C2" = "#FF3437", "P2_C1" = "#0046A1","P2_C2" = "#6FCFFF", "C1_goal" = "#7209b7", "Goal distribution" = "#7209b7"), model = model2_towardsDistribution, show_goal = TRUE) # Arrange figure p_ <- ggarrange(ggarrange(plotlist = p[1:length(p)-1], ncol = (length(p)-1)/(2*length(cellTypeChannels)), nrow = 2*length(cellTypeChannels)), widths = c(1,3), p$legend, nrow = 2, heights = c(10,1)) print(p_)
plotSplines(model = model2_towardsDistribution, channels = cellTypeChannels, groupClusters = TRUE)
dir.create("tmp/before") dir.create("tmp/after") aggregates <- list("agg_P1_C2" = files_P1_C2, "agg_P2_C2" = files_P2_C2) for (agg in names(aggregates)){ writeLines(agg) files <- aggregates[[agg]] set.seed(2023) before <- AggregateFlowFrames(fileNames = files, silent = TRUE, cTotal = length(files) * 10000) write.FCS(before, paste0("tmp/before/", agg, ".fcs")) after <- before manual <- c() for (i_file in unique(before@exprs[,"File"])){ ff <- read.FCS(paste0("Data/Normalized_cellType/", basename(aggregates[[agg]][i_file]))) i_IDs <- before@exprs[before@exprs[,"File"] == i_file,"Original_ID2"] after@exprs[before@exprs[,"File"] == i_file,1:(ncol(after@exprs)-3)] <- ff@exprs[i_IDs, ] labels <- as.character(manualLabels[[basename(aggregates[[agg]][i_file])]]) manual <- c(manual, labels[i_IDs]) } manualLabels[[paste0(agg, ".fcs")]] <- factor(manual, levels = levels(manualLabels[[1]])) write.FCS(after, paste0("tmp/after/", agg, ".fcs")) }
cellTypes <- c("unlabeled", "All B cells", "Transitional B cells", "Naive B cells", "IgD+ memory B cells", "IgM+ memory B cells", "Class switched memory B cells", "Double negative B cells") models <- list("Model2_before" = list("files" = c(files_P1_C1_norm, files_P2_C1_norm, "tmp/after/agg_P1_C1.fcs", "tmp/after/agg_P2_C1.fcs", "tmp/before/agg_P1_C2.fcs", "tmp/before/agg_P2_C2.fcs", files_P1_C2, files_P2_C2), "channels" = cellTypeChannels), "Model2_after" = list("files" = c(files_P1_C1_norm, files_P2_C1_norm, "tmp/after/agg_P1_C1.fcs", "tmp/after/agg_P2_C1.fcs", "tmp/after/agg_P1_C2.fcs", "tmp/after/agg_P2_C2.fcs", files_P1_C2_norm, files_P2_C2_norm), "channels" = cellTypeChannels))
EMDs <- list() MADs <- list() for (model in names(models)){ print(model) files <- models[[model]][["files"]] channels <- models[[model]][["channels"]] EMDs[[model]] <- emdEvaluation(files = files, channels = channels, manual = manualLabels, return_all = TRUE, manualThreshold = 20) MADs[[model]] <- CytoNorm:::madEvaluation(files = files[!startsWith(files, "tmp")], channels = channels, return_all = TRUE, manual = manualLabels, transformList = NULL) } EMD_scores <- list() for (subset in names(models)){ dist <- EMDs[[subset]]$distances EMD_scores[[subset]] <- matrix(NA, nrow = length(dist), ncol = length(dist[[1]]), dimnames = list(names(dist), names(dist[[1]]))) for (cellType in names(dist)){ for (marker in names(dist[[cellType]])){ EMD_scores[[subset]][cellType, marker] <- max(dist[[cellType]][[marker]][25:26, 27:28], na.rm = T) } } }
markerlevels <- NULL plotlist <- list() # EMD plot EMD_before <- EMD_scores[["Model2_before"]][-2,] EMD_after <- EMD_scores[["Model2_after"]][-2,] EMD_df <- data.frame("before" = c(EMD_before), "after" = c(EMD_after), "feature" = paste(rownames(EMD_before), rep(colnames(EMD_before), each = nrow(EMD_before)), sep = "_")) if (is.null(markerlevels)){ markerlevels <- unique(unique(sub(".*_", "", EMD_df$feature))) } EMD_df$marker <- factor(sub(".*_", "", EMD_df$feature), levels = markerlevels) EMD_df$celltype <- factor(sub("_.*", "", EMD_df$feature), levels = c("AllCells", cellTypes)) max <- max(EMD_df[,1:2], na.rm = T) p_emd <- ggplot(EMD_df, aes(x = after, y = before)) + xlim(0,max) + ylim(0,max) + geom_point(aes(color = marker, shape = celltype)) + geom_abline() + scale_shape_manual(values = c(16, 17, 15, 3, 7, 8, 6)) + ggtitle("EMD") + theme_minimal() leg_emd <- get_legend(p_emd) p_emd <- p_emd + theme(legend.position = "none") plotlist[[length(plotlist)+1]] <- p_emd names(markerlevels) <- sub(".*<(.*)>.*", "\\1", markerlevels) # MAD plot MAD_before <- MADs[["Model2_before"]][["comparison"]][-2,] MAD_after <- MADs[["Model2_after"]][["comparison"]][-2,] MAD_df <- data.frame("before" = c(MAD_before), "after" = c(MAD_after), "feature" = paste(rownames(MAD_before), rep(colnames(MAD_before), each = nrow(MAD_before)), sep = "_")) MAD_df$marker <- factor(markerlevels[sub(".*_", "", MAD_df$feature)], levels = markerlevels) MAD_df$celltype <- factor(sub("_.*", "", MAD_df$feature), c("AllCells", cellTypes)) max <- max(MAD_df[,1:2], na.rm = T) p_mad <- ggplot(MAD_df, aes(x = after, y = before)) + xlim(0,max) + ylim(0,max) + geom_point(aes(color = marker, shape = celltype)) + geom_abline() + scale_shape_manual(values = c(16, 17, 15, 3, 7, 8, 6)) + ggtitle("MAD") + theme_minimal() leg_mad <- get_legend(p_mad) p_mad <- p_mad + theme(legend.position = "none") plotlist[[length(plotlist)+1]] <- p_mad ggarrange(ggarrange(plotlist = plotlist), as_ggplot(leg_emd), ncol=2, widths = c(4,1.5))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.