Load libraries
suppressPackageStartupMessages({ library(ggplot2) # For nice plots library(pheatmap) # For pretty heatmap plots library(readxl) # For reading excel files library(flowCore) # For basic cytometry operations library(flowDensity) # For density visualizations and density based gating library(Rtsne) # For tSNE approximations library(flowAI) # For quality control over time library(FlowSOM) # For FlowSOM library(FlowSOMworkshop) # Some extra functions to make life easier in this workshop library(flowStats) # For normalization library(CytoNorm) # For normalization library(PeacoQC) # For QC over time, replacement for flowAI })
Identify the fcs files of interest on the system
dir <- system.file("extdata", package="FlowSOMworkshop") files <- list.files(dir, pattern = "Live.fcs") files
Load an fcs file into a flowFrame
ff <- read.FCS(file.path(dir, files[4])) ff
Check the expression matrix stored in the flowframe
head(ff@exprs)
Check the metadata stored in the flowframe
head(ff@description)
Plot two markers against each other in the traditional dot plot
plotDens(ff, get_channels(ff, c("CD3", "CD19")))
Compensate the data with the spillover matrix acquired at the machine and transform with the logicle transformation
comp <- ff@description$`$SPILLOVER` comp
ff <- compensate(ff, comp) ff <- transform(ff, estimateLogicle(ff, colnames(ff)[7:19])) plotDens(ff, get_channels(ff, c("CD3", "CD19")))
Store the markers of interest for later use
channels_of_interest <- colnames(ff)[c(7, 9:13, 15:19)] print(get_markers(ff, channels_of_interest))
How many events are measured in Tube 30?
Plot CD64 against FcERI for Tube 30
Provide a directory for the quality control results
QC_dir <- "Workshop_QC" preprocessed_dir <- "Workshop_preprocessed" if(!dir.exists(QC_dir)){ dir.create(QC_dir) dir.create(preprocessed_dir) dir.create(file.path(QC_dir, "flowAI")) dir.create(file.path(QC_dir, "PeacoQC")) }
Preprocess the samples
cellCount <- rep(NA, length(files)) names(cellCount) <- files for (file in files){ # Read the FCS file ff <- read.FCS(file.path(dir, file)) cellCount[file] <- nrow(ff) # Compensate with the acquisition defined matrix # Alternatively, export the adapted matrix from FlowJo into a csv and read that in R as a matrix ff <- compensate(ff, ff@description$`$SPILLOVER`) # Transform using a logicle transform on all fluorochrome channels ff <- transform(ff, estimateLogicle(ff, colnames(ff)[7:19])) # Adaptation to the time because we started from FlowJO exported files, # which saves the time in seconds instead of hundredths of seconds ff@exprs[,"Time"] <- ff@exprs[,"Time"]*100 # head(ff@exprs[,"Time"]) # tail(ff@exprs[,"Time"]) # ff@description$`$BTIM` # ff@description$`$ETIM` # Run flowAI on the samples resQC <- flow_auto_qc(fcsfiles = ff, folder_results = file.path(QC_dir, "flowAI"), output = 1) # Run PeacoQC, the QC algorithm which Annelies is currently developping resQC <- PeacoQC(ff = ff, determine_good_cells = "all", channels = c(1,4,7:19), plot = TRUE, output_folder = file.path(QC_dir, "PeacoQC"), compensation_matrix = ff@description$`$SPILLOVER`) ff <- ff[resQC$GoodCells,] saveRDS(resQC, file.path(QC_dir, "PeacoQC", gsub(".fcs", "_QC.RDS", file))) write.FCS(ff, file.path(preprocessed_dir, file)) }
ggplot(data.frame(file = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(cellCount)), count = cellCount)) + geom_bar(aes(x = file, y = count), stat = "identity") + theme_minimal() + coord_flip()
Make an aggregate file to combine all files
agg_dir <- "Workshop_aggregate" if(!dir.exists(agg_dir)) dir.create(agg_dir) set.seed(1) ff_agg <- AggregateFlowFrames(file.path(preprocessed_dir, files), cTotal = 3000000, writeMeta = TRUE, outputFile = file.path(agg_dir, "aggregate.fcs")) table(ff_agg@exprs[,"File"])
Plot a comparison of the expression between all files
plot_aggregate(input = ff_agg, labels = c("1","1","1","2","2","2"), sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", files), channels = channels_of_interest, output_image = file.path(QC_dir, "aggregate.png"))
Create dir for results, load the problematic data and extract dates
batch_dir <- "Workshop_batch" if(!dir.exists(batch_dir)) dir.create(batch_dir) files <- list.files(preprocessed_dir, pattern = "Live.fcs") files_batch2 <- list.files(preprocessed_dir, pattern = "Batch.fcs") # already preprocessed and QC filtered
Comparison plot between all the files
fs_batch <- read.flowSet(file.path(preprocessed_dir, c(files, files_batch2))) # read in files in order of the date plot_aggregate(input = fs_batch, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_batch@frames)), output_image = file.path(batch_dir, "aggregate.png"))
We have 2 batches, normalization is needed
Min max percentile normalization (alignment of 1st and 99th percentile)
fs_minmax <- fsApply(fs_batch, function(ff){ for (col in channels_of_interest){ c <- ff@exprs[,col] c <- (c - quantile(c, 0.01)) / (quantile(c, 0.99) - quantile(c, 0.01)) ff@exprs[,col] <- c } ff }) plot_aggregate(input = fs_minmax, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_minmax@frames)), output_image = file.path(batch_dir, "aggregate_minmax_perFile.png"))
gaussNorm normalization (aligment of peaks in the density distribution)
fs_gaussNorm <- gaussNorm(flowset = fs_batch, channel.names = channels_of_interest, peak.density.thr=0.01) plot_aggregate(input = fs_gaussNorm[[1]], labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_gaussNorm[[1]]@frames)), output_image = file.path(batch_dir, "aggregate_gaussNorm_perFile.png"))
CytoNorm (first FlowSOM clustering, then quantile normalization on metacluster level)
## Train model (ideally trained on control/reference samples) model <- CytoNorm.train(files = file.path(preprocessed_dir, c(files, files_batch2)), labels = 1:12, # 1 label per file channels = channels_of_interest, transformList = NULL, FlowSOM.params = list(xdim = 10, ydim = 10, nClus = 10, nCells = 1000000, scale = TRUE), normMethod.train = QuantileNorm.train, plot = T, seed = 1, outputDir = file.path(batch_dir, "CytoNorm_tmp")) ## Apply model CytoNorm.normalize(model = model, files = file.path(preprocessed_dir, c(files, files_batch2)), labels = 1:12, transformList = NULL, transformList.reverse = NULL, outputDir = file.path(batch_dir, "CytoNorm"), normMethod.normalize = QuantileNorm.normalize) fs_CytoNorm <- read.flowSet(list.files("Workshop_batch/CytoNorm", pattern = ".fcs", full.names = T)) plot_aggregate(input = fs_CytoNorm, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_minmax@frames)), output_image = file.path(batch_dir, "aggregate_CytoNorm_perFile.png"))
Create a flow frame per batch
batch1_ff <- AggregateFlowFrames(file.path(preprocessed_dir, files), cTotal = max(fsApply(fs_batch, nrow)) * 6, keepOrder = TRUE) batch2_ff <- AggregateFlowFrames(file.path(preprocessed_dir, files_batch2), cTotal = max(fsApply(fs_batch, nrow)) * 6, keepOrder = TRUE) splitFlowframe <- function(ff, labels = exprs(ff)[,"File"]){ lapply(unique(labels), function(label){ ff[labels == label, ] }) }
Min max norm
fs_minmax <- fsApply(flowSet(batch1_ff, batch2_ff), function(ff){ for (col in channels_of_interest){ c <- ff@exprs[,col] c <- (c - quantile(c, 0.01)) / (quantile(c, 0.99) - quantile(c, 0.01)) ff@exprs[,col] <- c } ff }) fs_minmax <- flowSet(unlist(fsApply(fs_minmax, splitFlowframe))) plot_aggregate(input = fs_minmax, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_minmax@frames)), output_image = file.path(batch_dir, "aggregate_minmax_batch.png"))
gaussNorm normalization (aligment of peaks in the density distribution)
fs_gaussNorm <- gaussNorm(flowset = flowSet(batch1_ff, batch2_ff), channel.names = channels_of_interest, peak.density.thr=0.01) fs_gaussNorm <- flowSet(unlist(fsApply(fs_gaussNorm$flowset, splitFlowframe))) plot_aggregate(input = fs_gaussNorm, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_gaussNorm@frames)), output_image = file.path(batch_dir, "aggregate_gaussNorm_batch.png"))
CytoNorm (first FlowSOM clustering, then quantile normalization on metacluster level)
## Train model (ideally trained on control/reference samples) model <- CytoNorm.train(files = file.path(preprocessed_dir, c(files, files_batch2)), labels = rep(c("1", "2"), each = 6), # 1 label per batch channels = channels_of_interest, transformList = NULL, FlowSOM.params = list(xdim = 10, ydim = 10, nClus = 10, nCells = 1000000, scale = TRUE), normMethod.train = QuantileNorm.train, plot = T, seed = 1, outputDir = file.path(batch_dir, "CytoNorm_tmp")) ## Apply model CytoNorm.normalize(model = model, files = file.path(preprocessed_dir, c(files, files_batch2)), labels = rep(c("1", "2"), each = 6), transformList = NULL, transformList.reverse = NULL, outputDir = file.path(batch_dir, "CytoNorm"), normMethod.normalize = QuantileNorm.normalize) fs_CytoNorm <- read.flowSet(list.files("Workshop_batch/CytoNorm", pattern = ".fcs", full.names = T)) plot_aggregate(input = fs_CytoNorm, labels = rep(c("1", "2"), each = 6), channels = channels_of_interest, sample_names = gsub(".*_(Tube_[0-9]*)_.*", "\\1", names(fs_minmax@frames)), output_image = file.path(batch_dir, "aggregate_CytoNorm_batch.png"))
Store channel names into variables for ease of use
ff <- read.FCS(file.path(preprocessed_dir, files[4])) # Get back to original dataset cd3_channel <- get_channels(ff, c("CD3")) cd19_channel <- get_channels(ff, c("CD19"))
Use the deGate function to find a split for a specific channel
cd3_threshold <- deGate(ff, cd3_channel) cd19_threshold <- deGate(ff, cd19_channel)
Create a flowframe with only the CD3+ CD19- cells
selection <- ff@exprs[,cd3_channel] > cd3_threshold & ff@exprs[,cd19_channel] < cd19_threshold ff_T <- ff[selection,]
Plot the result
plotDens(ff, c(cd3_channel, cd19_channel)) abline(v = cd3_threshold) abline(h = cd19_threshold) points(ff_T@exprs[,c(cd3_channel, cd19_channel)], col = "red", pch = ".")
Create a selection of CD161+ CD3- cells and show them on a scatterplot
Identify the flowjo workspace on the drive
wsp_file <- "Data/manualGating_Live.wsp" cell_types <- c("Macrophages", "B cells", "NK cells", "NK T cells", "T cells", "DCs", "Neutrophils", "Basophils") gating <- FlowSOM::GetFlowJoLabels(files, wsp_file, cell_types = cell_types) str(gating)
colSums(gating[[files[4]]]$matrix)
gating_labels <- c() for(i in seq_len(length(gating))){ labels <- as.character(gating[[files[i]]]$manual) selection_qc <- readRDS(file.path(QC_dir, "PeacoQC",gsub(".fcs", "_QC.RDS", files[i]))) selection_agg <- read.table(file.path(agg_dir, gsub(".fcs", "_selected_aggregate.txt", files[i])))$x gating_labels <- c(gating_labels, labels[selection_qc$GoodCells][selection_agg]) } gating_labels <- factor(gating_labels, levels = levels(gating[[1]]$manual))
We use a seed to make sure the results are reproducable later. This does not remove the intrinsic random decisions made by the algorithm!
We select 1000 cells from the flowframe for quicker computation time.
We compute tSNE with the default perplexity of 30.
set.seed(1) subset <- sample(seq_len(nrow(ff)), 1000) tsne <- Rtsne(ff@exprs[subset, channels_of_interest])
Plot the result
plotSNE_marker(flowframe = ff[subset,], tsne_result = tsne, marker = "CD19") + scale_color_distiller(palette = "RdYlBu")
selection <- readRDS(file.path(QC_dir, "PeacoQC",gsub(".fcs", "_QC.RDS", files[4]))) plotSNE_manual_gating(tsne, gating[[files[4]]]$manual[selection$GoodCells][subset])
Run tSNE on another 1000 randomly selected cells from the flowframe
Show the CD3 expression on this tSNE
Run the FlowSOM algorithm on the whole flowset (all cells from the 6 files). When you have more data, it might be recommended to create an aggregate flowframe first with a subset from all files (e.g. 3 million cells for most laptops).
We create a 10 by 10 grid for the initial clustering layer, and than 10 final meta-clusters.
Scale should be TRUE if the input columns have varying ranges which are not biologically relevant.
In the last line, we remove the channel name for further visualizations, only showing the marker name.
fsom <- FlowSOM(ff_agg, colsToUse = channels_of_interest, scale = FALSE, xdim = 10, ydim = 10, nClus = 10, seed = 1) fsom$FlowSOM$prettyColnames <- gsub(" <.*", "", fsom$FlowSOM$prettyColnames)
Plot the result
PlotStars(fsom$FlowSOM, backgroundValues = fsom$metaclustering)
Plot the result were all circles have the same size
PlotStars(UpdateNodeSize(fsom$FlowSOM, reset = TRUE, maxNodeSize = 8), backgroundValues = fsom$metaclustering)
Make a fsom49 object with 49 clusters instead of 100 and 8 metaclusters, and without using the AmCyan channel
Plot your fsom49 result with equal node sizes
Plot only CD3, CD19 and CD11b, by making use of the "markers" argument, which takes a vector of channel names as input.
You could make use of the get_channels() function to map marker names to channel names
Plot with the argument view = "grid"
pop_mark <- readxl::read_xlsx("Data/Populations and markers.xlsx") cellTypes <- parse_markertable(pop_mark) pop_mark
cluster_names <- query_multiple(fsom = fsom, ff = ff, cell_types = cellTypes, pdf_name = "identify_clusters.pdf")
metacluster_names <- label_metaclusters(fsom, cluster_names[GetClusters(fsom$FlowSOM)]) metacluster_names
Compute metacluster_names8 for your second FlowSOM object
Show a heatmap with the MFI values
plot_metacluster_MFIs(fsom,
metacluster_names)
Code that produces the same result, without the helper function
MFIs <- MetaclusterMFIs(fsom)[,channels_of_interest] colnames(MFIs) <- get_markers(ff, colnames(MFIs)) rownames(MFIs) <- paste0("Metacluster ",rownames(MFIs)," (", metacluster_names, ")") pheatmap::pheatmap(MFIs)
Plot the cluster numbers, to refer to later
PlotNumbers(UpdateNodeSize(fsom$FlowSOM, reset = TRUE, maxNodeSize = 0.0001), fontSize = 0.5)
Plot the metacluster numbers
PlotLabels(UpdateNodeSize(fsom$FlowSOM, reset = TRUE, maxNodeSize = 0.0001), labels = fsom$metaclustering, fontSize = 0.6)
Plot the cluster labels
PlotLabels(UpdateNodeSize(fsom$FlowSOM, maxNodeSize = 0.0001), cluster_names, fontSize = 0.5)
Plot a scatter plot of CD3/CD19 and highlight cluster 7 in red
PlotClusters2D(fsom$FlowSOM, get_channels(ff, "CD3"), get_channels(ff, "CD19"), 7)
Plot a scatter plot of CD3/CD19 and hightlight all clusters of metacluster 3 in red
PlotClusters2D(fsom$FlowSOM, get_channels(ff, "CD3"), get_channels(ff, "CD19"), which(fsom$metaclustering == 3))
Plot an overview of multiple scatter plots for multiple metaclusters
markerpairs <- list(c("CD64", "AmCyan-A"), c("CD3", "CD19"), c("CD3", "CD161"), c("CD11c", "MHCII"), c("Ly-6G", "CD11b") ) metaclusters_of_interest <- metacluster_names levels(fsom$metaclustering) <- metacluster_names png("metacluster_scatters.png", width = 500 * length(markerpairs), height = 400 * length(metaclusters_of_interest)) PlotOverview2D(fsom, markerlist = markerpairs, metaclusters = metaclusters_of_interest, ff = ff) dev.off()
Visualize the manual labels from FlowJo
PlotPies(UpdateNodeSize(fsom$FlowSOM, maxNodeSize = 8, reset = TRUE), gating_labels)
Plot some figures for your own fsom object
Create an empty matrix, and count the number of cells assigned to each cluster for every file
mat_counts <- matrix(rep(0, length(files)*fsom$FlowSOM$map$nNodes), nrow = length(files), dimnames = list(files, paste0("Cl", seq_len(fsom$FlowSOM$map$nNodes)))) for (file in files){ ff <- read.FCS(file.path(preprocessed_dir, file)) fsom_tmp <- NewData(fsom, ff) t <- table(GetClusters(fsom_tmp)) mat_counts[file, as.numeric(names(t))] <- t } rownames(mat_counts) <- gsub(".*_Tube_([0-9]*)_Live.*", "Tube \\1", rownames(mat_counts) )
Visualize in a heatmap
pheatmap(mat_counts)
Alternatively, compute the percentages for all files
pctgs <- apply(mat_counts, 2, function(x) x / rowSums(mat_counts)) pctgs <- list(pctgs = pctgs, pctgs_meta = t(apply(pctgs, 1, function(x) tapply(x, fsom$metaclustering, sum)))) pctgs
Plot the percentages in a heatmap
pheatmap::pheatmap(pctgs$pctgs_meta) pheatmap::pheatmap(pctgs$pctgs_meta, scale = "column")
The FlowSOMWorkshop package also provides a function to plot the individual values
plot_pctgs(pctgs$pctgs_meta)
Plot trees with node sizes adapted to the individual files
pdf("individual_files.pdf", useDingbats = FALSE) for(file in files){ fsom_mapped <- FlowSOM_subset(fsom, file.path(dir, file)) PlotStars(UpdateNodeSize(fsom_mapped$FlowSOM)) } dev.off()
Given the cell counts we computed earlier, FlowSOM provides a function to make a plot per group, with the node size adapted to the average of the group, and the background color indicating statistically significant differences in comparison to the first group.
groupRes <- CountGroups(fsom$FlowSOM, groups=list("WT" = mat_counts[4:6,], "NK_KO" = mat_counts[1:3,])) PlotGroups(fsom[[1]], groupRes, p_tresh = 0.3)
Compute some more statistics
statistics_res <- compute_wilcox(pctgs$pctgs_meta, group1 = rownames(pctgs$pctgs_meta)[grep("028|030|031", files)], group2 = rownames(pctgs$pctgs_meta)[grep("011|012|013", files)]) statistics_res
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.