In aef1004/cytotypr: Pipeline to Identify Cell Types from Cytometry Data

knitr::opts_chunk$set(echo = TRUE)

There are a few packages that will need to be downloaded from CRAN:

if (!require("data.table")) install.packages("data.table")
if (!require("dplyr")) install.packages("dplyr")
if (!require("stringr")) install.packages("stringr")
if (!require("scales")) install.packages("scales")
if (!require("tidyr")) install.packages("tidyr")
if (!require("superheat")) install.packages("superheat")
if (!require("tibble")) install.packages("tibble")
if (!require("pheatmap")) install.packages("pheatmap")

library(data.table)
library(dplyr)
library(ggcyto)
library(stringr) #this should be already added to the cytotypr package
library(scales)
library(tidyr)
library(superheat)
library(tibble)
library(pheatmap)

Convert data to "tidy data" format

Now that the initial gating has been applied, to limit the data to measurements oflive, singlet lymphocyte cells, we convert the data to a "tidy data" format, to allow us to work with "tidyverse" tools for further analysis and visualization.

# Pull out the data from the 'live' node of the gating set (the last node
# in the initial gating strategy).
flowset_FMO_gated_data <- gs_pop_get_data(gs_FMO, "live") %>% 
  cytoset_to_flowSet()

Apply the tidy_flow_set function to the 'flowSet' of gated FMO data to output a dataframe:

FMO_gated_data <- tidy_flow_set(flowset_FMO_gated_data)
FMO_gated_data

Note that when you plot the FMOs here, you should see all of the FMOs that you want to use. If you don't see all of them, ensure that all ofyour filenames and column names for each of the markers matches exactly. For example, if the filename says "CD103_f" but the corresponding column name for that marker is "CD103", you need to either change the filename or column name so that they are exactly the same.

# note that here the filename and the column marker names need to match exactly
df_FMO_gated_data <- FMO_gated_data %>%
  select(ends_with("-A"), -`FSC-A`, `SSC-A`, filename) %>%
  dplyr::rename(`FoxP3` = "APC-A",
         `CD44` = "APC-Fire 750-A",
        `CD103` =  "APC-R700-A",       
         `CD3` = "Alexa Fluor 532-A",
         `Sca1` = "BB515-A",
         `IL_10` = "BV421-A",
         `CD4` = "BV480-A",
         `CD69` = "BV510-A",
         `CD8` = "BV570-A", 
         `CTLA4` = "BV605-A",
         `CD27` = "BV650-A",
         `CD153` = "BV711-A",
         `KLRG1` = "BV785-A",
         `IL_17` = "PE-A",
         `CD122` = "PE-Cy5-A",
         `IFN` = "PE-Cy7-A", 
         `CD62L` = "PE-Dazzle594-A",
         `TNF` = "Pacific Blue-A", 
         `CD28` = "PE-Cy5.5-A",
         `PD1` = "PerCP-eFluor 710-A")  %>%
   na.omit()%>%
  dplyr::filter(`SSC-A` != max(`SSC-A`)) %>%
  mutate(filename = str_replace(filename, ".fcs", "")) %>%
  mutate(filename = str_replace(filename, "IFNG", "IFN"))


FMO_filtered_data <- filter_FMO(df_FMO_gated_data)

add_quantile <- get_99(FMO_filtered_data)

plot_FMOs(FMO_filtered_data, add_quantile)

Gated Data

Pull out the gated data

# Pull out the gated data - could potentially add to the function below
gated_flowset <- gs_pop_get_data(gs, "live") %>% 
  cytoset_to_flowSet() 

# tidy the flowset and convert to a dataframe
df_all_gated <-  tidy_flow_set(gated_flowset) 

unique(df_all_gated$filename)

Feature engineer the data

Feature cut the data to get all of the possible populations for each file with the cell count and number of cells. Feature engineering is based on the 99%.

Remove FoxP3 Remove CD69 - spreading from Sca1 makes the marker unusable

The "count_calc" function at the end calculates the cell counts and percentage of cells in each sample for each population. The dataframe that is input into this function should only contain the markers that you're interesting in looking at, and should remove SSC-A, FSC-A, etc.

all_fe <- df_all_gated %>%
  mutate(Timepoint = str_extract(filename, "D[0-9]*")) %>%
  mutate(Group = str_extract(filename, "Group[:punct:][0-9][:punct:][A-Z]"),
         Group = str_replace(Group, "Group_", "")) %>%
  unite(filename, c("Timepoint", "Group")) %>%
  select(ends_with("-A"), -`FSC-A`, filename) %>%
  dplyr::rename(`FoxP3` = "APC-A",
         `CD44` = "APC-Fire 750-A",
        `CD103` =  "APC-R700-A",       
         `CD3` = "Alexa Fluor 532-A",
         `Sca1` = "BB515-A",
         `IL_10` = "BV421-A",
         `CD4` = "BV480-A",
         `CD69` = "BV510-A",
         `CD8` = "BV570-A", 
         `CTLA4` = "BV605-A",
         `CD27` = "BV650-A",
         `CD153` = "BV711-A",
         `KLRG1` = "BV785-A",
         `IL_17` = "PE-A",
         `CD122` = "PE-Cy5-A",
         `IFN` = "PE-Cy7-A", 
         `CD62L` = "PE-Dazzle594-A",
         `TNF` = "Pacific Blue-A", 
         `CD28` = "PE-Cy5.5-A",
         `PD1` = "PerCP-eFluor 710-A") %>%
  dplyr::filter(`SSC-A` != max(`SSC-A`)) %>%
  mutate(CD3 = fe(add_quantile, CD3, "CD3"),
         CD4 = fe(add_quantile, CD4, "CD4"),
         CD8 = fe(add_quantile, CD8, "CD8"),
         CD44 = fe(add_quantile, CD44, "CD44"),
         CD103 = fe(add_quantile, CD103, "CD103"),
         Sca1 = fe(add_quantile, Sca1, "Sca1"),
         IL_17 = fe(add_quantile,IL_17, "IL_17"),
         CD69 = fe(add_quantile,CD69, "CD69"),
         CTLA4 = fe(add_quantile,CTLA4, "CTLA4"),
         CD27 = fe(add_quantile,CD27, "CD27"),
         CD153 = fe(add_quantile,CD153, "CD153"),
         KLRG1 = fe(add_quantile,KLRG1, "KLRG1"),
         IFN = fe(add_quantile,IFN, "IFN"),
         FoxP3 = fe(add_quantile,FoxP3, "FoxP3"),
         CD122 = fe(add_quantile,CD122, "CD122"),
         PD1 = fe(add_quantile,PD1, "PD1"),
         CD62L = fe(add_quantile,CD62L, "CD62L"),
         IL_10 = fe(add_quantile,IL_10, "IL_10"),
         CD28 = fe(add_quantile,CD28, "CD28"),
         TNF = fe(add_quantile,TNF, "TNF")) %>%
  select(-`Zombie Nir-A`, -`AF-A`, -`SSC-A`, -FoxP3, -CD69) %>%
  count_calc()

Visulatizations

Initial identification of populations plot

We first want to view all of the different cell phenotypes within the data

# this is the order of markers that we want for all of our plots
order_of_markers <- c("CD3", "CD4", "CD8",  "CD44", "CD103", "Sca1", "IL_17","CTLA4",
                      "CD27",  "CD153", "KLRG1", "IFN",  "CD122", "PD1", "CD62L",
                      "IL_10", "CD28","TNF")

# to view all of the possible combinations
total_phenotypes <- filter_for_total_pheno(all_fe, marker_order = order_of_markers)

heatmap_all_pheno(total_phenotypes)

# gives the total number of populations
nrow(total_phenotypes)

After identifying all phenotypes, we can filter the data to see the ones that we're interested in, for example, CD3+ cells that constitute >0.5% of total live leukocytes in a sample.

# view the specific cell phenotypes we're interested in
sample_populations <- all_fe %>%
  dplyr::filter(CD3 == 1 & percentage > 0.5) %>%
  filter_pops() 

sample_populations_all_groups <- identified_pop_perc(sample_populations, all_fe, marker_vector = order_of_markers)

Plot sample populations

############ Simple plot - sample populations #######################

simple_pop_df <- sample_populations %>%
  column_to_rownames("population") 

simple_pop_df %>%
  dplyr::select(order_of_markers) %>%
  mutate_all(~as.numeric.factor(.)) %>%
    pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE,
             labels_row = rownames(simple_pop_df),
             cellwidth = 15, cellheight = 15, angle_col = 45, 
             color = c("#3F4788FF", "#56C667FF"), cutree_rows = 2, legend = FALSE)


############ Complicated plot (background color) - sample populations ############
# these are the colors to use for th pheatmap annotations
my_colors <- list(lineage = c(`Double Negative` = "darkturquoise", 
                              `T Helper` = "plum3",
                              `Cytotoxic T` = "hotpink",
                              `Double Positive` = "orange"))

# this should be read in from the cytotypr package
# as.numeric.factor <- function(x) {
#   as.numeric(levels(x))[x]
# }


# take the filtered data and add annotations for the differnt lineages, cell type, and resident status
plot_sample <- function(df, order_of_markers, order_of_types) {

  df %>%
  mutate_all(~as.numeric.factor(.)) %>%
      mutate(lineage = ifelse(CD4 == 0 & CD8 ==0, "Double Negative",
                          ifelse(CD4 == 1 & CD8 ==0,"T Helper",
                          ifelse(CD4 ==0 & CD8 ==1, "Cytotoxic T", 
                                 "Double Positive"))), .before = all_of(order_of_markers)) %>% # could remove the "all_of"
  select(-population) %>%
  arrange(match(lineage, order_of_types)) %>%
  mutate(population = c(paste0("Pop", 1: nrow(df)))) %>%
  tibble::column_to_rownames("population")
}

order_of_types <- c("Double Negative", "T Helper", "Cytotoxic T", "Double Positive")

phenotype_data <- plot_sample(sample_populations, order_of_markers, order_of_types)

heatmap_picked_pops <- function(df, order_of_markers, plotting_colors) {

  # create an annotated data frame to add to pheatmap
  phenotype_annot <- df %>%
    dplyr::select(-order_of_markers) %>%
    dplyr::mutate(population = c(paste0("Pop", 1: nrow(df)))) %>%
    tibble::column_to_rownames("population")

  # count the number of rows with each phenotype to create gaps in the plot
  lengths_to_cut <- rownames(unique(phenotype_annot)) %>%
    stringr::str_replace("Pop", "") %>%
    as.numeric() %>%
    as.data.frame()%>%
    dplyr::mutate(. = .-1)

  # plot the data with different annotations

  df %>%
    dplyr::select(-colnames(phenotype_annot)) %>%
    dplyr::select(order_of_markers) %>%
    pheatmap::pheatmap(cluster_rows = FALSE, cluster_cols = FALSE,
             labels_row = rownames(df),
             annotation_row = phenotype_annot,  gaps_row = c(lengths_to_cut[2,], lengths_to_cut[3,], lengths_to_cut[4,]),
             cellwidth = 15, cellheight = 15, angle_col = 45, annotation_colors = plotting_colors,
             color = c("#3F4788FF", "#56C667FF"), cutree_rows = 2, legend = FALSE)
}



heatmap_picked_pops(phenotype_data, order_of_markers, colors = my_colors)

Correlation plot

############# Basic example with superheat ######################
corr <- calc_corr(sample_populations_all_groups)

superheat(corr,  row.title = "Populations", column.title = "Populations")

#bottom.label.col = c(rep("darkturquoise", DN_length), rep("plum3", T_helper_length), rep("hotpink", Cytotoxic_length), rep("orange", DP_length)), left.label.col = c(rep("darkturquoise", DN_length), rep("plum3", T_helper_length), rep("hotpink", Cytotoxic_length), rep("orange", DP_length)))



########### Basic example with ggplot ###############################
library(reshape2)
library(viridis)

corr <- calc_corr(sample_populations_all_groups)

melted_corr <- melt(corr) %>%
  mutate(Var1 = as.factor(Var1),
         Var2 = as.factor(Var2)) 

ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    scale_fill_viridis() +
    theme_bw() +
    xlab("Populations") +
    ylab("Populations") +
    labs(fill = "Correlation") 

########### Complicated example with ggplot - not working ###############################


add_color_for_corr <- function(my_colors) {

  type_colors <- as.data.frame(my_colors) %>%
  rename(color = colnames(as.data.frame(my_colors))) %>%
  rownames_to_column("type")

  pheno_with_names <- phenotype_data %>%
  select(-order_of_markers) 

  save_color <- pheno_with_names %>%
    rename(type = colnames(pheno_with_names)) %>%
    rownames_to_column("population") %>%
  left_join(type_colors) %>%
    mutate(population = as.numeric(str_replace(population, "Pop", ""))) %>%
    distinct(color, .keep_all = TRUE)

  colo <- ifelse(melted_corr$Var1 == save_color[1, "population"]:save_color[2, "population"], save_color[1, "color"],
                 ifelse(melted_corr$Var1 == save_color[2, "population"]:save_color[3, "population"], save_color[2, "color"],
                 ifelse(melted_corr$Var1 == save_color[3, "population"]-1:save_color[4, "population"], save_color[3, "color"],
                 save_color[4, "color"])))

  # key is that there cannot be overlapping numbers
  colo <- ifelse(melted_corr$Var1 == 1:5, "blue", 
         ifelse(melted_corr$Var1 == 6:10, "red", "yellow"))
}


ggplot(melted_corr, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    scale_fill_viridis() +
    theme_bw() +
    xlab("Populations") +
    ylab("Populations") +
    labs(fill = "Correlation") +
  theme(axis.text.x = element_text(color = colo))

a <- ifelse(melted_corr$Var1 == 1:5, "red", "blue")

  # count the number of rows with each phenotype to create gaps in the plot
  lengths_to_cut <- rownames(unique(phenotype_annot)) %>%
    stringr::str_replace("Pop", "") %>%
    as.numeric() %>%
    as.data.frame()%>%
    dplyr::mutate(. = .-1)

Time Series Plot

########### Basic time series example ###############################

# take the data for filtered populations and rename so that plots are pretty
pops_for_plots_average <- sample_populations_all_groups %>%
  separate(filename, into = c("Timepoint", "Group", "Number"), sep = "_") %>%
  dplyr::group_by(population, Timepoint, Group) %>%
  dplyr::mutate(average_percent = mean(percentage)) %>%
  select(-Number, -percentage) %>%
  ungroup() %>%
  unique() %>%
  dplyr::mutate(Group = str_replace(Group, "1", "PBS"),
         Group = str_replace(Group, "2", "BCG")) %>%
  dplyr::mutate(Timepoint = str_replace(Timepoint, "D30", "30"),
         Timepoint = str_replace(Timepoint, "D60", "60"),
         Timepoint = str_replace(Timepoint, "D90", "90")) %>%
  mutate(Group = str_replace(Group, "PBS", "Control"),
         Group = str_replace(Group, "BCG", "Vaccinated")) %>%
  mutate(population = factor(population, 
                             levels = paste0("Pop", c(1:length(unique(population))))))


ggplot(pops_for_plots_average, aes(x = factor(Timepoint, levels = c("30", "60", "90")),
                                 y = average_percent, group = Group, color = Group)) +
  scale_fill_identity() +
  geom_point() +
  geom_line() +
  facet_wrap("population", scales = "free", ncol = 7) +
  xlab("Days Post-Infection") +
  ylab("Average Percent of Total Live Leukocytes") +
  theme_gray() +
    theme(axis.text.x = element_text(angle = 45, size = 13, hjust = 1),
        axis.text.y = element_text(size = 13),
        strip.text.x = element_text(size = 10),
        axis.title = element_text(size = 17),
        title = element_text(size = 20),
        legend.text = element_text(size=18),
        legend.key.size = unit(1.5, "line"))

########### Complicated time series example (with background) ###############################
color_for_facets <- phenotype_data %>%
  select(lineage, population) %>%
  mutate(col = ifelse(lineage == "Double Negative", "darkturquoise", 
                      ifelse(lineage == "T Helper", "plum3",
                             ifelse(lineage == "Cytotoxic T", "hotpink", 
                                    ifelse(lineage == "Double Positive", "orange", "brown")))))

# take the data for filtered populations and rename so that plots are pretty
pops_for_plots_average <- sample_populations_all_groups %>%
  separate(filename, into = c("Timepoint", "Mouse", "Group", "Number"), sep = "_") %>%
  dplyr::group_by(population, Timepoint, Group) %>%
  dplyr::mutate(average_percent = mean(percentage)) %>%
  select(-Number, -percentage, -Mouse) %>%
  ungroup() %>%
  unique() %>%
  dplyr::mutate(Group = str_replace(Group, "1", "PBS"),
         Group = str_replace(Group, "2", "BCG")) %>%
  dplyr::mutate(Timepoint = str_replace(Timepoint, "D30", "30"),
         Timepoint = str_replace(Timepoint, "D60", "60"),
         Timepoint = str_replace(Timepoint, "D90", "90")) %>%
  mutate(Group = str_replace(Group, "PBS", "Control"),
         Group = str_replace(Group, "BCG", "Vaccinated"))

facets_for_timeseries <- left_join(pops_for_plots_average, color_for_facets, by = "population") 

# convert populations to factor to put in order for the plot
facets_for_timeseries$population <-  factor(facets_for_timeseries$population, 
                                    levels = paste0("Pop", c(1:length(unique(facets_for_timeseries$population)))))

ggplot(facets_for_timeseries, aes(x = factor(Timepoint, levels = c("30", "60", "90")),
                                 y = average_percent, group = Group, color = Group)) +
  geom_rect(aes(fill = col),xmin = -Inf,xmax = Inf,
            ymin = -Inf,ymax = Inf, alpha = 0.2) +
  scale_fill_identity() +
  geom_point() +
  geom_line() +
  facet_wrap("population", scales = "free", ncol = 7) +
  xlab("Days Post-Infection") +
  ylab("Average Percent of Total Live Leukocytes") +
  scale_color_manual(values = c("black", "white")) +
  theme_gray() +
    theme(axis.text.x = element_text(angle = 45, size = 13, hjust = 1),
        axis.text.y = element_text(size = 13),
        strip.text.x = element_text(size = 10),
        axis.title = element_text(size = 17),
        title = element_text(size = 20),
        legend.text = element_text(size=18),
        legend.key.size = unit(1.5, "line"))

Data Visualization

CFU Correlation

############### Simple CFU plotting ###########################

CFUs <- readxl::read_xlsx("./inst/extdata/CFU_data.xlsx") %>%
  dplyr::filter(Organ == "Lung") %>%
  dplyr::filter(Group == "1" | Group == "2")

# clean the flow data and prepare to join with CFU
pops_for_CFUs <- sample_populations_all_groups %>%
  separate(filename, into = c("Timepoint", "Group", "Number"), sep = "_") %>%
  dplyr::mutate(Timepoint = str_replace(Timepoint, "D", "")) %>%
  dplyr::mutate(Timepoint = as.numeric(Timepoint),
                Group = as.numeric(Group)) 

# join together the CFU data and the population data
pops_CFUs <- inner_join(pops_for_CFUs, CFUs, by = c("Group", "Number", "Timepoint")) %>%
  mutate(population = factor(population, levels = paste0("Pop", c(1:length(unique(population))))))

# calculate statistical significance


fitted_models <- pops_CFUs %>%
  group_by(population) %>%
  nest() %>%
  mutate(model = map(data, ~lm(CFU ~ percentage, data = .)),
         tidy_model = map(model, broom::glance)) %>%
  unnest(tidy_model) %>%
  select(population, adj.r.squared, p.value)


##### these should get the same values because the length of p.value is also 33, but they arent
fitted_models %>%
  mutate(p.val.adj = p.adjust(p.value, method = "bonferroni", 
                              n = 33)) %>%
  mutate("Significance" = p.val.adj < 0.05) %>%
  rename("p-value" = p.value,
         "Adjusted p-value" = p.val.adj) 

fitted_models %>%
  mutate(p.val.adj = p.adjust(p.value, method = "bonferroni", 
                              n = length(fitted_models$p.value))) %>%
  mutate("Significance" = p.val.adj < 0.05) %>%
  rename("p-value" = p.value,
         "Adjusted p-value" = p.val.adj) 

fitted_models %>%
  mutate(p.val.adj = p.adjust(p.value, method = "bonferroni", 
                              n = length(p.value))) %>%
  mutate("Significance" = p.val.adj < 0.05) %>%
  rename("p-value" = p.value,
         "Adjusted p-value" = p.val.adj) 

fitted_models %>%
  mutate(p.val.adj = p.adjust(p.value, method = "bonferroni")) %>%
  mutate("Significance" = p.val.adj < 0.05) %>%
  rename("p-value" = p.value,
         "Adjusted p-value" = p.val.adj) 

length(fitted_models$p.value)

Try using geom_label instead of stat_smooth_fun

############### add geom_text with r^2 to ggplot rather than stat_smooth_func ###########


########################### try geom_text from a separate dataframe  #######
# problem with the placement of the labels because I'm using free_x in the facet_wrap
# but when I remove the free_x, the data is scrunched up on the y-axis

values_for_plot <- fitted_models %>%
  mutate(p.val.adj = p.adjust(p.value, method = "bonferroni", 
                              n = length(fitted_models$p.value))) 

r_labeling <- left_join(pops_CFUs, values_for_plot, by = "population") %>%
  group_by(population) %>%
  mutate(average_percentage = mean(percentage)) %>%
  select(population, average_percentage, adj.r.squared, p.val.adj) %>%
  unique()

ggplot(pops_CFUs) +
  scale_fill_identity() +
  geom_point(aes(percentage, CFU), color = "black") +
  geom_smooth(aes(x = percentage, y = CFU), method = "lm", se = FALSE, color = "#3F4788FF") +
  geom_text(data = r_labeling, aes(x = average_percentage, y = 5, 
                                   label = paste("r^2 = ", round(adj.r.squared, 3)))) +
  facet_wrap(~population, scales = "free_x", ncol = 8) +
  xlab("Percentage of Cells") +
  ylab("log10 CFU") +
  ggtitle("Population and CFU Linear regression") +
  theme(axis.text.x = element_text(size = 9, hjust = 1),
        axis.text.y = element_text(size = 9),
        axis.title = element_text(size = 15),
        title = element_text(size = 17))

############### Complicated CFU plotting with background colors####################
# read in the CFU data
library(readxl)
CFUs <- readxl::read_xlsx("./inst/extdata/CFU_data.xlsx") %>%
  dplyr::filter(Organ == "Lung") %>%
  dplyr::filter(Group == "Group1" | Group == "Group2")

# clean the flow data and prepare to join with CFU
pops_for_CFUs <- sample_populations_all_groups %>%
  separate(filename, into = c("Timepoint", "Mouse", "Group", "Number"), sep = "_") %>%
  dplyr::mutate(Timepoint = str_replace(Timepoint, "D", "")) %>%
  dplyr::mutate(Timepoint = as.numeric(Timepoint)) %>%
  unite(Group, c(Mouse, Group)) %>%
  dplyr::mutate(Group = str_replace(Group, "_", "")) 

# join together the CFU data and the population data
pops_CFUs <- inner_join(pops_for_CFUs, CFUs, by = c("Group", "Number", "Timepoint")) 
facets_for_CFUs <- left_join(pops_CFUs, color_for_facets, by = "population") 

facets_for_CFUs$population <-  factor(facets_for_CFUs$population, 
                                    levels = paste0("Pop", c(1:length(unique(facets_for_CFUs$population)))))

ggplot(facets_for_CFUs, aes(percentage, CFU)) +
  geom_rect(aes(fill = col),xmin = -Inf,xmax = Inf,
            ymin = -Inf,ymax = Inf, alpha = 0.05) + 
  scale_fill_identity() +
  geom_point(color = "black") +
  geom_smooth(method = "lm", se = FALSE, color = "#3F4788FF") +
  facet_wrap(~population, scales = "free_x", ncol = 8) +
  stat_smooth_func(geom= "text", method = "lm", hjust = 0, parse = TRUE, color = "white") +
  xlab("Percentage of Cells") +
  ylab("log10 CFU") +
  ggtitle("Population and CFU Linear regression") +
  theme(axis.text.x = element_text(size = 9, hjust = 1),
        axis.text.y = element_text(size = 9),
        axis.title = element_text(size = 15),
        title = element_text(size = 17))