Introduction

Flow cytometry data is typically analysed by gating but this is very labour-intensive. The direct algorithmic translation of gating is clustering, but it is well known to be hard to find the 'best' clustering. Furthermore, when analysing a large number of samples, there is the additional obstacle of matching clusters across samples. Such algorithms often turn out to be computationally intensive.

A clustering paradigm also forces cells into a discrete number of containers, however it may be the case that cells are existing 'between' states, i.e. cells are on a continuum. It may be more appropriate to describe cells states in a probabalistic fashion. Clustering yields list-type structures and this obscures the dynamic relationship between markers, especially as it manifests across a large number of samples.

To this end, we propose viewing flow cytometry data such that systematic variation of the markers is emphasised. We devise summary matrices that emphasise two aspects that are of key biological interest: marker coexpression at the sample level, and marker coexpression at the single cell level. There is also the possibility to examine how single cell coexpression depends on sample level coexpression. Some key features of this methodology is that it is computationally cheap, and summarises the data such that it can be readily integrated with other data sources, e.g. clinical covariates.

Data

Eight healthy donor samples are stimulated eight times each by eight different pathogens, yielding 64 samples. The pathogens are candida, E.coli and C.leptum (gut), mite, aspergillus and birchgrass (lung), CMV and influenza (virus). As antigen specific T-cells occur in such low numbers, they are first pre-enriched for CD4. The CD154 distribution is compared to that of an unstimulated sample for that donor, and only events with CD154 greater than the 95th percentile of the unstimlated are included. Among the stimulated cells, the following markers are of interest: IL17, IL22, IFNg, GM-CSF, IL4, IL5 and IL10. T-helper sets are defined as following: Th1 by IFNg, Th2 by IL4 and IL5, Th17 by IL17 and IL22. IL10 (is somehow implicated in) Tregs (regulatory Tcells). Th2 cells are thought/known to be implicated in allergic reactions when they are uncontrolled by Tregs. It is of interest to understand the pathogen specific regulation of coexpression as this points to the underlying transcriptional programmes that regulate cytokine expression.

library(flowCore)
library(pheatmap)
library(cluster)
library(ggplot2)
library(reshape)
library(ade4)
library(ca)
library(vcd)
library(igraph)
library(glasso)
library(plyr)
devtools::load_all(".")

fn <- list.files('~/2015/2014-12-04_CompareDonorsManuscript/2015-09-24_Rscripts/Tcell/data/')
Data <- lapply(1:length(fn), function(i) read.FCS(paste('~/2015/2014-12-04_CompareDonorsManuscript/2015-09-24_Rscripts/Tcell/data/', fn[i], sep = '')))
metadata <- data.frame(t(mapply(function(x) strsplit(strsplit(x, split = '[.]')[[1]][1], '_')[[1]], fn))[, -1])

markers <- c('IL10', 'IL22', 'GMCSF', 'IL4', 'IFNg', 'IL5', 'IL17', 'CD154')
markers_index <- c(2, 3, 4, 8, 9, 10, 11, 14)

C <- factor(rep(1:10, 8))

lgcl <- logicleTransform(w = 2, t = 10000, m = 4.5)
L <- transformList(c("b-LP505 525_50-B-A", "b-LP650 675_20-A-A", "r-670_30-C-A", "v-LP555 575_15-B-A", "v-LP600 610_20-A-A", "yg-585_15-E-A", "yg-LP600 610_20-D-A", "yg-LP750 780_60-A-A"), lgcl)
D_comp <- mapply(function(x) compensate(x, description(x)$`$SPILLOVER`), Data, SIMPLIFY = FALSE)
D_comp_lgcl <- mapply(function(x) exprs(transform(x, L)), D_comp, SIMPLIFY = FALSE)


# comparing distribution of CD154, stimulated and unstimulated
# df_cd154 <- sapply(1:8, function(i) sapply(1:2, function(j) sapply(1:5, function(k) {
#   cbind(D_comp_lgcl[metadata$X1 == levels(metadata$X1)[i] & metadata$X2 == levels(metadata$X2)[j] & metadata$X3 == levels(metadata$X3)[k]][[1]][, 14], i, k + 5 * (j-1))
# }
# )
# )
# )
# df_cd154 <- do.call(rbind, df_cd154)
# colnames(df_cd154) <- c('cd154', 'donor', 'stim')
# df_cd154 <- as.data.frame(df_cd154)
# df_cd154$donor <- factor(df_cd154$donor)
# df_cd154$stim <- factor(df_cd154$stim)
# 
# 
# for (i in c(1, 3:10)){
#   fn <- paste('cd154_', i, '.pdf', sep = '')
# pdf(fn)
# print(
# ggplot(df_cd154[df_cd154$stim == as.character(i) | df_cd154$stim == '2', ], aes(cd154, colour = stim, fill = stim)) + geom_density(alpha = 0.1) + facet_wrap(~donor)
# )
# dev.off()
# }

Data <- mapply(exprs, Data)

q <- lapply(1:8, function(i) quantile(D_comp_lgcl[metadata$X1 == levels(metadata$X1)[i] & metadata$X2 == '1' & metadata$X3 == '2'][[1]][, 14], 0.95))
i = 1
D_comp_cd154 <- lapply(1:8, function(i) {
  tmp <- D_comp_lgcl[metadata$X1 == levels(metadata$X1)[i]][-c(2, 10)]
  tmp <- mapply(function(x) x[x[, 14] > q[i], ], tmp, SIMPLIFY = FALSE)
  return(tmp)
}
)
D_comp_cd154 <- unlist(D_comp_cd154, recursive = FALSE)

D_comp_cd154 <- mapply(function(x) x[, markers_index], D_comp_cd154, SIMPLIFY = FALSE)
for (i in 1:length(D_comp_cd154)) {
  colnames(D_comp_cd154[[i]]) <- markers
}

D_comp_cd154 <- mapply(as.data.frame, D_comp_cd154, SIMPLIFY = FALSE)

sample_fac <- data.frame(donor = rep(1:8, rep(8, 8)), stim = rep(1:8, 8))
stim <- c('asp', 'can', 'mite', 'bgr', 'cmv', 'ecoli', 'clep', 'inf')
stim_type <- rep(c('lung', 'gut', 'lung', 'lung', 'virus', 'gut', 'gut', 'virus'), 8)
# breaks <- lapply(1:7, function(i) {
# x <- unlist(mapply(function(x) x[, i], D_comp_cd154))
# c(min(x), min(x) + ((max(x) - min(x))/2), max(x))
# }
# )

# make_breaks <- function(X) {
#   lapply(1:ncol(X[[1]]), function(i) {
# x <- unlist(mapply(function(x) x[, i], X))
# c(min(x), min(x) + ((max(x) - min(x))/2), max(x))
# }
# )
# }
# 
breaks <- make_breaks(mapply(function(x) x[, 1:7], D_comp_cd154, SIMPLIFY = FALSE))


# burt <- 
# lapply(1:length(D_comp_cd154), function(i) {
# x <- mapply(function(x, y) cut(x, y, FALSE, TRUE), D_comp_cd154[[i]][, -8], breaks)
# tmp <- ddply(as.data.frame(x), markers[-8], nrow)
# colnames(tmp)[8] <- 'counts'
# tmp$counts <- tmp$counts/sum(tmp$counts)
# tmp$which <- i
# tmp
# }
# )

# make_burt <- function(X, breaks, markers) {
# tmp1 <- lapply(1:length(X), function(i) {
# x <- mapply(function(x, y) cut(x, y, FALSE, TRUE), X[[i]], breaks)
# tmp <- ddply(as.data.frame(x), markers, nrow)
# colnames(tmp)[ncol(X[[1]]) + 1] <- 'counts'
# tmp$counts <- tmp$counts/sum(tmp$counts)
# tmp$which <- i
# tmp
# }
# )
# return(tmp1)
# }

burt <- make_burt(mapply(function(x) x[, 1:7], D_comp_cd154, SIMPLIFY = FALSE), breaks, markers[1:7])

Basic statistics

Some basic statistics: proportion of at least single positive, at least double positive, at least triple positive, at least quadruple positive, at least 5 times positive. It is most likely a cell will be single positive, and almost impossible to be quadruple positive. Nothing is positive for 5 markers.

burt_all <- do.call(rbind, burt)
apply(burt_all[, 1:7], 2, function(y) by(burt_all$counts, y, sum)[2]/length(burt))


# at least double positive 
apply(burt_all[, 1:7], 2, function(x) sum(burt_all$counts[rowSums(burt_all[, 1:7]) > 8 & x == 2])/sum(burt_all$counts))

# at least triple positive 
apply(burt_all[, 1:7], 2, function(x) sum(burt_all$counts[rowSums(burt_all[, 1:7]) > 9 & x == 2])/sum(burt_all$counts))

# at least quadruple positive 
apply(burt_all[, 1:7], 2, function(x) sum(burt_all$counts[rowSums(burt_all[, 1:7]) > 10 & x == 2])/sum(burt_all$counts))

Sample level coexpression

For each marker and unstained cell counts, check the ratio of proportion of positive events to average proportion over all samples, i.e. does a sample have more or less producers of a marker than average. The absolute number of events is thus removed, which can be problematic e.g. when comparing IFNg to IL4 or IL5.The log of ratios is taken, such that a positive value indicates more than average, and negative less than average. First a heatmap of average ratios for each stimulation, in order to gain an impression of sample level expression.

tot_pos <- apply(burt_all[, 1:7], 2, function(y) by(burt_all$counts, y, sum)[2]/length(burt))

summary_mtx <- 
  t(
sapply(1:8, function(i) {
tmp <- burt[sample_fac$stim == i]
tmp1 <- sapply(tmp, function(.x) apply(.x[, 1:7], 2, function(y) by(.x$counts, y, sum)[2]))
tmp1[is.na(tmp1)] <- 0
rowMeans(tmp1)
}
)
)
colnames(summary_mtx) <- markers[-8]
rownames(summary_mtx) <- stim

pheatmap(log(sweep(summary_mtx, 2, tot_pos, '/')))

Perform a between-component analysis (BCA) on the full sample matrix. This is to visualise the relationship between stimulation centres - there could be addtional correlation within each stimulation that makes this relationship less clear. We proceed with a descriptive/exploratory analysis.

# expression_donor <- lapply(1:8, function(i) {
# tmp <- burt[sample_fac$stim == i]
# tmp1 <- sapply(tmp, function(.x) apply(.x[, 1:7], 2, function(y) by(.x$counts, y, sum)[2]))
# tmp1[is.na(tmp1)] <- 0
# t(tmp1)
# }
# )
# expression_donor <- do.call(rbind, expression_donor)
# expression_donor <- sweep(expression_donor, 2, tot_pos, '/')
# expression_donor[expression_donor == 0] <- 1
# expression_donor <- cbind(expression_donor, unst)
# 

# make_sample_coexp <- function(burt) {
#   burt_all <- do.call(rbind, burt)
#   tot_pos <- apply(burt_all[, 1:(ncol(burt_all) - 2)], 2, function(y) by(burt_all$counts, y, sum)[2]/length(burt))
# unst <- sapply(burt, function(.x) .x[rowSums(.x[, 1:(ncol(burt_all) - 2)]) == 7, 'counts'])
# 
#   expression_donor <- 
# lapply(burt, function(.x) {
# apply(.x[, 1:(ncol(burt_all) - 2)], 2, function(y) by(.x$counts, y, sum)[2])/tot_pos
# })
# expression_donor <- do.call(rbind, expression_donor)
# expression_donor[is.na(expression_donor)] <- 1
# expression_donor <- cbind(expression_donor, unst)
# return(expression_donor)
#   }


sample_coexp <- make_sample_coexp(burt)
# expression_donor <- 
# lapply(burt, function(.x) {
# apply(.x[, 1:7], 2, function(y) by(.x$counts, y, sum)[2])/tot_pos
# })
# expression_donor <- do.call(rbind, expression_donor)
# expression_donor[is.na(expression_donor)] <- 1
# expression_donor <- cbind(expression_donor, unst)
# 
# par(mfrow = c(1, 2))
# plot(svd(log(expression_donor))$u, pch = 16, col = sample_fac$donor, main = 'donor')
# plot(svd(log(expression_donor))$u, pch = 16, col = sample_fac$stim, main = 'stimulation')

L <- log(sample_coexp)
RM <- do.call(rbind, replicate(8,  by(L, sample_fac$donor, colMeans)))
L <- L - RM
colnames(L) <- c(markers[-8], 'unst')

pca <- dudi.pca(L, scan = FALSE, nf = 8)
betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 6)

par(mfrow = c(1, 2))
s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
s.arrow(betw$c1)

The correlation between markers can alternatively be represented as network. It has basically the same information as the righthand plot above. Orange indicates positive correlation, blue represents negative correlation. The thickness of the line indicates the strength of association.

pc <- glasso(cor(L), rho = 0.1)$wi
colnames(pc) <- colnames(L)
rownames(pc) <- colnames(L)
#pheatmap(pc)

G <- graph_from_adjacency_matrix(pc, mode = 'undirected', weighted = TRUE, diag = FALSE)
E(G)$col <- 1
E(G)$col[E(G)$weight > 0] <- 2
E(G)$width <- abs(E(G)$weight)
plot(G, edge.color = E(G)$col, edge.width = E(G)$width*15)

It is also possible to look at differences within the three broad categories gut, lung and virus.

pca_gut <- dudi.pca(L[stim_type == 'gut', ], scan = FALSE, nf = 8)
betw_gut <- bca(pca_gut, factor(stim[sample_fac$stim][stim_type == 'gut']), scan = FALSE, nf = 4)
par(mfrow = c(1, 2))
s.class(betw_gut$ls, factor(stim[sample_fac$stim][stim_type == 'gut']))
s.arrow(betw_gut$c1)
pca_lung <- dudi.pca(L[stim_type == 'lung', ], scan = FALSE, nf = 8)
betw_lung <- bca(pca_lung, factor(stim[sample_fac$stim][stim_type == 'lung']), scan = FALSE, nf = 4)
par(mfrow = c(1, 2))
s.class(betw_lung$ls, factor(stim[sample_fac$stim][stim_type == 'lung']))
s.arrow(betw_lung$c1)
pca_virus <- dudi.pca(L[stim_type == 'virus', ], scan = FALSE, nf = 8)
par(mfrow = c(1, 2))
s.class(pca_virus$li, factor(stim[sample_fac$stim][stim_type == 'virus']))
s.arrow(pca_virus$c1)



# bp <- make_biplot(svd(L), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F,  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 2]*2, lwd = 2, length = 0.1)
# text(bp$F, labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G*2.5,  type = 'n')
# text(bp$G*2, labels = c(markers[-8], 'unst'))
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 2]*1.5, lwd = 2)
# 
# bp <- make_biplot(svd(L), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F[, c(1, 3)],  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 3]*2, lwd = 2, length = 0.1)
# text(bp$F[, c(1, 3)], labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G[, c(1, 3)]*2.5,  type = 'n')
# text(bp$G[, c(1, 3)]*2, labels = c(markers[-8], 'unst'))
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 3]*1.5, lwd = 2)

Single cell coexpression

Normalised to average number over all samples

For every pair of markers, a matrix is constructed with ratio of (at least) double positive cells to the average. The log is taken so when a sample has more than average the value is positive, and when less than average the value is negative. Some combinations are very rare, so if the count is zero in more than 25% of samples, the combination is removed.

# make_cell_coexp <- function(burt, markers) {
#   
#   burt_all <- do.call(rbind, burt)
#   N <- ncol(burt_all) - 2
#   
# double_all <- unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) {
#   sum(burt_all[burt_all[, i] == 2 & burt_all[, j] == 2, 'counts'])
#   }
#   )
#   )
# )/length(burt)
# 
# double_pos <- t(sapply(1:length(burt), function(k) {
#   unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) {
#   sum(burt[[k]][burt[[k]][, i] == 2 & burt[[k]][, j] == 2, 'counts'])
#   }
#   )
#   )
# )
# }
# )
# )
# cn <- unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) paste(markers[j], markers[i], sep = '_'))))
# colnames(double_pos) <- cn
# double_pos <- sweep(double_pos, 2, double_all, '/')
# double_pos[double_pos == 0] <- 1
# return(double_pos)
# 
# }


# C <- cor(L)


# make_twoD_marginal <- function(burt, markers) {
#   
#   N <- ncol(burt[[1]]) - 2
# P <- t(sapply(1:length(burt), function(k) {
#   unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) {
#   sum(burt[[k]][burt[[k]][, i] == 2 & burt[[k]][, j] == 2, 'counts'])/(sum(burt[[k]][burt[[k]][, i] == 2, 'counts']) * sum(burt[[k]][burt[[k]][, j] == 2, 'counts']))
#   }
#   )
#   )
# )
# }
# )
# )
# 
# cn <- unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) paste(markers[j], markers[i], sep = '_'))))
# 
# colnames(P) <- cn
# P[is.na(P)] <- 1
# P[P == 0] <- 1
# return(P)
# 
# }

cell_coexp <- make_cell_coexp(burt, markers[1:7])
twoD_marginal <- make_twoD_marginal(burt, markers[1:7])


L2 <- log(cell_coexp)
RM <- do.call(rbind, replicate(8,  by(L2, sample_fac$donor, colMeans)))
L2 <- L2 - RM

missing_vals <- apply(L2, 2, function(x) length(which(x == 0)))/length(burt)
#L2 <- L2[, missing_vals < 0.25]

#pheatmap(cor(L2))

It might be naively expected that marker counts might drive the double positive marker counts. Specifically, it might be expected that markers which are coexpressed in the sample might also be coexpressed at the single cell level. If not, what is driving the single cell coexpression?

To investigate this, first regress the double positive counts on the pair of total counts. For example, regress the IL17IL22 double positive counts on both IL17 total counts and IL22 total counts simultaneously. Then plot the $R^2$ value of this regression against the absolute value of correlation between the total marker counts (e.g. correlation between IL17 total counts and IL22 total counts). Correlation which is originally negative is plotted in black.

It can be seen that only 3 pairs are well-explained by their parent counts: IL17-IL22, IFNg-GMCSF, and IL4-IL5. These pairs correspond to the Thelper subsets. This is potentially a valuable method for filtering double positive counts when there is a larger number of markers and/or uncharacterised markers.

# rsq <- 
# lapply(1:6, function(i) lapply((i+1):7, function(j) {
# #   i = 5
# #   j = 6
# #   which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))
# df <- data.frame(x1 = L[, i], x2 = L[, j], y = L2[, which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))])
# if (ncol(df) == 3) {
# lmodel <- lm(y ~ x1 + x2, df)
# summary(lmodel)$r.squared} else {0}
# } 
# )
# )
# 
# rsq <- unlist(rsq)

# make_rsq <- function(sample_coexp, cell_coexp, markers) { # can be donor-centered versions if necessary
#   N <- ncol(L) - 1
# rsq <- lapply(1:(N - 1), function(i) lapply((i+1):N, function(j) {
# #   i = 5
# #   j = 6
# #   which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))
# df <- data.frame(x1 = sample_coexp[, i], x2 = sample_coexp[, j], y = cell_coexp[, which(paste(colnames(sample_coexp)[j], colnames(sample_coexp)[i], sep= '_') == colnames(cell_coexp))])
# if (ncol(df) == 3) {
# lmodel <- lm(y ~ x1 + x2, df)
# summary(lmodel)$r.squared} else {0}
# } 
# )
# )
#
# rsq <- unlist(rsq)
# 
# cn <- unlist(lapply(1:(N - 1), function(i) sapply((i+1):N, function(j) paste(markers[j], markers[i], sep = '_'))))
# C <- cor(do.call(rbind, (by(L, factor(sample_fac$stim), colMeans))))
# cor_markers <- unlist(sapply(1:(N-1), function(i) C[(i+1):N, i]))
# names(cor_markers) <- cn
# 
# return(list(rsq = rsq, cor = cor_markers))
# }

rsq <- make_rsq(L, L2, markers[1:7])



par(mfrow = c(1, 1))
plot(abs(rsq$cor)[missing_vals < 0.25], rsq$rsq[missing_vals < 0.25], pch = 16, cex = 1, type = 'n', xlab = '|correlation coefficient|', ylab = 'R square')
text(abs(rsq$cor)[missing_vals < 0.25], rsq$rsq[missing_vals < 0.25], names(rsq$cor)[missing_vals < 0.25], col = sign(rsq$cor)[missing_vals < 0.25] +2)
abline(lm(rsq$rsq[missing_vals < 0.25] ~ abs(rsq$cor[missing_vals < 0.25])))

Perform a BCA on the double positive count matrix. The three categories gut, lung and virus don't necessarily group together, in particular, CMV and influenza have been separated.

#pc <- glasso(cor(L2[, -c(grep('IL5', colnames(L2)), grep('IL4', colnames(L2)))]), rho = 0.2)$wi
# pc <- glasso(cor(L2), rho = 0.3)$wi
# colnames(pc) <- colnames(L2)
# rownames(pc) <- colnames(L2)
# #pheatmap(pc)
# 
# 
# pc <- glasso(cor(C), rho = 0.5)$wi
# rownames(pc) <- colnames(C)
# colnames(pc) <- colnames(C)
# G <- graph_from_adjacency_matrix(pc, mode = 'undirected', weighted = TRUE, diag = FALSE)
# E(G)$col <- 1
# E(G)$col[E(G)$weight > 0] <- 2
# E(G)$width <- abs(E(G)$weight)
# par(mfrow = c(1, 1))
# plot(G, edge.color = E(G)$col, edge.width = E(G)$width*15)
# 

# bp <- make_biplot(svd(L2), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F,  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 2]*2, lwd = 2, length = 0.1)
# text(bp$F, labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G*2.5,  type = 'n')
# text(bp$G*2, labels = colnames(L2), cex = 0.5)
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 2]*1.5, lwd = 2, length = 0.1)

L2 <- L2[, missing_vals < 0.25]
pca <- dudi.pca(L2, scan = FALSE, nf = 8)
betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
par(mfrow = c(1, 2))
s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
s.arrow(betw$c1, clabel = 0.5)

#coinertia analysis, not sure what it brings
pca1 <- dudi.pca(L, scan = FALSE, nf = 8)
bet <- bca(pca, scan = FALSE, nf = 8, fac = factor(stim[sample_fac$stim]))
bet1 <- bca(pca1, scan = FALSE, nf = 8, fac = factor(stim[sample_fac$stim]))
coin <- coinertia(bet, bet1, scannf = FALSE)
plot(coin)

Visually inspecting the correlation between all double positive counts, there are two strong groups and one weaker group.

# cross correlation matrix
pheatmap(cor(L2))

# groups <- list(c(8, 7, 14), c(1, 5, 6, 11))
# groups[[3]] <- (1:14)[-unlist(groups)]
# 
# bp <- make_biplot(svd(L2[, groups[[1]]]), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F,  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 2]*2, lwd = 2, length = 0.1)
# text(bp$F, labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G*2.5,  type = 'n', xlim = c(0, 1.8))
# text(bp$G*2, labels = colnames(L2)[groups[[1]]], cex = 1)
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 2]*1.5, lwd = 2, length = 0.1)
# pca <- dudi.pca(L2[, groups[[1]]], scan = FALSE, nf = 8)
# betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
# par(mfrow = c(1, 2))
# s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
# s.arrow(betw$c1)
# 
# df <- lapply(1:8, function(i) data.frame(x = factor(colnames(L2)[groups[[1]]]), y = colMeans(L2[sample_fac$stim == i, groups[[1]]]), stim = stim[i]))
# df <- do.call(rbind, df)
# ggplot(df, aes(x = x, y = y, group = stim, colour = stim)) + geom_line()
# 
# 
# bp <- make_biplot(svd(L2[, groups[[2]]]), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F,  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 2]*2, lwd = 2, length = 0.1)
# text(bp$F, labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G*2.5,  type = 'n', xlim = c(0, 1.7))
# text(bp$G*2, labels = colnames(L2)[groups[[2]]], cex = 1)
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 2]*1.5, lwd = 2, length = 0.1)
# pca <- dudi.pca(L2[, groups[[2]]], scan = FALSE, nf = 8)
# betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
# par(mfrow = c(1, 2))
# s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
# s.arrow(betw$c1)
# 
# df <- lapply(1:8, function(i) data.frame(x = factor(colnames(L2)[groups[[2]]]), y = colMeans(L2[sample_fac$stim == i, groups[[2]]]), stim = stim[i]))
# df <- do.call(rbind, df)
# ggplot(df, aes(x = x, y = y, group = stim, colour = stim)) + geom_line()
# 
# 
# 
# bp <- make_biplot(svd(L2[, groups[[3]]]), 1, 3)
# #png('/Users/kristenfeher/Desktop/biplot_donors.png', height = 400, width= 700)
# par(mfrow = c(1, 2))
# plot(bp$F,  type= 'n', xlab = 'biplot axis 1', ylab = 'biplot axis 2', cex.lab = 1.5, main = 'biplot of donors', col = metadata[, 3], pch = 16)
# arrows(0, 0, bp$G[, 1]*2, bp$G[, 2]*2, lwd = 2, length = 0.1)
# text(bp$F, labels = stim[sample_fac$stim], col = sample_fac$stim)
# plot(bp$G*2.5,  type = 'n')
# text(bp$G*2, labels = colnames(L2)[groups[[3]]], cex = 1)
# arrows(0, 0, bp$G[, 1]*1.5, bp$G[, 2]*1.5, lwd = 2, length = 0.1)
# pca <- dudi.pca(L2[, groups[[3]]], scan = FALSE, nf = 8)
# betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
# par(mfrow = c(1, 2))
# s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
# s.arrow(betw$c1)
# 
# df <- lapply(1:8, function(i) data.frame(x = factor(colnames(L2)[groups[[3]]]), y = colMeans(L2[sample_fac$stim == i, groups[[3]]]), stim = stim[i]))
# df <- do.call(rbind, df)
# ggplot(df, aes(x = x, y = y, group = stim, colour = stim)) + geom_line()
# 
# df <- lapply(1:8, function(i) data.frame(x = factor(colnames(L2)[unlist(groups)], levels = colnames(L2)[unlist(groups)]), y = colMeans(L2[sample_fac$stim == i, unlist(groups)]), stim = stim[i]))
# df <- do.call(rbind, df)
# ggplot(df, aes(x = x, y = y, group = stim, colour = stim)) + geom_line()

It is of interest to know the connection between double positive counts and total counts. To do this, calculate the cross-correlation matrix. The group structure of above is maintained, except interestingly, IFNg-GMSCF switches to group 1 (I don't know why yet). Then we can see:

C <- cor(cbind(L, L2))[1:8, 9:22]
pheatmap(C)

Just for interest, here is the correlation matrix of the above cross-correlation matrix. It is quite similar to the original correlation matrix above in terms of group structure, except for IFNg-GMCSF and GMCSF-IL10 which are weakly in group 1. GMCSF-IL10 somehow maintains a foot in group 3.

pheatmap(cor(C))

Single Cell coexpression

Normalised to expected number given total counts within sample

We can make another double positive matrix, taking into account the contingency table structure. Suppose in one sample, there are 10% IL17 cells, and 12% IL17 cells. If the cells are double positive at random, then there would be (0.1)(0.12) = 1.2% cells. In fact, if there are actually 5% cells, then the value for that sample in the IL17-IL22 column would be $log(5/1.2, 2) = 2.06$. This means there are 4 times the number of IL17-IL22 cells than expected, given the total IL17 and IL22 counts.

First plot BCA.

L3 <- log(twoD_marginal)
RM <- do.call(rbind, replicate(8,  by(L3, sample_fac$donor, colMeans)))
L3 <- L3 - RM
L3 <- L3[, missing_vals < 0.25]



pca <- dudi.pca(L3, scan = FALSE, nf = 8)
betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
#plot(betw, col = sample_fac$stim)

par(mfrow = c(1, 2))
#s.class(pca$li, factor(stim[sample_fac$stim]), col = sample_fac$stim)
s.class(betw$ls, factor(stim[sample_fac$stim]), col = sample_fac$stim)
s.arrow(betw$c1, clabel = 0.5)

# just for fun, all three matrices combined into one
# pca <- dudi.pca(cbind(L, L2, L3), scan = FALSE, nf = 8)
# betw <- bca(pca, factor(stim[sample_fac$stim]), scan = FALSE, nf = 3)
# #plot(betw, col = sample_fac$stim)
# 
# par(mfrow = c(1, 2))
# #s.class(pca$li, factor(stim[sample_fac$stim]), col = sample_fac$stim)
# s.class(betw$ls[, c(1, 2)], factor(stim[sample_fac$stim]), col = sample_fac$stim)
# #s.class(betw$ls[, c(1, 3)], factor(stim[sample_fac$stim]), col = sample_fac$stim)
# s.arrow(betw$c1[, c(1, 2)], clabel = 0.5)

Turn straight to the cross-correlation matrix

# cross correlation matrix
C <- cor(cbind(L, L3))[1:8, 9:22]
pheatmap(C)

pheatmap(cor(C))
# rsq <- 
# lapply(1:6, function(i) lapply((i+1):7, function(j) {
# #   i = 5
# #   j = 6
# #   which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))
# df <- data.frame(x1 = L[, i], x2 = L[, j], y = L2[, which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))])
# if (ncol(df) == 3) {
# lmodel <- lm(y ~ x1 + x2, df)
# summary(lmodel)$r.squared} else {0}
# } 
# )
# )
# 
# rsq <- unlist(rsq)
# 
# par(mfrow = c(1, 1))
# plot(abs(cor_markers)[missing_vals < 0.25], rsq[missing_vals < 0.25], pch = 16, cex = 1, type = 'n')
# text(abs(cor_markers)[missing_vals < 0.25], rsq[missing_vals < 0.25], names(cor_markers)[missing_vals < 0.25], col = sign(cor_markers)[missing_vals < 0.25] +2)
# abline(lmrob(rsq[missing_vals < 0.25] ~ abs(cor_markers[missing_vals < 0.25])))
# 
# rsq_pearson <- 
# lapply(1:6, function(i) lapply((i+1):7, function(j) {
# #   i = 5
# #   j = 6
# #   which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))
# df <- data.frame(x1 = L[, i], x2 = L[, j], y = L3[, which(paste(colnames(L)[j], colnames(L)[i], sep= '_') == colnames(L2))])
# if (ncol(df) == 3) {
# lmodel <- lm(y ~ x1 + x2, df)
# summary(lmodel)$r.squared} else {0}
# } 
# )
# )
# 
# rsq_pearson <- unlist(rsq_pearson)
# 
# par(mfrow = c(1, 1))
# plot(abs(cor_markers)[missing_vals < 0.25], rsq_pearson[missing_vals < 0.25], pch = 16, cex = 1, type = 'n')
# text(abs(cor_markers)[missing_vals < 0.25], rsq_pearson[missing_vals < 0.25], names(cor_markers)[missing_vals < 0.25], col = sign(cor_markers)[missing_vals < 0.25] +2)
# abline(lmrob(rsq_pearson[missing_vals < 0.25] ~ abs(cor_markers[missing_vals < 0.25])))


kristenmf/flowR documentation built on May 20, 2019, 6:15 p.m.