Nothing
## ---- message=FALSE, warning=FALSE, class.source = 'fold-hide'----------------
## Source required libraries
library(data.table)
library(tidyverse)
library(ggthemes)
library(ggrepel)
library(harmony)
library(patchwork)
library(tidyr)
## Useful util functions
cosine_normalize <- function(X, margin) {
if (margin == 1) {
res <- sweep(as.matrix(X), 1, sqrt(rowSums(X ^ 2)), '/')
row.names(res) <- row.names(X)
colnames(res) <- colnames(X)
} else {
res <- sweep(as.matrix(X), 2, sqrt(colSums(X ^ 2)), '/')
row.names(res) <- row.names(X)
colnames(res) <- colnames(X)
}
return(res)
}
onehot <- function(vals) {
t(model.matrix(~0 + as.factor(vals)))
}
colors_use <- c(`jurkat` = rgb(129, 15, 124, maxColorValue=255),
`t293` = rgb(208, 158, 45, maxColorValue=255),
`half` = rgb(0, 109, 44, maxColorValue=255))
do_scatter <- function(umap_use, meta_data, label_name, no_guides = TRUE, do_labels = TRUE, nice_names,
palette_use = colors_use,
pt_size = 4, point_size = .5, base_size = 10, do_points = TRUE, do_density = FALSE, h = 4, w = 8) {
umap_use <- umap_use[, 1:2]
colnames(umap_use) <- c('X1', 'X2')
plt_df <- umap_use %>% data.frame() %>%
cbind(meta_data) %>%
dplyr::sample_frac(1L)
plt_df$given_name <- plt_df[[label_name]]
if (!missing(nice_names)) {
plt_df %<>%
dplyr::inner_join(nice_names, by = "given_name") %>%
subset(nice_name != "" & !is.na(nice_name))
plt_df[[label_name]] <- plt_df$nice_name
}
plt <- plt_df %>%
ggplot(aes(X1, X2, colour = .data[[label_name]], fill = .data[[label_name]])) +
theme_tufte(base_size = base_size) +
theme(panel.background = element_rect(fill = NA, color = "black")) +
guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4)), alpha = FALSE) +
scale_color_manual(values = palette_use) +
scale_fill_manual(values = palette_use) +
theme(plot.title = element_text(hjust = .5)) +
labs(x = "UMAP 1", y = "UMAP 2")
if (do_points)
plt <- plt + geom_point(size = 0.2)
if (do_density)
plt <- plt + geom_density_2d()
if (no_guides)
plt <- plt + guides("none")
if (do_labels)
plt <- plt + geom_label_repel(data = data.table(plt_df)[, .(X1 = mean(X1), X2 = mean(X2)), by = label_name], label.size = NA,
aes(label = .data[[label_name]]), color = "white", size = pt_size, alpha = 1, segment.size = 0) +
guides(col = FALSE, fill = FALSE)
return(plt)
}
## -----------------------------------------------------------------------------
data(cell_lines)
V <- cell_lines$scaled_pcs
V_cos <- cosine_normalize(V, 1)
meta_data <- cell_lines$meta_data
## ---- warning=FALSE, fig.width=5, fig.height=3, fig.align="center"------------
do_scatter(V, meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Colored by dataset', x = 'PC1', y = 'PC2') +
do_scatter(V, meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Colored by cell type', x = 'PC1', y = 'PC2') +
NULL
## -----------------------------------------------------------------------------
set.seed(1)
harmonyObj <- harmony::RunHarmony(
data_mat = V, ## PCA embedding matrix of cells
meta_data = meta_data, ## dataframe with cell labels
theta = 1, ## cluster diversity enforcement
vars_use = 'dataset', ## variable to integrate out
nclust = 5, ## number of clusters in Harmony model
max_iter = 0, ## stop after initialization
return_object = TRUE ## return the full Harmony model object
)
## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_orig', subtitle = 'Euclidean distance', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_cos', subtitle = 'Induced Cosine distance', x = 'PC1', y = 'PC2')
## ---- fig.width=8, fig.height=3, out.width="100%"-----------------------------
harmonyObj$Z_cos %>% t %>% data.frame() %>%
cbind(meta_data) %>%
tidyr::gather(key, val, X1:X20) %>%
ggplot(aes(reorder(gsub('X', 'PC', key), as.integer(gsub('X', '', key))), val)) +
geom_boxplot(aes(color = dataset)) +
scale_color_manual(values = colors_use) +
labs(x = 'PC number', y = 'PC embedding value', title = 'Z_cos (unit scaled PCA embeddings) for all 20 PCs') +
theme_tufte(base_size = 10) + geom_rangeframe() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
cluster_centroids <- harmonyObj$Y
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = FALSE, do_labels = FALSE) +
labs(title = 'Initial kmeans cluster centroids', subtitle = '', x = 'PC1', y = 'PC2') +
geom_point(
data = data.frame(t(cluster_centroids)),
color = 'black', fill = 'black', alpha = .8,
shape = 21, size = 6
) +
NULL
## -----------------------------------------------------------------------------
cluster_assignment_matrix <- harmonyObj$R
## ---- fig.height=5, fig.width=5-----------------------------------------------
t(harmonyObj$Z_cos) %>% data.frame() %>%
cbind(meta_data) %>%
tibble::rowid_to_column('id') %>%
dplyr::inner_join(
cluster_assignment_matrix %>% t() %>% data.table() %>%
tibble::rowid_to_column('id') %>%
tidyr::gather(cluster, r, -id) %>%
dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)),
by = 'id'
) %>%
dplyr::sample_frac(1L) %>%
ggplot(aes(X1, X2, color = r)) +
geom_point(size=0.2) +
theme_tufte(base_size = 10) + theme(panel.background = element_rect()) +
facet_grid(cluster ~ dataset) +
scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) +
labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'Initial probabilistic cluster assignments')
## -----------------------------------------------------------------------------
observed_counts <- with(harmonyObj, R %*% t(as.matrix(Phi)))
round(observed_counts)
## -----------------------------------------------------------------------------
## observed counts
round(harmonyObj$O)
## observed counts
round(harmonyObj$E)
## -----------------------------------------------------------------------------
phi_celltype <- onehot(meta_data$cell_type)
observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
round(observed_cell_counts)
## -----------------------------------------------------------------------------
harmonyObj$max_iter_kmeans
## -----------------------------------------------------------------------------
## we can specify how many rounds of clustering to do
harmonyObj$max_iter_kmeans <- 10
harmonyObj$cluster_cpp()
## -----------------------------------------------------------------------------
round(harmonyObj$O)
## ---- fig.height=5, fig.width=5-----------------------------------------------
new_cluster_assignment_matrix <- harmonyObj$R
t(harmonyObj$Z_cos) %>% data.frame() %>%
cbind(meta_data) %>%
tibble::rowid_to_column('id') %>%
dplyr::inner_join(
new_cluster_assignment_matrix %>% t() %>% data.table() %>%
tibble::rowid_to_column('id') %>%
tidyr::gather(cluster, r, -id) %>%
dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)),
by = 'id'
) %>%
dplyr::sample_frac(1L) %>%
ggplot(aes(X1, X2, color = r)) +
geom_point(shape = '.') +
theme_tufte(base_size = 10) + theme(panel.background = element_rect()) +
facet_grid(cluster ~ dataset) +
scale_color_gradient(low = 'lightgrey', breaks = seq(0, 1, .1)) +
labs(x = 'Scaled PC1', y = 'Scaled PC2', title = 'New probabilistic cluster assignments')
## -----------------------------------------------------------------------------
phi_celltype <- onehot(meta_data$cell_type)
observed_cell_counts <- harmonyObj$R %*% t(phi_celltype)
round(observed_cell_counts)
## -----------------------------------------------------------------------------
round(apply(prop.table(observed_cell_counts, 1), 1, min) * 100, 3)
## -----------------------------------------------------------------------------
with(harmonyObj, {
distance_matrix <- 2 * (1 - t(Y) %*% Z_cos)
distance_score <- exp(-distance_matrix / as.numeric(sigma))
diversity_score <- sweep(E / O, 2, theta, '/') %*% as.matrix(Phi)
## new assignments are based on distance and diversity
R_new <- distance_score * diversity_score
## normalize R so each cell sums to 1
R_new <- prop.table(R_new, 2)
})
## -----------------------------------------------------------------------------
## with theta = 0
with(harmonyObj, {
(E / O) ^ 0
})
## -----------------------------------------------------------------------------
## with theta = 1
with(harmonyObj, {
round((E / O) ^ 1, 2)
})
## -----------------------------------------------------------------------------
## as theta approach infinity
with(harmonyObj, {
round((E / O) ^ 1e6, 2)
})
## -----------------------------------------------------------------------------
Y_unscaled <- with(harmonyObj, Z_cos %*% t(R))
## -----------------------------------------------------------------------------
Y_new <- cosine_normalize(Y_unscaled, 2)
## -----------------------------------------------------------------------------
harmonyObj$moe_correct_ridge_cpp()
## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
do_scatter(cosine_normalize(t(harmonyObj$Z_orig), 1), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_cos before MoE', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_cos after MoE', x = 'PC1', y = 'PC2')
## ---- fig.width=8, fig.height=3, fig.align="center", out.width="100%"---------
do_scatter(t(harmonyObj$Z_orig), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_orig', subtitle = 'Original PCA embeddings', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_corr), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_corr', subtitle = '= Z_orig - correction_factors', x = 'PC1', y = 'PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = 'Z_cos', subtitle = '= Unit_scaled(Z_corr)', x = 'Scaled PC1', y = 'Scaled PC2') +
NULL
## ---- fig.width=5, fig.height=3, fig.align="center"---------------------------
plt <- data.table(PC1_After = harmonyObj$Z_corr[1, ], PC1_Before = harmonyObj$Z_orig[1, ]) %>%
cbind(meta_data) %>%
dplyr::sample_frac(1L) %>%
ggplot(aes(PC1_Before, PC1_After)) +
geom_abline(slope = 1, intercept = 0) +
theme_tufte(base_size = 10) + geom_rangeframe() +
scale_color_tableau() +
guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16, size = 4))) +
NULL
plt + geom_point(shape = '.', aes(color = dataset)) +
labs(x = 'PC1 before correction', y = 'PC1 after correction',
title = 'PC1 correction for each cell', subtitle = 'Colored by Dataset') +
plt + geom_point(shape = '.', aes(color = cell_type)) +
labs(x = 'PC1 before correction', y = 'PC1 after correction',
title = 'PC1 correction for each cell', subtitle = 'Colored by Cell Type') +
NULL
## ---- echo=TRUE---------------------------------------------------------------
W <- list()
## Convert sparse data structures to dense matrix
Phi.moe <- as.matrix(harmonyObj$Phi_moe)
lambda <- diag(c(harmonyObj$lambda))
## Get beta coeeficients for all the clusters
for (k in 1:harmonyObj$K) {
W[[k]] <- solve(Phi.moe %*% diag(harmonyObj$R[k, ]) %*% t(Phi.moe) + lambda) %*% (Phi.moe %*% diag(harmonyObj$R[k, ])) %*% t(harmonyObj$Z_orig)
}
## ---- fig.width=5, fig.height=5-----------------------------------------------
cluster_assignment_matrix <- harmonyObj$R
t(harmonyObj$Z_orig) %>% data.frame() %>%
cbind(meta_data) %>%
tibble::rowid_to_column('id') %>%
dplyr::inner_join(
cluster_assignment_matrix %>% t() %>% data.table() %>%
tibble::rowid_to_column('id') %>%
tidyr::gather(cluster, r, -id) %>%
dplyr::mutate(cluster = gsub('V', 'Cluster ', cluster)),
by = 'id'
) %>%
dplyr::sample_frac(1L) %>%
ggplot(aes(X1, X2, color = r)) +
geom_point(shape = 0.2) +
theme_tufte(base_size = 10) + theme(panel.background = element_rect()) +
facet_grid(cluster ~ dataset) +
scale_color_gradient(low = 'grey', breaks = seq(0, 1, .2)) +
labs(x = 'PC1', y = 'PC2', title = 'Cluster assigned in original PCA space (Z_orig)')
## -----------------------------------------------------------------------------
plt_list <- lapply(1:harmonyObj$K, function(k) {
plt_df <- W[[k]] %>% data.frame() %>%
dplyr::select(X1, X2)
## Append n
plt_df <- plt_df %>%
cbind(
data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>%
dplyr::rename(x0 = X1, y0 = X2)
) %>%
cbind(type = c('intercept', unique(meta_data$dataset)))
plt <- plt_df %>%
ggplot() +
geom_point(aes(X1, X2),
data = t(harmonyObj$Z_orig) %>% data.frame(),
size = 0.5,
color = 'grey'
) +
geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) +
scale_color_manual(values = c('intercept' = 'black', colors_use)) +
theme_tufte(base_size = 10) + theme(panel.background = element_rect()) +
labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))
# if (k == harmonyObj$K) {
# } else {
# plt <- plt + guides(color = FALSE)
# }
plt
})
## ---- fig.height=6, fig.width=6-----------------------------------------------
Reduce(`+`, plt_list) +
patchwork::plot_annotation(title = 'Mixture of experts beta terms before correction (Z_orig)') +
plot_layout(ncol = 2)
## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
plt_list <- lapply(1:harmonyObj$K, function(k) {
plt_df <- W[[k]] %>% data.frame() %>%
dplyr::select(X1, X2)
plt_df <- plt_df %>%
cbind(
data.frame(t(matrix(unlist(c(c(0, 0), rep(plt_df[1, ], 3))), nrow = 2))) %>%
dplyr::rename(x0 = X1, y0 = X2)
) %>%
cbind(type = c('intercept', unique(meta_data$dataset)))
plt <- plt_df %>%
ggplot() +
geom_point(aes(X1, X2),
data = t(harmonyObj$Z_corr) %>% data.frame(),
shape = '.',
color = 'grey'
) +
geom_segment(aes(x = x0, y = y0, xend = X1 + x0, yend = X2 + y0, color = type), linewidth=1) +
scale_color_manual(values = c('intercept' = 'black', colors_use)) +
theme_tufte(base_size = 10) + theme(panel.background = element_rect()) +
labs(x = 'PC 1', y = 'PC 2', title = sprintf('Cluster %d', k))
plt <- plt + guides(color = guide_legend(override.aes = list(stroke = 1, alpha = 1, shape = 16)))
plt
})
## ---- fig.height=6, fig.width=6-----------------------------------------------
Reduce(`+`, plt_list) +
patchwork::plot_annotation(title = 'Mixture of experts beta terms after correction (Z_corr)') +
plot_layout(ncol = 2)
## ---- echo=TRUE---------------------------------------------------------------
Z_i <- harmonyObj$Z_orig[, 5]
Z_i_pred <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
W[[k]] * harmonyObj$Phi_moe[, 5] * harmonyObj$R[k, 5]
})) %>% colSums
## ---- fig.width=4, fig.height=3, fig.align="center"---------------------------
data.table(obs = Z_i, pred = Z_i_pred) %>%
tibble::rowid_to_column('PC') %>%
ggplot(aes(obs, pred)) +
geom_point(shape = 21) +
geom_label_repel(aes(label = PC)) +
geom_abline(slope = 1, intercept = 0) +
theme_tufte() + geom_rangeframe() +
labs(x = 'Observed PC score', 'Predicted PC score', title = 'Observed and predicted values of PC scores\nfor cell 5') +
NULL
## -----------------------------------------------------------------------------
delta <- Reduce(`+`, lapply(1:harmonyObj$K, function(k) {
W[[k]][2:4, ] * harmonyObj$Phi[, 5] * harmonyObj$R[k, 5]
})) %>% colSums
Z_corrected <- harmonyObj$Z_orig[, 5] - delta
## ---- fig.width=3, fig.height=3, fig.align="center"---------------------------
harmonyObj$Z_orig %>% t %>% data.frame() %>%
ggplot(aes(X1, X2)) +
geom_point(shape = '.') +
geom_point(
data = data.frame(t(harmonyObj$Z_orig[, 5, drop = FALSE])),
color = 'red'
) +
geom_segment(
data = data.table(x0 = harmonyObj$Z_orig[1, 5],
y0 = harmonyObj$Z_orig[2, 5],
x1 = Z_corrected[1],
y1 = Z_corrected[2]),
aes(x = x0, y = y0, xend = x1, yend = y1),
linewidth = 1,
color = 'red',
arrow = arrow(length = unit(0.05, "npc"), type = 'closed')
) +
theme_tufte(base_size = 10) + geom_rangeframe() +
labs(x = 'PC1', y = 'PC2', title = 'Correction of cell #5')
## -----------------------------------------------------------------------------
harmonyObj <- RunHarmony(
data_mat = V, ## PCA embedding matrix of cells
meta_data = meta_data, ## dataframe with cell labels
theta = 1, ## cluster diversity enforcement
vars_use = 'dataset', ## (list of) variable(s) we'd like to Harmonize out
nclust = 50, ## number of clusters in Harmony model
max_iter = 0, ## don't actually run Harmony, stop after initialization
return_object = TRUE ## return the full Harmony model object, not just the corrected PCA matrix
)
## ---- message=FALSE, fig.width=5, fig.height=3, fig.align="center"------------
i <- 0
do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) +
labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
NULL
## ---- fig.width=5, fig.height=3, fig.align="center", message=FALSE------------
for (i in 1:2) {
harmony:::harmonize(harmonyObj, 1)
plt <- do_scatter(t(harmonyObj$Z_cos), meta_data, 'dataset', no_guides = TRUE, do_labels = TRUE) +
labs(title = sprintf('Round %d', i), subtitle = 'Colored by dataset', x = 'Scaled PC1', y = 'Scaled PC2') +
do_scatter(t(harmonyObj$Z_cos), meta_data, 'cell_type', no_guides = TRUE, do_labels = TRUE) +
labs(title = sprintf('Round %d', i), subtitle = 'Colored by cell type', x = 'Scaled PC1', y = 'Scaled PC2') +
NULL
plot(plt)
}
## -----------------------------------------------------------------------------
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.