knitr::opts_chunk$set(warning = FALSE) library(TapestriR) library(tidyverse) theme_set(theme_bw()) library(patchwork) packages <- c("factoextra", "NbClust", 'uwot', 'ComplexHeatmap') for(i in 1:length(packages)) { package = packages[i] if (!requireNamespace(package, quietly = TRUE)) print(package) #install.packages(package) require(package=package, character.only=TRUE) }
We will analyze single cell DNA and protein sequencing data from the Tapestri Platform to demonstrate the ability to detect both CNVs and SNVs simultaneously and see how the expression of proteins is changing in different cell populations that are mutant or not.
#replace filename with path to your data filename <- system.file("extdata", "4_cell_line_mix_dna_protein.h5", package = "TapestriR") experiment = read_tap(filename) experiment
Best practice is to create a multi-assay, multi-sample h5 in Pipeline, and apply filters before loading into R. We've done some filtering here. Notice how the number of cells and features change after filtering.
# Ideally, we would start by loading a filtered H5, but for now, we load data with some basic filters. filtered_variants = filter_variants(experiment$assays$dna_variants) # Add the filtered data back to the experiment. This will be a subset of the rest of the assays to ensure that we have the same cells. experiment = add_assay(experiment,filtered_variants, keep_common_cells = TRUE) experiment
Picking the right set of variants is critical.
#manually select the rest of variants good_variants = experiment$assays$dna_variants$feature_annotations %>% filter(QUAL > 30000) #Manual filters manually_filtered_variants = subset_assay(assay = experiment$assays$dna_variants, keep_feature_ids = good_variants$id) experiment = add_assay(experiment,manually_filtered_variants, keep_common_cells = TRUE) experiment #DT::datatable(experiment$assays$dna$feature_annotations)
1) Load Protein assay and normalize. 2) Load DNA read counts.
# Normalize the protein data using the clr method. protein_counts_norm = experiment$assays$protein_read_counts$data_layers$read_counts %>% clr_by_feature() %>% as_tibble(rownames = NA) # Add the normalized data to the protein assay. experiment$assays$protein_read_counts = add_data_layer(experiment$assays$protein_read_counts,'normalized',protein_counts_norm) # Normalize the DNA read counts. normalized_dna_reads = normalize_dna_reads(experiment$assays$dna_read_counts$data_layers$read_counts) experiment$assays$dna_read_counts = add_data_layer(experiment$assays$dna_read_counts,'normalized',normalized_dna_reads)
1) Select the proteins to plot on the X and Y axes. 2) Select the set of other feature(s) to color the plot by. If you choose more than one feature, each feature will be plotted in a subplot.
################## # Select the proteins to plot on X and Y. ################## # protein_x = 'CD34' # protein_y = 'CD38' protein_x = 1 protein_y = 2 ################## # Select 1 or more features to color by. # color_by should be a vector of the column header you want to color by. ################## # All proteins color_by = experiment$assays$protein_read_counts$data_layers$normalized # Select a few proteins. # color_by = experiment$assays$protein_read_counts$data_layers$normalized %>% select('CD110','CD117') # Select a few variants. # color_by = experiment$assays$dna$data_layers$NGT %>% select(1:10) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes) p = tapestri_scatterplot(x = experiment$assays$protein_read_counts$data_layers$normalized[[protein_x]], y= experiment$assays$protein_read_counts$data_layers$normalized[[protein_y]], color_by = color_by)+ scale_colour_gradient2(low="yellow", mid='grey', high="darkred") p = p + xlab(protein_x) + ylab(protein_y) + ggtitle('Color by Protein Expression') p # Select a few variants. color_by = experiment$assays$dna_variants$data_layers$NGT %>% select(1:10) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes) p = tapestri_scatterplot(x = experiment$assays$protein_read_counts$data_layers$normalized[[protein_x]], y= experiment$assays$protein_read_counts$data_layers$normalized[[protein_y]], color_by = color_by) p = p + xlab(protein_x) + ylab(protein_y) + ggtitle('Color by Genotypes') p
We will start with the genotype data and explore also how we can identify clones
based on genotypes, group each cell into subclones as Tapestri Insights does.
#define subclones based on NGT values similar to TI ngt_clones = define_subclones(experiment$assays$dna_variants, ignore_zygosity = TRUE) experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'TI subclone', ngt_clones$subclone_label)
First, let's reduce the dimensions and see how this will be displayed in a UMAP plot.
data = experiment$assays$dna_variants$data_layers$AF projections = list() # Dimensional reduction using umap. set.seed(111) umap_values <- umap(data, scale=TRUE, metric="manhattan", init="laplacian", pca=20) projections[['Projection:umap_manhattan DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2]) # umap_values <- umap(data, scale=TRUE, metric="euclidean", init="laplacian", pca=20) # projections[['Projection:umap_euclidean DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2]) # # umap_values <- umap(data, scale=TRUE, metric="cosine", init="laplacian", pca=20) # projections[['Projection:umap_cosine DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2]) experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'projections', as_tibble(projections))
Visualize the projection
# Show a simple plot of the projection. projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` ggplot(data=projection) + geom_point(aes(x = x, y = y), alpha = 0.5, size=0.8)
In this step, we determine the optimal number of clusters for our dataset using k-means or hierarchical clustering. This is not a trivial problem. It is left to the user to explore the data. Here are a few examples for determining the number of clusters in your data.
cluster_on = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` # Elbow method elbow = fviz_nbclust(cluster_on, kmeans, method = "wss") + labs(subtitle = "Elbow method") # Silhouette method silhouette = fviz_nbclust(cluster_on, kmeans, method = "silhouette")+ labs(subtitle = "Silhouette method") # Gap statistic # nboot = 50 to keep the function speedy. # Recommended value: nboot= 500 for your analysis # Use verbose = FALSE to hide computing progression # set.seed(123) # gap_stat = fviz_nbclust(cluster_on, kmeans, nstart = 25, method = "gap_stat", nboot = 25)+ # labs(subtitle = "Gap statistic method") (elbow + silhouette) #/ # (gap_stat + plot_spacer())
Clustering the data is a not a trival task. Clustering can be done one umap projection or the raw data. There are numerous clustering methods with parameters that need optimization for your data
It is left to use to explore the rich community of R packages for clustering. Here we show clustering by kmeans on a umap projection.
# Hold all the different customer labels in a single structure. cluster_by = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` clusters = list() # Do the clustering. for(i in 2:10) { kmean_values <- kmeans(cluster_by, i ,iter.max=500) clusters[[paste0('umap.kmean.cluster.',i)]] = as_factor(kmean_values$cluster) } ############# # Add cluster labels to the analysis data structure. ############# experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'umap_vaf_clusters', as_tibble(clusters))
In this example we're plotting the TI subclone labels on the the umap projection on VAF data.
color_by can be a data.frame of multiple clustering methods and all will be plotted side by side
color_by = experiment$assays$dna_variants$analysis_layers$`TI subclone` projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` p = tapestri_scatterplot( x = projection$x, y= projection$y, color_by = color_by) p = p + umap_theme() + ggtitle('umap_vaf_clusters') p
# %>% select(!contains('chr2:198267')) color_by = experiment$assays$dna_variants$data_layers$NGT %>% select(1:20) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes) projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` p = tapestri_scatterplot( x = projection$x, y= projection$y, color_by = color_by) p = p + umap_theme() p = p + ggtitle('Color by Genotypes') p
color_by = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4 projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` p = tapestri_scatterplot( x = projection$x, y= projection$y, color_by = color_by) p = p + umap_theme() + ggtitle('umap_vaf_clusters') + theme(legend.position = 'none') v = tapestri_violinplot(clusters = color_by, features = experiment$assays$protein_read_counts$data_layers$normalized) v = v + theme_bw() + theme(legend.position = "none", axis.text.x = element_text(angle = 90, hjust = 1)) # pathwork magic p / v
color_by = experiment$assays$protein_read_counts$data_layers$normalized projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf` p = tapestri_scatterplot( x = projection$x, y= projection$y, color_by = color_by) p = p + umap_theme() + scale_colour_gradient2(low="yellow", mid='grey', high="darkred") p = p + ggtitle('Color by Proteins') p
Next we plot the SNVs for each filtered variant across all cells in heatmap format. We're only providing a simple example to get started.
Users should become familiar with ComplexHeatmap::Heatmap https://jokergoo.github.io/ComplexHeatmap-reference/book/.
# Order the features in chr order. variant_order = experiment$assays$dna_variants$feature_annotations %>% mutate(CHROM = as.numeric(CHROM), POS = as.numeric(POS)) %>% arrange(CHROM, POS) genotypes.mat = experiment$assays$dna_variants@data_layers$NGT %>% select(variant_order$id) clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4 snv.h <- ComplexHeatmap::Heatmap( as.matrix(genotypes.mat), name = "GT", col = c("lightgrey", "yellow", "blue", "black"), #circlize::colorRamp2(c(0, 1, 2, 3), c("grey", "yellow", "blue", "black")) heatmap_legend_param = list(labels = c("WT", "HET", "HOM", "Missing")), split = factor(clusters), cluster_rows = FALSE, show_row_names = FALSE, cluster_columns = FALSE, row_title_gp = grid::gpar(fontsize = 6), column_names_gp = grid::gpar(fontsize = 8), show_column_dend = FALSE ) snv.h
Let's explore our protein count data. As we identify 4 clusters using the genotype data, we define also here 4 clones too.
# Dimensional reduction using umap set.seed(111) umap_values <- umap(experiment$assays$protein_read_counts$data_layers$normalized, scale=TRUE, metric="euclidean", init="laplacian", pca=20) umap_layer = tibble( x = umap_values[,1], y = umap_values[,2] ) experiment$assays$protein_read_counts = add_analysis_layer(assay = experiment$assays$protein_read_counts, layer_name = 'umap', umap_layer) # Hold all the different customer labels in a single structure. # cluster_by = experiment$assays$protein_read_counts$analysis_layers$umap cluster_by = experiment$assays$protein_read_counts$data_layers$normalized clusters = list() # Do the clustering. for(i in 2:10) { kmean_values <- kmeans(cluster_by, i ,iter.max=500) clusters[[paste0('umap.kmean.cluster.',i)]] = as_factor(kmean_values$cluster) } # Add cluster labels to the analysis data structure. experiment$assays$protein_read_counts = add_analysis_layer(assay = experiment$assays$protein_read_counts, layer_name = 'clusters', as_tibble(clusters))
Plot all the different clusters on the same umap projection.
p = tapestri_scatterplot( x = experiment$assays$protein_read_counts$analysis_layers$umap$x, y= experiment$assays$protein_read_counts$analysis_layers$umap$y, color_by = experiment$assays$protein_read_counts$analysis_layers$clusters) p = p + xlab('') + ylab('') p = p + umap_theme() p
To do:
p_umap = tapestri_scatterplot( x = experiment$assays$protein_read_counts$analysis_layers$umap$x, y= experiment$assays$protein_read_counts$analysis_layers$umap$y, color_by = experiment$assays$protein_read_counts$analysis_layers$clusters$umap.kmean.cluster.4) + xlab('') + ylab('') + umap_theme() p_violin = tapestri_violinplot( clusters = experiment$assays$protein_read_counts$analysis_layers$clusters$umap.kmean.cluster.4, features = experiment$assays$protein_read_counts$data_layers$normalized) p_umap / p_violin
p = tapestri_scatterplot( x = experiment$assays$protein_read_counts$analysis_layers$umap$x, y= experiment$assays$protein_read_counts$analysis_layers$umap$y, color_by = experiment$assays$protein_read_counts$data_layers$normalized) p = p + umap_theme() + scale_colour_gradient2(low="yellow", mid='grey', high="darkred") p
To do:
Users should become familiar with ComplexHeatmap::Heatmap https://jokergoo.github.io/ComplexHeatmap-reference/book/. We're only providing a simple example to get started.
# Order the features in chr order. protein.mat = experiment$assays$protein_read_counts$data_layers$normalized clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4 protein.h <- ComplexHeatmap::Heatmap( as.matrix(protein.mat), name = "Protein", cluster_rows = FALSE, cluster_columns = FALSE, col = circlize::colorRamp2(c(-2, 0, 2), c("yellow", "grey", "blue")), split = factor(clusters), show_row_names = FALSE, row_title_gp = grid::gpar(fontsize = 6), #heatmap_legend_param=legend_params, column_names_gp = grid::gpar(fontsize = 8), show_column_dend = FALSE ) protein.h
Usage:
# Select a clustering method that best represents your data and identify the normal cluster. # Normalize all the read counts based on the normal cluster. cnv.mat = compute_ploidy( reads = experiment$assays$dna_read_counts$data_layers$normalized, clusters = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4, baseline_cluster = 2 ) experiment$assays$dna_read_counts <- add_analysis_layer(assay = experiment$assays$dna_read_counts,layer_name = 'norm_to_baseline',data = cnv.mat)
From the heatmap we can clearly see the regions where there is loss of heterozygosity and ploidy from 2 is reduced closer to 1.
cnv.mat = as.matrix(experiment$assays$dna_read_counts$analysis_layers$norm_to_baseline) cnv.mat[is.na(cnv.mat) | is.infinite(cnv.mat)] <-2 clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4 cnv.h <- ComplexHeatmap::Heatmap( as.matrix(cnv.mat), name = "CNV", col = circlize::colorRamp2(c(0, 2, 4), c("yellow", "grey", "blue")), split=factor(clusters), cluster_rows = FALSE, show_row_names=FALSE, cluster_columns = FALSE, row_title_gp = grid::gpar(fontsize = 6), column_names_gp = grid::gpar(fontsize=8), show_column_dend=FALSE) cnv.h
p = tapestri_ploidy_plot( normalized_reads = experiment$assays$dna_read_counts$analysis_layers$norm_to_baseline, clusters = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4 ) p
Combine SNV, CNV and Protein data into a single heatmap. A nice feature of Complexheatmaps package
snv.h + cnv.h + protein.h
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.