knitr::opts_chunk$set(warning = FALSE)

library(TapestriR)
library(tidyverse)
theme_set(theme_bw())
library(patchwork)


packages <- c("factoextra",  "NbClust", 'uwot', 'ComplexHeatmap')

for(i in 1:length(packages)) {
  package = packages[i]
  if (!requireNamespace(package, quietly = TRUE))
    print(package) #install.packages(package)

  require(package=package, character.only=TRUE)
}

Load multiomics H5

We will analyze single cell DNA and protein sequencing data from the Tapestri Platform to demonstrate the ability to detect both CNVs and SNVs simultaneously and see how the expression of proteins is changing in different cell populations that are mutant or not.

#replace filename with path to your data
filename <- system.file("extdata", "4_cell_line_mix_dna_protein.h5", package = "TapestriR")

experiment = read_tap(filename)

experiment

Filter variants

Best practice is to create a multi-assay, multi-sample h5 in Pipeline, and apply filters before loading into R. We've done some filtering here. Notice how the number of cells and features change after filtering.

# Ideally, we would start by loading a filtered H5, but for now, we load data with some basic filters.
filtered_variants = filter_variants(experiment$assays$dna_variants)

# Add the filtered data back to the experiment. This will be a subset of the rest of the assays to ensure that we have the same cells.
experiment = add_assay(experiment,filtered_variants, keep_common_cells = TRUE)

experiment

Filtered variants

Picking the right set of variants is critical.

#manually select the rest of variants
good_variants = experiment$assays$dna_variants$feature_annotations %>% filter(QUAL > 30000)

#Manual filters
manually_filtered_variants = subset_assay(assay = experiment$assays$dna_variants, keep_feature_ids = good_variants$id)
experiment = add_assay(experiment,manually_filtered_variants, keep_common_cells = TRUE)
experiment

#DT::datatable(experiment$assays$dna$feature_annotations)

Normalize the read counts

1) Load Protein assay and normalize. 2) Load DNA read counts.

# Normalize the protein data using the clr method.
protein_counts_norm = experiment$assays$protein_read_counts$data_layers$read_counts %>% clr_by_feature() %>% as_tibble(rownames = NA)

# Add the normalized data to the protein assay.
experiment$assays$protein_read_counts = add_data_layer(experiment$assays$protein_read_counts,'normalized',protein_counts_norm)

# Normalize the DNA read counts. 
normalized_dna_reads = normalize_dna_reads(experiment$assays$dna_read_counts$data_layers$read_counts)
experiment$assays$dna_read_counts = add_data_layer(experiment$assays$dna_read_counts,'normalized',normalized_dna_reads)

Explore the data {.tabset}

X-Y plots

1) Select the proteins to plot on the X and Y axes. 2) Select the set of other feature(s) to color the plot by. If you choose more than one feature, each feature will be plotted in a subplot.

##################
# Select the proteins to plot on X and Y.
##################

# protein_x = 'CD34'
# protein_y = 'CD38'

protein_x = 1
protein_y = 2

##################
# Select 1 or more features to color by.
# color_by should be a vector of the column header you want to color by.
##################

# All proteins
color_by = experiment$assays$protein_read_counts$data_layers$normalized

# Select a few proteins.
# color_by =  experiment$assays$protein_read_counts$data_layers$normalized %>% select('CD110','CD117')

# Select a few variants.
# color_by =  experiment$assays$dna$data_layers$NGT %>% select(1:10) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes)


p  = tapestri_scatterplot(x = experiment$assays$protein_read_counts$data_layers$normalized[[protein_x]], 
                 y= experiment$assays$protein_read_counts$data_layers$normalized[[protein_y]], 
                 color_by = color_by)+ scale_colour_gradient2(low="yellow", mid='grey', high="darkred") 
p = p + xlab(protein_x) + ylab(protein_y) + ggtitle('Color by Protein Expression')
p

# Select a few variants.
color_by =  experiment$assays$dna_variants$data_layers$NGT %>% select(1:10) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes)


p  = tapestri_scatterplot(x = experiment$assays$protein_read_counts$data_layers$normalized[[protein_x]], 
                 y= experiment$assays$protein_read_counts$data_layers$normalized[[protein_y]], 
                 color_by = color_by)
p = p + xlab(protein_x) + ylab(protein_y) + ggtitle('Color by Genotypes')
p

SNV

We will start with the genotype data and explore also how we can identify clones

determine clones similar to Tapestri Insights

based on genotypes, group each cell into subclones as Tapestri Insights does.

#define subclones based on NGT values similar to TI
ngt_clones = define_subclones(experiment$assays$dna_variants, ignore_zygosity = TRUE)

experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'TI subclone', ngt_clones$subclone_label)

Umap projection

First, let's reduce the dimensions and see how this will be displayed in a UMAP plot.

data = experiment$assays$dna_variants$data_layers$AF

projections = list()

# Dimensional reduction using umap.

set.seed(111)
umap_values <- umap(data, scale=TRUE, metric="manhattan", init="laplacian", pca=20) 
projections[['Projection:umap_manhattan DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2])

# umap_values <- umap(data, scale=TRUE, metric="euclidean", init="laplacian", pca=20) 
# projections[['Projection:umap_euclidean DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2])
# 
# umap_values <- umap(data, scale=TRUE, metric="cosine", init="laplacian", pca=20) 
# projections[['Projection:umap_cosine DR:pca data:vaf']] = tibble(x = umap_values[,1], y = umap_values[,2])

experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'projections', as_tibble(projections))

Visualize the projection

# Show a simple plot of the projection.
projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`
ggplot(data=projection) + geom_point(aes(x = x, y = y), alpha = 0.5, size=0.8)

Determine the number of clusters

In this step, we determine the optimal number of clusters for our dataset using k-means or hierarchical clustering. This is not a trivial problem. It is left to the user to explore the data. Here are a few examples for determining the number of clusters in your data.

cluster_on = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`

# Elbow method
elbow = fviz_nbclust(cluster_on, kmeans, method = "wss") +
  labs(subtitle = "Elbow method")

# Silhouette method
silhouette = fviz_nbclust(cluster_on, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")

# Gap statistic
# nboot = 50 to keep the function speedy. 
# Recommended value: nboot= 500 for your analysis
# Use verbose = FALSE to hide computing progression
# set.seed(123)
# gap_stat = fviz_nbclust(cluster_on, kmeans, nstart = 25,  method = "gap_stat", nboot = 25)+
#   labs(subtitle = "Gap statistic method")


(elbow + silhouette) #/ 
  # (gap_stat + plot_spacer())

Cluster on umap projection

Clustering the data is a not a trival task. Clustering can be done one umap projection or the raw data. There are numerous clustering methods with parameters that need optimization for your data

It is left to use to explore the rich community of R packages for clustering. Here we show clustering by kmeans on a umap projection.

# Hold all the different customer labels in a single structure.

cluster_by = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`
clusters = list()

# Do the clustering.

for(i in 2:10) {
  kmean_values <- kmeans(cluster_by, i ,iter.max=500)
  clusters[[paste0('umap.kmean.cluster.',i)]] = as_factor(kmean_values$cluster)
}

#############
# Add cluster labels to the analysis data structure.
#############
experiment$assays$dna_variants = add_analysis_layer(assay = experiment$assays$dna_variants, layer_name = 'umap_vaf_clusters', as_tibble(clusters))

Compare different clusters

In this example we're plotting the TI subclone labels on the the umap projection on VAF data.

color_by can be a data.frame of multiple clustering methods and all will be plotted side by side

color_by = experiment$assays$dna_variants$analysis_layers$`TI subclone`
projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`


p  = tapestri_scatterplot(
                 x = projection$x, 
                 y= projection$y, 
                 color_by = color_by)
p = p + umap_theme() + ggtitle('umap_vaf_clusters')
p

Color the UMAP by genotypes

 # %>% select(!contains('chr2:198267'))

color_by = experiment$assays$dna_variants$data_layers$NGT %>% select(1:20) %>% mutate_all(as_factor) %>% mutate_all(recode_genotypes)
projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`

p  = tapestri_scatterplot(
                 x = projection$x, 
                 y= projection$y, 
                 color_by = color_by)
p = p + umap_theme() 
p = p + ggtitle('Color by Genotypes')
p

Violin plots

color_by = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4
projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`

p  = tapestri_scatterplot(
                 x = projection$x, 
                 y= projection$y, 
                 color_by = color_by)
p = p + umap_theme() + ggtitle('umap_vaf_clusters') + theme(legend.position = 'none')


v = tapestri_violinplot(clusters = color_by,
               features = experiment$assays$protein_read_counts$data_layers$normalized)
v = v + theme_bw() + theme(legend.position = "none",
                            axis.text.x = element_text(angle = 90, hjust = 1))

# pathwork magic
p / v

Color the UMAP by proteins

color_by = experiment$assays$protein_read_counts$data_layers$normalized
projection = experiment$assays$dna_variants$analysis_layers$projections$`Projection:umap_manhattan DR:pca data:vaf`


p  = tapestri_scatterplot(
                 x = projection$x, 
                 y= projection$y, 
                 color_by = color_by)
p = p + umap_theme() + scale_colour_gradient2(low="yellow", mid='grey', high="darkred") 
p = p + ggtitle('Color by Proteins')
p

Single assay heatmaps

Next we plot the SNVs for each filtered variant across all cells in heatmap format. We're only providing a simple example to get started.

Users should become familiar with ComplexHeatmap::Heatmap https://jokergoo.github.io/ComplexHeatmap-reference/book/.

# Order the features in chr order.
variant_order = experiment$assays$dna_variants$feature_annotations %>%
  mutate(CHROM = as.numeric(CHROM), POS = as.numeric(POS)) %>%
  arrange(CHROM, POS)

genotypes.mat = experiment$assays$dna_variants@data_layers$NGT %>% select(variant_order$id)

clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4

snv.h <- ComplexHeatmap::Heatmap(
  as.matrix(genotypes.mat),
  name = "GT",
  col = c("lightgrey", "yellow", "blue", "black"),
  #circlize::colorRamp2(c(0, 1, 2, 3), c("grey", "yellow", "blue", "black"))

  heatmap_legend_param = list(labels = c("WT", "HET", "HOM", "Missing")),

  split = factor(clusters),

  cluster_rows = FALSE,
  show_row_names = FALSE,
  cluster_columns = FALSE,
  row_title_gp = grid::gpar(fontsize = 6),
  column_names_gp = grid::gpar(fontsize = 8),
  show_column_dend = FALSE
)

snv.h

Protein

Cluster by proteins

Let's explore our protein count data. As we identify 4 clusters using the genotype data, we define also here 4 clones too.

# Dimensional reduction using umap
set.seed(111)
umap_values <- umap(experiment$assays$protein_read_counts$data_layers$normalized, scale=TRUE, metric="euclidean", init="laplacian", pca=20) 

umap_layer = tibble(    
      x = umap_values[,1],
      y = umap_values[,2]
)

experiment$assays$protein_read_counts = add_analysis_layer(assay = experiment$assays$protein_read_counts, layer_name = 'umap', umap_layer)

# Hold all the different customer labels in a single structure.
# cluster_by = experiment$assays$protein_read_counts$analysis_layers$umap
cluster_by = experiment$assays$protein_read_counts$data_layers$normalized

clusters = list()

# Do the clustering.

for(i in 2:10) {
  kmean_values <- kmeans(cluster_by, i ,iter.max=500)
  clusters[[paste0('umap.kmean.cluster.',i)]] = as_factor(kmean_values$cluster)

}

# Add cluster labels to the analysis data structure.
experiment$assays$protein_read_counts = add_analysis_layer(assay = experiment$assays$protein_read_counts, layer_name = 'clusters', as_tibble(clusters))

Plot the UMAP and clusters

Plot all the different clusters on the same umap projection.

p  = tapestri_scatterplot(
                 x = experiment$assays$protein_read_counts$analysis_layers$umap$x, 
                 y= experiment$assays$protein_read_counts$analysis_layers$umap$y, 
                 color_by = experiment$assays$protein_read_counts$analysis_layers$clusters)

p = p + xlab('') + ylab('')
p = p + umap_theme() 
p

Violin plots

To do:

p_umap  = tapestri_scatterplot(
                 x = experiment$assays$protein_read_counts$analysis_layers$umap$x, 
                 y= experiment$assays$protein_read_counts$analysis_layers$umap$y, 
                 color_by = experiment$assays$protein_read_counts$analysis_layers$clusters$umap.kmean.cluster.4) + 
  xlab('') + ylab('') + umap_theme() 

p_violin = tapestri_violinplot(
           clusters = experiment$assays$protein_read_counts$analysis_layers$clusters$umap.kmean.cluster.4,
           features = experiment$assays$protein_read_counts$data_layers$normalized)

p_umap / p_violin

Color UMAP by features

p  = tapestri_scatterplot(
                 x = experiment$assays$protein_read_counts$analysis_layers$umap$x, 
                 y= experiment$assays$protein_read_counts$analysis_layers$umap$y, 
                 color_by = experiment$assays$protein_read_counts$data_layers$normalized)
p = p + umap_theme() + scale_colour_gradient2(low="yellow", mid='grey', high="darkred") 
p

Single assay heatmaps

To do:

Users should become familiar with ComplexHeatmap::Heatmap https://jokergoo.github.io/ComplexHeatmap-reference/book/. We're only providing a simple example to get started.

# Order the features in chr order.
protein.mat = experiment$assays$protein_read_counts$data_layers$normalized
clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4



protein.h <- ComplexHeatmap::Heatmap(
  as.matrix(protein.mat),
  name = "Protein",
  cluster_rows = FALSE,
  cluster_columns = FALSE,

  col = circlize::colorRamp2(c(-2, 0, 2), c("yellow", "grey", "blue")),

  split = factor(clusters),
  show_row_names = FALSE,
  row_title_gp = grid::gpar(fontsize = 6),
  #heatmap_legend_param=legend_params,
  column_names_gp = grid::gpar(fontsize = 8),
  show_column_dend = FALSE
)

protein.h

CNV

Calculate ploidy

Usage:

# Select a clustering method that best represents your data and identify the normal cluster.
# Normalize all the read counts based on the normal cluster.
cnv.mat = compute_ploidy(
  reads = experiment$assays$dna_read_counts$data_layers$normalized,
  clusters = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4,
  baseline_cluster = 2
 )
experiment$assays$dna_read_counts <- add_analysis_layer(assay = experiment$assays$dna_read_counts,layer_name = 'norm_to_baseline',data = cnv.mat)

Heatmap

From the heatmap we can clearly see the regions where there is loss of heterozygosity and ploidy from 2 is reduced closer to 1.

cnv.mat = as.matrix(experiment$assays$dna_read_counts$analysis_layers$norm_to_baseline)
cnv.mat[is.na(cnv.mat) | is.infinite(cnv.mat)] <-2

clusters = experiment$assays$dna_variants@analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4

cnv.h <- ComplexHeatmap::Heatmap(
   as.matrix(cnv.mat), 
   name = "CNV",
   col = circlize::colorRamp2(c(0, 2, 4), c("yellow", "grey", "blue")), 
   split=factor(clusters),

   cluster_rows = FALSE,  
   show_row_names=FALSE, 
   cluster_columns = FALSE,  
   row_title_gp = grid::gpar(fontsize = 6), 
   column_names_gp = grid::gpar(fontsize=8),
   show_column_dend=FALSE)
cnv.h

Ploidy line plot

p = tapestri_ploidy_plot(
  normalized_reads = experiment$assays$dna_read_counts$analysis_layers$norm_to_baseline,
  clusters = experiment$assays$dna_variants$analysis_layers$umap_vaf_clusters$umap.kmean.cluster.4
  )
p

Multi-omic heatmap

Combine SNV, CNV and Protein data into a single heatmap. A nice feature of Complexheatmaps package

snv.h + cnv.h + protein.h


MissionBio/tapestriR documentation built on Feb. 25, 2021, 8:29 p.m.