# use BiocManager to install ParetoTI # BiocManager::install("vitkl/ParetoTI", dependencies = T) # use guide via the link to install python dependency for ParetoTI: https://vitkl.github.io/ParetoTI/ suppressPackageStartupMessages({ library(ParetoTI) library(SingleCellExperiment) library(ggplot2) library(cowplot) library(Matrix) })
This document looks at the division of labour within hepatocytes by characterising within-cell type variability in gene expression using Pareto front model.
Need to perform multiple tasks and natural selection put cells on a Pareto front, a narrow subspace where performance at those tasks is optimal. How important the tasks are in the environment puts cells at different locations along Pareto front. This reflects trade-off in performance at those tasks. Pareto front in the performance space translates into simple shapes gene expression of cell population. By finding minimal simplex polytope (triangle in 2D, tetrahedron in 3D, 5-vertex polytope in 4D) that encloses most of the data you can describe within cell-type heterogeniety. Cells near each vertex are specialists at one task, cells withing the shape perform a weighted combination of tasks. You can indentify the cellular tasks by finding what is special about cells closest to each vertex. This relies on recent work by Uri Alon group that adapted the multiobjective optimisation theory to cells and showed that Pareto front is equal to minimal polytope defined by specialist phenotypes.
This document looks at mouse hepatocyte measured with MARS-seq scRNA-seq protocol (both UMI and full-length). Original study by Shalev Itzkovitz group mapped scRNA-seq data to space using marker genes and found that about 50% of hepatocyte genes have a zonation gration in liver lobules. This spatial gradient results in transcriptional heterogeniety within one cell type, hepatocytes. This link between gradient in space and gradient in gene expression was recently investigated by Miri Adler & Uri Alon and exploited in novoSpaRc (de novo Spatial Reconstruction) method by Nikolaus Rajewsky. This analysis should reproduce the findings from the above mentioned study by Miri Adler & Uri Alon Continuum of Gene-Expression Profiles Provides Spatial Division of Labor within a Differentiated Cell Type.
These examples motivate using Pareto front model to describe within cell type heterogeniety to understand division of labour between cells and how these cellular tasks are distributed in space.
# uncomment to load data ------------------------------------------------------- #gse = GEOquery::getGEO("GSE84498", GSEMatrix = TRUE) #filePaths = GEOquery::getGEOSuppFiles("GSE84498", fetch_files = T, baseDir = "./processed_data/") filePaths = c("./processed_data/GSE84498/GSE84498_experimental_design.txt.gz", "./processed_data/GSE84498/GSE84498_umitab.txt.gz") design = fread(filePaths[1], stringsAsFactors = F) data = fread(filePaths[2], stringsAsFactors = F, header = T) data = as.matrix(data, rownames = "gene") # convert to single cell experiment hepatocytes = SingleCellExperiment(assays = list(counts = data), colData = design) # look at mitochondrial-encoded MT genes mito.genes = grep(pattern = "^mt-", x = rownames(data), value = TRUE) hepatocytes$perc.mito = colSums(counts(hepatocytes[mito.genes, ])) / colSums(counts(hepatocytes)) #qplot(hepatocytes$perc.mito, geom = "histogram") # look at nuclear-encoded MT genes (find those genes using GO annotations) go_annot = map_go_annot(taxonomy_id = 10090, keys = rownames(hepatocytes), columns = c("GOALL"), keytype = "ALIAS", ontology_type = c("CC")) mitochondria_located_genes = unique(go_annot$annot_dt[GOALL == "GO:0005739", ALIAS]) hepatocytes$all_mito_genes = colSums(counts(hepatocytes[mitochondria_located_genes, ])) / colSums(counts(hepatocytes)) #qplot(hepatocytes$perc.mito, hepatocytes$all_mito_genes, geom = "bin2d")
## Filtering # remove batches of different cells (probably non-hepatocytes) hepatocytes = hepatocytes[, !hepatocytes$batch %in% c("AB630", "AB631")] # remove cells with more less than 1000 or more than 30000 UMI hepatocytes = hepatocytes[, colSums(counts(hepatocytes)) > 1000 & colSums(counts(hepatocytes)) < 30000] # remove cells that express less than 1% of albumine alb_perc = counts(hepatocytes)["Alb",] / colSums(counts(hepatocytes)) hepatocytes = hepatocytes[, alb_perc > 0.01] # remove genes with too many zeros (> 95% cells) hepatocytes = hepatocytes[rowMeans(counts(hepatocytes) > 0) > 0.05,] # remove cells with too many zeros (> 85%) hepatocytes = hepatocytes[,colMeans(counts(hepatocytes) > 0) > 0.15] # Normalise gene expression by cell sum factors and log-transform hepatocytes = scran::computeSumFactors(hepatocytes) hepatocytes = scater::normalize(hepatocytes) hepatocytes = scater::normalize(hepatocytes, return_log = FALSE) # just normalise
Plot below shows first 3 PCs colored by batch.
# Find principal components hepatocytes = scater::runPCA(hepatocytes, ncomponents = 7, scale_features = T, exprs_values = "logcounts") # Plot PCA colored by batch scater::plotReducedDim(hepatocytes, ncomponents = 3, use_dimred = "PCA", colour_by = "batch")
# extract PCs (centered at 0 with runPCA()) PCs4arch = t(reducedDim(hepatocytes, "PCA"))
# find archetypes arc_ks = k_fit_pch(PCs4arch, ks = 2:8, check_installed = T, bootstrap = T, bootstrap_N = 200, maxiter = 1000, bootstrap_type = "m", seed = 2543, volume_ratio = "t_ratio", # set to "none" if too slow delta=0, conv_crit = 1e-04, order_type = "align", sample_prop = 0.75) # Show variance explained by a polytope with each k (cumulative) plot_arc_var(arc_ks, type = "varexpl", point_size = 2, line_size = 1.5) + theme_bw() # Show variance explained by k-vertex model on top of k-1 model (each k separately) plot_arc_var(arc_ks, type = "res_varexpl", point_size = 2, line_size = 1.5) + theme_bw() # Show variance in position of vertices obtained using bootstraping # - use this to find largest k that has low variance plot_arc_var(arc_ks, type = "total_var", point_size = 2, line_size = 1.5) + theme_bw() + ylab("Mean variance in position of vertices") # Show t-ratio plot_arc_var(arc_ks, type = "t_ratio", point_size = 2, line_size = 1.5) + theme_bw()
Plot show cells in PC space (data = PCs4arch) colored by log2(counts) of marker genes (data_lab = as.numeric(logcounts(hepatocytes["Alb",]))). Each red dot is a position of vertex in one of the bootstrapping iterations.
# fit a polytope with bootstraping of cells to see stability of positions arc = fit_pch_bootstrap(PCs4arch, n = 200, sample_prop = 0.75, seed = 235, noc = 4, delta = 0, conv_crit = 1e-04, type = "m") p_pca = plot_arc(arc_data = arc, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = as.numeric(logcounts(hepatocytes["Alb",])), text_size = 60, data_size = 6) plotly::layout(p_pca, title = "Hepatocytes colored by Alb (Albumine)") p_pca = plot_arc(arc_data = arc, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = as.numeric(logcounts(hepatocytes["Cyp2e1",])), text_size = 60, data_size = 6) plotly::layout(p_pca, title = "Hepatocytes colored by Cyp2e1") p_pca = plot_arc(arc_data = arc, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = as.numeric(logcounts(hepatocytes["Gpx1",])), text_size = 60, data_size = 6) plotly::layout(p_pca, title = "Hepatocytes colored by Gpx1") p_pca = plot_arc(arc_data = arc, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = as.numeric(logcounts(hepatocytes["Apoa2",])), text_size = 60, data_size = 6) plotly::layout(p_pca, title = "Hepatocytes colored by Apoa2") # You can also check which cells have high entropy of logistic regression predictions when classifying all cells in a tissue into cell types. These could have been misclassified by the method and wrongly assigned to Hepatocytes, or these could be doublets. # find archetypes on all data (allows using archetype weights to describe cells) arc_1 = fit_pch(PCs4arch, volume_ratio = "t_ratio", maxiter = 500, noc = 4, delta = 0, conv_crit = 1e-04) # check that positions are similar to bootstrapping average from above p_pca = plot_arc(arc_data = arc_1, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = as.numeric(logcounts(hepatocytes["Alb",])), text_size = 60, data_size = 6) plotly::layout(p_pca, title = "Hepatocytes colored by Alb")
# Map GO annotations and measure activities activ = measure_activity(hepatocytes, # row names are assumed to be gene identifiers which = "BP", return_as_matrix = F, taxonomy_id = 10090, keytype = "ALIAS", lower = 20, upper = 1000, aucell_options = list(aucMaxRank = nrow(hepatocytes) * 0.1, binary = F, nCores = 3, plotStats = FALSE)) # Merge distances, gene expression and gene set activity into one matrix data_attr = merge_arch_dist(arc_data = arc_1, data = PCs4arch, feature_data = as.matrix(logcounts(hepatocytes)), colData = activ, dist_metric = c("euclidean", "arch_weights")[2], colData_id = "cells", rank = F) # Use Wilcox test to find genes maximally expressed in 10% closest to each vertex enriched_genes = find_decreasing_wilcox(data_attr$data, data_attr$arc_col, features = data_attr$features_col, bin_prop = 0.1, dist_cutoff = 0.5, method = "BioQC") enriched_sets = find_decreasing_wilcox(data_attr$data, data_attr$arc_col, features = data_attr$colData_col, bin_prop = 0.1, dist_cutoff = 0.5, method = "BioQC") # Take a look at top genes and functions for each archetype labs = get_top_decreasing(summary_genes = enriched_genes, summary_sets = enriched_sets, cutoff_genes = 0.01, cutoff_sets = 0.05, cutoff_metric = "wilcoxon_p_val", p.adjust.method = "fdr", order_by = "mean_diff", order_decreasing = T, min_max_diff_cutoff_g = 0.4, min_max_diff_cutoff_f = 0.03) p_pca = plot_arc(arc_data = arc_1, data = PCs4arch, which_dimensions = 1:3, line_size = 1.5, data_lab = data_attr$data$archetype_1, # $ribosomal_large_subunit_biogenesis, text_size = 60, data_size = 6) plotly::layout(p_pca, title = "ribosomal_large_subunit_biogenesis activity")
To measure goodness of observed fit I compare observed tetrahedron shape to shape of data with no relationships between variables. This is done by comparing the ratio of tertahedron volume to volume of convex hull, a complex shape that contains all of the data. Empirical p-value is fraction of random t-ratios that are at least as high as the observed t-ratio.
# use permutations within each dimension - this is only possible for less than 8 vertices because computing convex hull gets exponentially slower with more dimensions start = Sys.time() pch_rand = randomise_fit_pch(PCs4arch, arc_data = arc_1, n_rand = 1000, replace = FALSE, bootstrap_N = NA, volume_ratio = "t_ratio", maxiter = 500, delta = 0, conv_crit = 1e-4, type = "m", clust_options = list(cores = 3)) # use type m to run on a single machine or cloud # type = "m", clust_options = list(cores = 3)) # use clustermq (type cmq) to run as jobs on a computing cluster (higher parallelisation) # type = "cmq", clust_options = list(njobs = 10)) # This analysis took: Sys.time() - start
# plot background distribution of t-ratio and show p-value plot(pch_rand, type = c("t_ratio"), nudge_y = 5) pch_rand
Sys.Date. = Sys.Date() Sys.Date. session_info. = devtools::session_info() session_info.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.