BiocStyle::markdown()

Introduction and setup

Here we show how embeddr (= spectral embedding + principal curves) can be used for pseudotemporal ordering of single-cell gene expression data using the monocle dataset. This uses the HSMMSingleCell dataset that is bundled with monocle.

library(monocle) ## for monocle data
library(reshape2) ## to melt data frames
library(dplyr) 
library(ggplot2)
library(scater) ## to hold single-cell data
library(knitr) ## for kable function
library(goseq) ## for GO enrichment
library(org.Hs.eg.db) ## for HG19 GO annotations
library(embeddr)

First we create the SCESet using the data from the HSMMSingleCell package:

## This is a bit fiddly since HSMMSingleCell changed format recently
sce <- NULL
hsmm_data_available <- data(package='HSMMSingleCell')$results[,3]
if("HSMM" %in% hsmm_data_available) {
  data(HSMM)
  sce <- fromCellDataSet(HSMM, use_exprs_as = 'fpkm')
} else if("HSMM_expr_matrix" %in% hsmm_data_available) {
  data(HSMM_expr_matrix)
  data(HSMM_gene_annotation)
  data(HSMM_sample_sheet)

  pd <- new('AnnotatedDataFrame', data = HSMM_sample_sheet)
  fd <- new('AnnotatedDataFrame', data = HSMM_gene_annotation)
  sce <- newSCESet(fpkmData = HSMM_expr_matrix, phenoData = pd, featureData = fd)
} else {
  stop('No recognised data types in HSMMSingleCell')
}

## add cell_id to HSMM to play nicely with dplyr
phenoData(sce)$cell_id <- rownames(pData(sce))

Laplacian eigenmaps

The embeddr workflow creates the nearest neighbour graph then returns the laplacian eigenmap. We can do this using embeddr::embeddr with all options available, though embeddr::weighted_graph and embeddr::laplacian_eigenmap are available to perform each step by hand or with custom distance metrics. The default options specify a nearest neighbour graph with $k = round(log(n))$ neighbours for $n$ cells. Other options for creating the graph (such as distance measures and heat kernels) are also available.

Selecting genes for the embedding

In standard manifold learning problems it is recommended that each feature is appropriately scaled to have mean 0 and variance 1. However, this is equivalent to treating all genes as equally contributing towards the process. Therefore it is recommended not to scale the dataset.

The entire transcriptome can be used to construct the embedding. However, it can be useful to pick only high-variance genes removing some of the residual noise from housekeeping or lowly expressed ones. The justification behind this is that the main source of variation in our dataset will be attributed to the process of interest. These high variance genes can be found using spike-ins (see Brennecke et al. Nature Methods 2014) or simlpy by fitting CV-mean curves and finding genes with a CV much higher than the mean:

x <- t(exprs(sce)) / log(10) * log(2) # convert to log10 for CV-mean fitting
x_mean <- colMeans(x)
x_var <- apply(x, 2, var)
genes_for_fit <- x_mean > 0.3
CV2 <- x_var[genes_for_fit] / (x_mean[genes_for_fit])^2
df_fit <- data.frame(m = x_mean[genes_for_fit], CV2 = CV2)
fit_loglin <- nls(CV2 ~ a * 10^(-k * m), data = df_fit, start=c(a=5, k=1)) 
ak <- coef(fit_loglin)
f <- function(x) ak[1] * 10^(-ak[2] * x) 
genes_for_embedding <- (CV2 > 4 * predict(fit_loglin))
df_fit$for_embedding <- as.factor(genes_for_embedding)
ggplot(df_fit, aes(x=m, y=CV2, color = for_embedding)) + geom_point() +
  theme_bw() + xlab('Mean') + ylab('CV2') + ggthemes::scale_color_fivethirtyeight() +
  stat_function(fun=f, color='black')
set.seed(123)
gene_indices <- match(names(which(genes_for_embedding)), featureNames(sce))
sce <- embeddr(sce, genes_for_embedding = gene_indices)

pData(sce)$long_state <- plyr::mapvalues(pData(sce)$State, from=1:3,
                                            to=c('Proliferating cell',
                                                 'Differentiating myoblast',
                                                 'Interstitial mesenchymal cell'))

plot_embedding(sce, color_by = 'long_state')

We can view the graph of cells to see how connections between different parts form:

plot_graph(sce)

We can also cluster the embedding using Gaussian mixture models from the package mclust and plot:

sce <- cluster_embedding(sce, k = 3)

sce_tmp <- sce
phenoData(sce_tmp)$cluster <- plyr::mapvalues(pData(sce_tmp)$cluster, from=c(3, 1, 2),
                                            to=c(1,2,3))
phenoData(sce_tmp)$cluster <- plyr::mapvalues(pData(sce_tmp)$cluster, from=1:3,
                                            to=c('Interstitial mesenchymal cell',
                                                 'Proliferating cell',
                                                 'Differentiating myoblast'))

plot_embedding(sce_tmp)

To identify what the different clusters mean we can pass particular gene names to plot_embedding and it will vary the point size with expression:

marker_genes <- row.names(subset(fData(sce),  gene_short_name %in% c("CDK1", "MYOG", "PDGFRA", 
                                                                    "SPHK1", "MYF5", "NCAM1")))
plot_embedding(sce, plot_genes = marker_genes, plot_pseudotime = FALSE, use_short_names = TRUE)

This implies that groups 1 & 2 correspond to differentiating cells while group 3 is the contaminating mesenchymal cells. We can then separate off groups 1 & 2 as sce_pst, just those cells we believe to be pseudotemporally regulated:

sce_pst <- sce[, pData(sce)$cluster %in% c(1,2)]

Pseudotime fitting

Pseudotime fitting is performed using the function fit_pseudotime:

sce_pst <- fit_pseudotime(sce_pst)
plot_embedding(sce_pst)

Plotting genes in pseudotime

We can plot genes in pseudotime without any model fitting using the function plot_in_pseudotime. This accepts a subset of an SCESet corresponding to a parituclar number of genes and plots them using smoothed LOESS regression. Here we select five genes of particular interest in the original paper. The variable genes_to_plot can be anything that will subset an SCESet:

genes_to_plot <- row.names(subset(fData(sce), 
                                  gene_short_name %in% c("CDK1", "MEF2C", "MYH3", "MYOG","ID1")))
plot_in_pseudotime(sce_pst[genes_to_plot,], use_short_names = TRUE)

The results are largely equivalent to those identified in monocle.

We can also look at the MRF family of transcription factors:

mrf <- c('MYOD1', 'MYF5', 'MYF6', 'MYOG') 
mrf_ind <- sapply(mrf, match, fData(sce_pst)$gene_short_name)

plot_in_pseudotime(sce_pst[mrf_ind,], use_short_names = TRUE)

Differential expression across pseudotime

Fitting a single-gene

Let's fit differential expression models to the gene CDK1. We can fit a differential expression model over pseudotime using fit_pseudotime_model and fit a NULL model using fit_null_model. These are essentially the same models used in Monocle though using the AER package which is slightly faster. We can then compute a p-value using the likelihood ratio test with the function compare_models.

cdk1 <- row.names(subset(fData(sce_pst),  gene_short_name == 'CDK1'))
model <- fit_pseudotime_model(sce_pst, cdk1)
null_model <- fit_null_model(sce_pst, cdk1)
p_val <- compare_models(model, null_model)

Note these three steps (fit_pseudotime_model -> fit_null_model -> compare_models) are encapsulated in the single function gene_pseudotime_test.

We can then use plot_pseudotime_model to view how the model looks across pseudotime. Note this is different to plot_in_pseudotime which fits a LOESS curve using ggplot2 rather than a smoothing-spline model. In plot_pseudotime_model, if model is NULL then the model will be recomputed.

plot_pseudotime_model(sce_pst[cdk1, ], model)

Differential expression for an entire SCESet

Often we're not concerned about a single-gene but would like to know which genes are pseudotemporally regulated. To do this we use the functino pseudotime_test which tests all genes and returns p and FDR-corrected q-values. First reduce the SCESet to genes expressed in at least 20% of cells:

n_cells_expressed <- rowSums(is_exprs(sce_pst))
keep_gene <- n_cells_expressed > 0.2 * dim(sce_pst)[2]
sce_pst_kept <- sce_pst[keep_gene,]

Then use pseudotime_test to get differential expression:

diff_gene_test <- pseudotime_test(sce_pst_kept, n_cores = 1)
head(diff_gene_test)

and plot the p and q values:

qplot(diff_gene_test$p_val, binwidth = 0.01) + theme_bw() + xlab('p-value') 
qplot(diff_gene_test$q_val, binwidth = 0.01) + theme_bw() + xlab('corrected p-value')

Using an FDR-corrected significance level of 1% we can then create a new SCESet containing only differentially expressed genes:

alpha <- 0.01
sig_genes <- diff_gene_test$gene[diff_gene_test$q_val < alpha]
sce_sig <- sce_pst_kept[sig_genes,]

Identifying gene clusters in pseudotime

We use the predicted expression of differentially expressed genes to find pseudotime clusters (again, similar to monocle). The function predicted_expression returns the predicted expression matrix from the Tobit regression models. Note that this has to recalculate all the models as storing them takes up too much memory.

pe <- predicted_expression(sce_sig)

And we can plot the correlation plot:

pcor <- cor(pe)
dist_cor <- 1 - pcor / 2 
hc <- hclust(as.dist(dist_cor))
plot(hc, labels=FALSE)

It's obvious we have four distinct modules. We can cut these and plot the predicted expression in each class:

n_cuts <- 4
gene_classes <- cutree(hc, n_cuts)

df_gene <- data.frame(gene=colnames(pe), class=gene_classes)

pe <- data.frame(scale(pe)) ## scaled pst-by-gene
pe$pseudotime <- pseudotime(sce_sig)

pe_melted <- melt(pe, id.vars='pseudotime', value.name='expression', variable.name='gene')
pe_melted <- inner_join(pe_melted, df_gene)

## we want to plot the mean expression rather than all of it (messy)
gp_groups <- group_by(pe_melted, class, pseudotime)
mean_expr_per_group <- dplyr::summarise(gp_groups, mean_expr = mean(expression))
pe_melted <- inner_join(pe_melted, mean_expr_per_group)
## pe_melted <- arrange(pe_melted, gene, pseudotime)

ggplot(pe_melted) + geom_line(aes(x=pseudotime, y=mean_expr), color='red') +
  facet_wrap(~ class, ncol = 2) + stat_density2d(aes(x=pseudotime, y=expression), n=150) +
  theme_bw() + ylab('Expression') # add ncol = 1 for vertical representation

Look at the number of genes in each class:

genes_per_class <- sapply(1:n_cuts, function(i) sum(gene_classes == i))
df_gpc <- data.frame(Class=1:n_cuts, 'Number in class' = genes_per_class)
print(kable(df_gpc, caption='Genes per class'))

Finally look at GO enrichment for each class using the goseq package:

genes_per_class <- sapply(1:n_cuts, function(i) 1 * (df_gene$class == i))
gene_names <- df_gene$gene
all_genes <- featureNames(sce_pst_kept)
gene_names <- sapply(as.character(gene_names), function(gn) strsplit(gn, '[.]')[[1]][1])
all_genes <- sapply(as.character(all_genes), function(gn) strsplit(gn, '[.]')[[1]][1])

genes_not_de <- setdiff(all_genes, gene_names)
genes_per_class <- rbind(genes_per_class, matrix(0, ncol=ncol(genes_per_class), nrow=length(genes_not_de)))

rownames(genes_per_class) <- c(gene_names, genes_not_de)

enriched_terms <- apply(genes_per_class, 2, function(gene_set) {
  pwf <- nullp(gene_set, 'hg19', 'ensGene', plot.fit=FALSE)
  go <- goseq(pwf, 'hg19', 'ensGene', test.cats=c('GO:BP'))
  go$log_qval <- log10(p.adjust(go$over_represented_pvalue, method='BH'))
  go <- dplyr::filter(go, log_qval < log10(0.01))
  go <- dplyr::select(go, category, log_qval, term)
  names(go) <- c('Category','log10 q-value','Term')
  return(go)
  })

reduced <- lapply(enriched_terms, head, 6)

And print the terms:

for(i in 1:n_cuts) {
  print(kable(reduced[[i]], caption = paste('GO terms for cluster', i)))
}
sessionInfo()


kieranrcampbell/embeddr documentation built on May 20, 2019, 9:24 a.m.