# # Execute this block to render the script. # devtools::load_all('../..') devtools::load_all('../../../DeLoreanData') rmarkdown::render('McDavid-Oscope.Rmd')
library(knitr) library(knitcitations) library(rmarkdown) # # knitr options # opts_chunk$set( fig.path = 'figures/McDavid-', stop_on_error = TRUE, fig.width = 12.5, fig.height = 8) # # Citations # cleanbib() cite_options( # hyperlink = 'to.doc', hyperlink = TRUE, # style = 'html', # citation_format = 'text', citation_format = "pandoc", cite.style = "numeric", check.entries = TRUE) # hyperlink = TRUE) bib <- read.bibtex("DeLorean.bib") if (file.exists("config.R")) { source("config.R") } source(system.file("scripts/shared.R", package="DeLorean"))
# suppressMessages(loadfonts()) font.family <- "Verdana" font.theme <- theme_update(text=element_text(family=font.family)) theme_set(font.theme)
r citet(bib[["mcdavid_modeling_2014"]])
assayed actively cycling single
cells in order to examine the confounding effect of the cell cycle on
single cell expression analyses. They measured the expression of 333 genes
in 930 cells across three cell cycle phases and three cell lines.
r date()
McDavid et al.'s data is available in the DeLorean
R package. Not all 333
genes are represented as several are rarely expressed.
library(Oscope) # vignette('Oscope_vignette') library(DeLoreanData) library(dplyr) library(reshape2) library(ggplot2) data(McDavidDeLorean) seed <- getOption("McDavid.seed", 1) set.seed(seed) gene.levels <- levels(mcdavid.gene.meta$gene) cell.levels <- levels(mcdavid.cell.meta$cell)
Choose a few cells from the PC3 cell line and those genes with -log pvalue greater than 10.
cells.PC3 <- mcdavid.cell.meta %>% filter("PC3" == cellline) genes.sig <- filter(mcdavid.gene.meta, pvalue > 10) # Shift positively expressed genes closer to unexpressed genes to improve fit. # Just as in DeLorean analysis expr <- pmax(mcdavid.expr - .69, 0)[as.character(genes.sig$gene),as.character(cells.PC3$cell)]
Oscope uses expression values on the original scale, not log transformed and expects mean gene levels to be above 100.
oExpr <- 1000*exp(expr) # Scale to put on similar scale to Oscope expected Sizes = MedianNorm(oExpr) DataNorm <- GetNormalizedMat(oExpr, Sizes)
We perform the mean-variance analysis but we are going to use all the genes in any case, so we don't use the results.
MV <- CalcMV(Data=oExpr, Sizes=Sizes) print(length(MV$GeneToUse))
Rescale the genes expression for the test.
DataInput <- NormForSine(DataNorm)
Perform the paired sine test.
SineRes <- OscopeSine(DataInput, parallel=TRUE) str(SineRes)
Use all gene pairs (quan=0
) as all the genes have small p-values in McDavid's
test of periodicity.
KMRes <- OscopeKM(SineRes, quan=0) print(KMRes)
Check the p-values of genes identified.
filter(mcdavid.gene.meta, gene %in% unlist(KMRes)) %>% arrange(pvalue)
Flag the clusters that are not significant.
ToRM <- FlagCluster(SineRes, KMRes, DataInput) print(ToRM$FlagID_bysine) print(ToRM$FlagID_byphase) print(ToRM$FlagID) # all flagged clusters KMResUse <- KMRes[-ToRM$FlagID] print(KMResUse)
Calculate orderings for each cluster.
ENIRes <- OscopeENI(KMRes=KMResUse, Data=DataInput, NCThre=1000, parallel=TRUE)
Reorder the data.
DataOrder <- DataNorm[,ENIRes[["cluster1"]]] cell.times <- data.frame(cell=factor(colnames(DataOrder), levels=cell.levels), time=1:ncol(DataOrder)) expr.l <- melt(DataOrder, varnames=c('gene', 'cell'), value.name='x') %>% mutate( gene=factor(gene, levels=gene.levels), cell=factor(cell, levels=cell.levels)) %>% left_join(cell.times) sample_n(expr.l, 10)
We don't know which point of the cell cycle the Oscope inferred ordering starts at. We estimate this by assessing all possibile offsets and choosing the one with the smallest RMSE to the McDavid capture time labels.
source(system.file("scripts/McDavid-fns.R", package="DeLorean")) apply.offset <- function(offset) { cell.times %>% left_join(mcdavid.cell.meta) %>% mutate( cbtime=periodise(100*(time+offset)/ncol(DataOrder), period=100), obs.dist=peak.distance(tau.to.cbtime(obstime), cbtime)) } assess.offset <- function(offset) calc.rms(apply.offset(offset)$obs.dist) obs.dist.rmses <- sapply(1:ncol(DataOrder), assess.offset) qplot(obs.dist.rmses) best.offset <- which.min(obs.dist.rmses) cell.times <- apply.offset(best.offset) sample_n(cell.times, 10)
Now we have aligned the ordering, we can calculate the RMSE between the peaks in the ordering and the CycleBase defined gene peak times.
gene.peaks <- get.gene.peaks(expr.l %>% left_join(mcdavid.cell.meta)) %>% left_join(cell.times) %>% left_join(mcdavid.gene.meta) %>% filter(! is.na(cbPeaktime)) %>% mutate(capture.dist=peak.distance(cbPeaktime, cbtime)) sample_n(gene.peaks, 10) capture.RMSE <- calc.rms(gene.peaks$capture.dist) capture.RMSE
Plot the genes against CycleBase time
plot.profiles <- ggplot(expr.l %>% left_join(cell.times), aes(x=cbtime, y=log(x), color=capture)) + geom_point(alpha=.3) + facet_wrap(~ gene) print(plot.profiles) golden.ratio <- 1.618 # Pleasing ratio fig.width <- 4.7 # LaTeX width in inches fig.height <- fig.width / golden.ratio ggsave('McDavid-Oscope-profiles.pdf', plot=plot.profiles + bioinf.config, width=fig.width*2, height=fig.width*2, units="in")
# Save the results. save( expr, Sizes, DataNorm, DataInput, SineRes, KMRes, KMResUse, ENIRes, DataOrder, expr.l, gene.peaks, cell.times, capture.RMSE, file='Data/McDavid-Oscope.RData')
# Load the results. load(file='Data/McDavid-Oscope.RData')
date()
R version and packages used:
devtools::session_info()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.