#
# Execute this block to render the script.
#
devtools::load_all('../..')
devtools::load_all('../../../DeLoreanData')
rmarkdown::render('McDavid-Oscope.Rmd')
library(knitr)
library(knitcitations)
library(rmarkdown)
#
# knitr options
#
opts_chunk$set(
    fig.path = 'figures/McDavid-',
    stop_on_error = TRUE,
    fig.width = 12.5,
    fig.height = 8)
#
# Citations
#
cleanbib()
cite_options(
    # hyperlink = 'to.doc',
    hyperlink = TRUE,
    # style = 'html',
    # citation_format = 'text',
    citation_format = "pandoc",
    cite.style = "numeric",
    check.entries = TRUE)
    # hyperlink = TRUE)
bib <- read.bibtex("DeLorean.bib")
if (file.exists("config.R")) {
    source("config.R")
}
source(system.file("scripts/shared.R", package="DeLorean"))
# suppressMessages(loadfonts())
font.family <- "Verdana"
font.theme <- theme_update(text=element_text(family=font.family))
theme_set(font.theme)

r citet(bib[["mcdavid_modeling_2014"]]) assayed actively cycling single cells in order to examine the confounding effect of the cell cycle on single cell expression analyses. They measured the expression of 333 genes in 930 cells across three cell cycle phases and three cell lines.

r date()

Data

McDavid et al.'s data is available in the DeLorean R package. Not all 333 genes are represented as several are rarely expressed.

library(Oscope)
# vignette('Oscope_vignette')
library(DeLoreanData)
library(dplyr)
library(reshape2)
library(ggplot2)
data(McDavidDeLorean)
seed <- getOption("McDavid.seed", 1)
set.seed(seed)
gene.levels <- levels(mcdavid.gene.meta$gene)
cell.levels <- levels(mcdavid.cell.meta$cell)

Choose genes and cells

Choose a few cells from the PC3 cell line and those genes with -log pvalue greater than 10.

cells.PC3 <- mcdavid.cell.meta %>% filter("PC3" == cellline)
genes.sig <- filter(mcdavid.gene.meta, pvalue > 10)
# Shift positively expressed genes closer to unexpressed genes to improve fit.
# Just as in DeLorean analysis
expr <- pmax(mcdavid.expr - .69, 0)[as.character(genes.sig$gene),as.character(cells.PC3$cell)]

Normalisation

Oscope uses expression values on the original scale, not log transformed and expects mean gene levels to be above 100.

oExpr <- 1000*exp(expr) # Scale to put on similar scale to Oscope expected
Sizes = MedianNorm(oExpr)
DataNorm <- GetNormalizedMat(oExpr, Sizes)

Mean-variance

We perform the mean-variance analysis but we are going to use all the genes in any case, so we don't use the results.

MV <- CalcMV(Data=oExpr, Sizes=Sizes)
print(length(MV$GeneToUse))

Paired sine model

Rescale the genes expression for the test.

DataInput <- NormForSine(DataNorm)

Perform the paired sine test.

SineRes <- OscopeSine(DataInput, parallel=TRUE)
str(SineRes)

K-medoids algorithm

Use all gene pairs (quan=0) as all the genes have small p-values in McDavid's test of periodicity.

KMRes <- OscopeKM(SineRes, quan=0)
print(KMRes)

Check the p-values of genes identified.

filter(mcdavid.gene.meta, gene %in% unlist(KMRes)) %>% arrange(pvalue)

Flag clusters

Flag the clusters that are not significant.

ToRM <- FlagCluster(SineRes, KMRes, DataInput)
print(ToRM$FlagID_bysine)
print(ToRM$FlagID_byphase)
print(ToRM$FlagID) # all flagged clusters
KMResUse <- KMRes[-ToRM$FlagID]
print(KMResUse)

Extended nearest insertion

Calculate orderings for each cluster.

ENIRes <- OscopeENI(KMRes=KMResUse, Data=DataInput, NCThre=1000, parallel=TRUE)

Reorder the data.

DataOrder <- DataNorm[,ENIRes[["cluster1"]]]
cell.times <- data.frame(cell=factor(colnames(DataOrder), levels=cell.levels),
                         time=1:ncol(DataOrder))
expr.l <-
    melt(DataOrder, varnames=c('gene', 'cell'), value.name='x') %>%
    mutate(
        gene=factor(gene, levels=gene.levels),
        cell=factor(cell, levels=cell.levels)) %>%
    left_join(cell.times)
sample_n(expr.l, 10)

Align ordering

We don't know which point of the cell cycle the Oscope inferred ordering starts at. We estimate this by assessing all possibile offsets and choosing the one with the smallest RMSE to the McDavid capture time labels.

source(system.file("scripts/McDavid-fns.R", package="DeLorean"))
apply.offset <- function(offset) {
    cell.times %>%
        left_join(mcdavid.cell.meta) %>%
        mutate(
            cbtime=periodise(100*(time+offset)/ncol(DataOrder), period=100),
            obs.dist=peak.distance(tau.to.cbtime(obstime), cbtime))
}
assess.offset <- function(offset) calc.rms(apply.offset(offset)$obs.dist)
obs.dist.rmses <- sapply(1:ncol(DataOrder), assess.offset)
qplot(obs.dist.rmses)
best.offset <- which.min(obs.dist.rmses)
cell.times <- apply.offset(best.offset)
sample_n(cell.times, 10)

Calculate peak offsets

Now we have aligned the ordering, we can calculate the RMSE between the peaks in the ordering and the CycleBase defined gene peak times.

gene.peaks <-
    get.gene.peaks(expr.l %>% left_join(mcdavid.cell.meta)) %>%
    left_join(cell.times) %>%
    left_join(mcdavid.gene.meta) %>%
    filter(! is.na(cbPeaktime)) %>%
    mutate(capture.dist=peak.distance(cbPeaktime, cbtime))
sample_n(gene.peaks, 10)
capture.RMSE <- calc.rms(gene.peaks$capture.dist)
capture.RMSE

Gene profiles

Plot the genes against CycleBase time

plot.profiles <-
    ggplot(expr.l %>% left_join(cell.times), aes(x=cbtime, y=log(x), color=capture)) +
    geom_point(alpha=.3) +
    facet_wrap(~ gene)
print(plot.profiles)
golden.ratio <- 1.618  # Pleasing ratio
fig.width <- 4.7  # LaTeX width in inches
fig.height <- fig.width / golden.ratio
ggsave('McDavid-Oscope-profiles.pdf', plot=plot.profiles + bioinf.config,
       width=fig.width*2, height=fig.width*2, units="in")
# Save the results.
save(
    expr,
    Sizes,
    DataNorm,
    DataInput,
    SineRes,
    KMRes,
    KMResUse,
    ENIRes,
    DataOrder,
    expr.l,
    gene.peaks,
    cell.times,
    capture.RMSE,
    file='Data/McDavid-Oscope.RData')
# Load the results.
load(file='Data/McDavid-Oscope.RData')

Session information

date()

R version and packages used:

devtools::session_info()


JohnReid/DeLorean documentation built on Sept. 27, 2021, 5:45 a.m.