Nothing
library(knitr) module <- 'Trapnell-2014' system.time(knit2html(sprintf('%s.Rmd', module), stylesheet='../Rmd/foghorn.css'))
# install.packages("VGAM") # source('http://mc-stan.org/rstan/install.R', echo=T, max.deparse.length=2000) # install_rstan() library(VGAM) library(splines) library(dplyr) library(reshape2) library(stringr) library(ggplot2)
Load data from GEO file.
data.dir <- '../../data' truseq <- 'GSE52529_truseq_fpkm_matrix.txt.gz' trapnell <- read.csv(paste(data.dir, 'GSE52529_fpkm_matrix.txt.gz', sep='/'), sep="\t") trapnell$gene <- factor(rownames(trapnell), levels=rownames(trapnell)) trapnell.l <- ( melt(trapnell, variable.name="cell", value.name="fpkm") %>% mutate(fpkm.log10=log10(fpkm)))
Parse the cell meta data.
cell.meta <- ( data.frame(cell=factor(levels(trapnell.l$cell))) %>% mutate(capture=str_match(levels(cell), "^T([0-9]+)")[,2], obstime=as.integer(capture), capture=factor(capture))) min.expr <- 0.1 # Minimum expression value for Tobit model expr.threshold <- 1 # The value above which genes are considered expressed (ggplot(trapnell.l %>% filter(fpkm > min.expr) %>% sample_n(10000) %>% left_join(cell.meta), aes(x=fpkm, color=factor(obstime))) + geom_density() + geom_rug(alpha=.01) + scale_x_log10() )
Filter genes as in Trapnell et al.
gene.meta <- ( trapnell.l %>% group_by(gene) %>% dplyr::summarise(num.expr=sum(fpkm > expr.threshold), lpe.sd=sd(fpkm.log10[fpkm > min.expr])) %>% filter(num.expr >= 50, lpe.sd > .7)) qplot(gene.meta$num.expr, binwidth=5) qplot(gene.meta$lpe.sd) trapnell.f <- ( trapnell.l %>% filter(gene %in% gene.meta$gene) %>% left_join(cell.meta) ) length(unique(trapnell.f$gene))
Use Tobit model to filter genes on a differential expression test.
fit.model <- function(form) { vgam(form, family=tobit(Lower=log10(min.expr), Upper=Inf)) } Tobit.Diff.Expr.LRT <- function(fpkm.log10, obstime) { FM.fit <- fit.model(fpkm.log10 ~ bs(obstime, df=3)) # summary(FM.fit) RM.fit <- fit.model(fpkm.log10 ~ 1) # summary(RM.fit) if (is.null(FM.fit) == FALSE && is.null(RM.fit) == FALSE) { lrt <- lrtest(FM.fit, RM.fit) return(lrt@Body["Pr(>Chisq)"][2,]) } else { return(1) } } genes.high.sd <- filter(gene.meta, num.expr >= 50, lpe.sd > 1)$gene gene.diff.expr <- (trapnell.f %>% filter(gene %in% genes.high.sd) %>% group_by(gene) %>% dplyr::summarise(p.val=Tobit.Diff.Expr.LRT(fpkm.log10, obstime)) %>% arrange(p.val) ) gene.diff.expr qplot(gene.diff.expr$p.val, binwidth=1) + scale_x_log10() genes.diff.expr <- filter(gene.diff.expr, p.val < 1e-2)$gene length(genes.diff.expr)
Get gene names.
library("biomaRt") ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl") filters <- listFilters(ensembl) attributes = listAttributes(ensembl) gene.meta$ensembl_gene_id <- str_match(gene.meta$gene, '(ENSG[0-9]+)')[,1] gene.names <- getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), mart = ensembl) gene.meta <- ( gene.meta %>% left_join(gene.names) # Join the name to the ID %>% dplyr::select(-ensembl_gene_id) %>% group_by(gene) # Just take first name for each gene %>% do(head(., 1)) %>% ungroup() )
Reshape data into general format for model.
fpkm <- trapnell.f %>% dcast(gene ~ cell, value.var="fpkm") fpkm.m <- as.matrix(fpkm %>% dplyr::select(-gene)) rownames(fpkm.m) <- fpkm$gene expr <- log10(fpkm.m + 1)
Save data.
rda.path <- paste(data.dir, "TrapnellDeLorean.rda", sep="/") message('Saving expression data and meta-data to: ', rda.path) rename.and.save <- function(..., file) { x <- list(...) save(list=names(x), file=file, envir=list2env(x)) } rename.and.save( trapnell.expr=expr, trapnell.gene.meta=gene.meta, trapnell.cell.meta=cell.meta, file=rda.path)
R version and packages used:
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.