devtools::load_all('../..') rmarkdown::render('fetch-Shalek-2014.Rmd')
library(DeLorean) library(stringr) library(dplyr) library(reshape2) library(ggplot2)
library(knitr) library(rmarkdown) # # knitr options # opts_chunk$set( fig.path = 'figures/Shalek-', stop_on_error = TRUE, fig.width = 12.5, fig.height = 8) # # Widths for saving figures # text.width <- 4.79 # LaTeX width in inches golden.ratio <- 1.618 # Pleasing ratio fig.width <- text.width fig.height <- text.width / golden.ratio # # Stylesheet # options(markdown.HTML.stylesheet = system.file("inst/Rmd/foghorn.css", package="DeLorean")) font.family <- "Verdana" font.theme <- theme_update(text=element_text(family=font.family)) theme_set(font.theme)
Shalek et al. investigated mouse bone-marrow-derived dendritic cells.
Two matrix files are available from GEO. We shall name them A and B. First we parse A.
csv.filename <- '../../data/GSE48968_allgenesTPM_GSM1189042_GSM1190902.txt.gz' shalek.A <- read.csv(csv.filename, sep='\t') cells.A <- colnames(shalek.A) genes.A <- rownames(shalek.A) class(shalek.A) dim(shalek.A) sapply(shalek.A, class) gene.meta.A <- data.frame(gene=factor(genes.A, levels=genes.A)) gp <- qplot(as.vector(as.matrix(shalek.A[sample(nrow(shalek.A), 1000), sample(ncol(shalek.A), 100)]))) pdf("Shalek-expr.pdf") print(gp + scale_x_log10()) dev.off() cell.re.A <- str_c( '^', '(?:(Tube_Control|On_Chip_Stimulation)_)?', '(?:(Ifnar1|Stat1|Tnfr)_KO_)?', '(?:(IFNB|LPS|PAM|PIC)_)?', '(?:(Unstimulated|[[:digit:]]+h)_)?', '(?:(Replicate)_)?', '(?:S([[:digit:]]+)_)?', '$') cell.meta.A <- ( as.data.frame(str_match(str_c(names(shalek.A), '_'), cell.re.A)) %>% dplyr::select(cell=V1, assay=V2, ko=V3, stimulant=V4, capture=V5, replicate=V6, cell.num=V7) %>% mutate(cell=factor(str_replace(cell, "_$", ""), levels=cells.A), cell.num=as.numeric(as.character(cell.num)), capture=factor(capture, ordered=TRUE, levels=c("Unstimulated", "1h", "2h", "4h", "6h")), obstime=as.numeric(ifelse("Unstimulated"==capture, "0", str_match(capture, "[[:digit:]]+"))))) sample_n(cell.meta.A, 20) stopifnot(all(! is.na(cell.meta.A[,1]))) which(is.na(cell.meta.A[,1])) sapply(cell.meta.A %>% dplyr::select(-1), levels) sapply(cell.meta.A, class) nrow(cell.meta.A) # cell.meta.A %>% filter(""!=assay)
Load supplementary table 2.
supp.table.2 <- ( read.csv('../../data/Shalek-2014-Supp-Table-2.csv') %>% rename(cell=Experiment, total=Total.Reads, aligned=Aligned.Reads....Transcriptome.) %>% mutate(cell.orig=cell, cell.1=cell %>% str_replace("Technical_Replicate_1_", "Replicate_1_") %>% str_replace("Technical_Replicate_2_", "Replicate_2_") %>% str_replace("_Unstim_", "_Unstimulated_") %>% str_replace("_rsem$", ""), cell=factor(cell.1, levels=cells.A))) #%>% dplyr::select(-cell.1)) sapply(supp.table.2, class) dim(supp.table.2) sum(is.na(supp.table.2$cell)) # Which cells failed to map # supp.table.2 %>% filter(is.na(cell)) # Update cell meta data with Table 2 cell.meta.A <- ( cell.meta.A %>% left_join(supp.table.2 %>% dplyr::select(cell, total, aligned))) dim(cell.meta.A) # Check which cells in table 2 have more than one match print( supp.table.2 %>% group_by(cell) %>% dplyr::summarise(num=n()) %>% arrange(-num) %>% filter(num > 1) )
Load supplementary table 3.
supp.table.3 <- ( read.csv('../../data/Shalek-2014-Supp-Table-3.csv') %>% rename(gene=GENE, cluster=CLUSTER, RefSeq=REFSEQ) %>% mutate(gene=factor(gene, levels=genes.A))) sapply(supp.table.3, class) dim(supp.table.3) sum(is.na(supp.table.3$gene)) which(is.na(supp.table.3$gene)) gene.meta.A <- gene.meta.A %>% left_join(supp.table.3)
Investigate the cluster disruption markers.
cluster.disrupt <- ( t(as.matrix(shalek.A[c('SERPINB6B', 'LYZ1'),])) %>% melt(varnames=c("cell", "gene"), value.name="TPM") %>% mutate(expr=log(TPM+1)) %>% dcast(cell ~ gene, value.var="expr") %>% mutate(disrupted=LYZ1<6 | SERPINB6B>4) ) class(cluster.disrupt) names(cluster.disrupt) gp <- ggplot(cluster.disrupt, aes(x=SERPINB6B, y=LYZ1, color=disrupted)) + geom_point() pdf("Shalek-cluster-disrupt.pdf") print(gp) dev.off() cell.meta.A <- cell.meta.A %>% left_join(cluster.disrupt %>% dplyr::select(cell, disrupted))
Have a look at which cells might be useful for modelling.
cell.meta.A %>% filter(! is.na(total), "" == assay, # str_detect(stimulant, "LPS|Unstimulated"), "LPS" == stimulant | "" == stimulant, "" == ko, FALSE == disrupted, total > 1e6, "" == replicate)
Investigate how many reads per cell.
colSums(shalek.A)
Check other method to retrieve data to see if there is more meta data.
library(GEOquery) geo.series <- 'GSE48968' geo.sample.1 <- 'GSM1189042' geo.sample.2 <- 'GSM1190902' gse <- getGEO(geo.sample.1, GSEMatrix=FALSE)
Now we parse B.
csv.filename <- '../../data/GSE48968_allgenesTPM_GSM1406531_GSM1407094.txt.gz' shalek.B <- read.csv(csv.filename, sep='\t') genes.B <- rownames(shalek.B) cells.B <- colnames(shalek.B) cell.re.B <- str_c( '^', '(wps|tr[1234]|gp[abc]|hero|ifnr)_', '(t[[:digit:]]+)_', '(S[[:digit:]]+)_rsem', '$') names(shalek.B) cell.meta.B <- ( as.data.frame(str_match(names(shalek.B), cell.re.B)) %>% dplyr::select(ko=V2, t=V3, cell=V4)) sample_n(cell.meta.B, 20) stopifnot(all(! is.na(cell.meta.B[,1]))) which(is.na(cell.meta.B[,1])) sapply(cell.meta.B, levels) gene.meta.B <- data.frame(gene=factor(genes.B, levels=genes.B))
Define some key genes mentioned in the paper.
shalek.key.genes <- unique(toupper(c( # # Cluster I d (core antiviral module; enriched for annotated antiviral and # interferon response genes; for example, "Ifit1", "Irf7", # # Cluster III c (peaked inflammatory module; showing rapid, # yet transient, induction under LPS; for example, "Tnf", "Il1a", "Cxcl2", # # Cluster III d (sustained inflammatory module; exhibiting # continued rise in expression under LPS; for example, "Mmp14", "Marco", "Il6", # # Cluster III b (‘maturity’ module; containing markers of # dendritic cell maturation; for example, "Cd83", "Ccr7", "Ccl22", # # At 2 h following LPS, "Ifnb1", # was bimodally expressed # # Genes encoding key inflammatory cytokines (for example, "Tnf", "Cxcl1", # # Figure 4: core antiviral targets. "Rsad2", "Stat2" ))) # shalek.key.genes %in% genes.A stopifnot(0 == sum(! shalek.key.genes %in% genes.A))
Save the data in a format suitable for DeLorean package.
shalek.A.expr <- log(as.matrix(shalek.A)+1) shalek.A.cell.meta <- cell.meta.A shalek.A.gene.meta <- gene.meta.A # Test we can construct a DeLorean object dl <- de.lorean(shalek.A.expr, shalek.A.gene.meta, shalek.A.cell.meta) # Save save(shalek.A.expr, shalek.A.gene.meta, shalek.A.cell.meta, file='../../data/ShalekDeLorean.rda')
date()
R version and packages used:
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.