#!/usr/bin/env R
# Author: Sean Maden
# Make Figure 1A panel plot of cumulative GEO samples (GSMs) by year
# and available sample type.
library(ggplot2)
library(data.table)
library(ggpubr)
library(cowplot)
pkgname <- "recountmethylationManuscriptSupplement"
tables.dir <- system.file("extdata", "geodata", package = pkgname)
#--------------
# platform info
#--------------
lid <- list(c('GPL13534','HM450K','2011','May 2011'),
c('GPL21145', 'EPIC','2015','Nov. 2015'),
c('GPL8490', 'HM27K','2009','Apr. 2009')
)
keydf <- as.data.frame(do.call(rbind, lid))
colnames(keydf) <- c("platform_id", "platform_name",
"release_year", "release_month")
#----------------------------
# load and format data tables
#----------------------------
# specify tables
fnv <- c("gseyeardata", "gseidatyrdat")
# load sample data
dm <- do.call(rbind, lapply(fnv, function(x){
return(fread(file.path(tables.dir, x), sep = " ", data.table = FALSE))
}))
df <- as.data.frame(dm, stringsAsFactors = FALSE)
colnames(df) <- c("platformid","year","samples")
# filter platforms of interest
which.platforms <- c("GPL8490", "GPL13534", "GPL21145")
df <- df[df$platformid %in% which.platforms,]
# append sample types
stype <- c(rep(" (all)", 60), rep(" (IDAT)", 60))
df$plab <- paste0(df$platformid, stype)
# get formatted columns/labels
df$year <- as.numeric(df$year); df$pname <- "NA"
dff <- matrix(nrow = 0, ncol = 5)
for(pid in keydf[,1]){
ki <- keydf[keydf[,1] == pid,]
release.yr <- as.numeric(ki[3])
dpid <- df[df$platformid == pid,]
dpid <- dpid[dpid$year >= release.yr,]
dpid$pname <- paste0(gsub(" .*", "", ki[2]),
" ", gsub(".* ", "", dpid$plab))
dpid <- dpid[order(dpid$year),]
for(u in unique(dpid$plab)){
dpp <- dpid[dpid$plab == u,]
for(r in seq(nrow(dpp))){
dfi <- dpp[r,]
dfi[3] <- sum(dpp[seq(r),3])
dff <- rbind(dff, dfi)
}
}
}
dff <- as.data.frame(dff, stringsAsFactors = FALSE)
dff$platform <- gsub(" .*", "", dff$pname)
#---------------------
# make manuscript plot
#---------------------
# make manuscript plot
dff$`Platform (Type)` <- dff$plab
lv <- c("HM450K (all)", "HM450K (IDAT)", "EPIC (all)", "EPIC (IDAT)", "HM27K (all)", "HM27K (IDAT)")
vv <- c("green", "forestgreen", "blue", "cyan", "gold", "brown")
sv <- c(17, 17, 16, 16, 18, 18)
lname <- ""; lsize <- 1; ptsize <- 4.5
yearv <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018)
vv2 <- c("HM27K (all)" = "gold", "HM27K (IDAT)" = "brown",
"HM450K (all)" = "green", "HM450K (IDAT)" = "forestgreen",
"EPIC (all)" = "cyan", "EPIC (IDAT)" = "blue")
sv2 <- c("HM27K (all)" = 18, "HM27K (IDAT)" = 18,
"HM450K (all)" = 16, "HM450K (IDAT)" = 16,
"EPIC (all)" = 17, "EPIC (IDAT)" = 17)
dff$pname <- factor(dff$pname, levels = c("HM27K (all)","HM27K (IDAT)",
"HM450K (all)", "HM450K (IDAT)",
"EPIC (all)", "EPIC (IDAT)"))
dff$color <- dff$shape <- "NA"
for(x in names(vv2)){
dff[dff$pname == x,]$color <- vv2[x];dff[dff$pname == x,]$shape <- sv2[x]}
figS1 <- ggplot(dff, aes(x = year, y = samples)) + theme_bw() +
xlab("Year") + ylab("Cumulative studies") + xlim(2009, 2018) +
geom_line(data = dff[dff$pname == "HM27K (all)",],
aes(year, samples), color = "gold", size = lsize) +
geom_point(data = dff[dff$pname == "HM27K (all)",], aes(year, samples),
color = "gold", size = ptsize, shape = 18) +
geom_line(data = dff[dff$pname == "HM27K (IDAT)",],
aes(year, samples), color = "brown", size = lsize) +
geom_point(data = dff[dff$pname == "HM27K (IDAT)",], aes(year, samples),
color = "brown", size = ptsize, shape = 18) +
geom_line(data = dff[dff$pname == "HM450K (all)",],
aes(year, samples), color = "green", size = lsize) +
geom_point(data = dff[dff$pname == "HM450K (all)",], aes(year, samples),
color = "green", size = ptsize, shape = 16) +
geom_line(data = dff[dff$pname == "HM450K (IDAT)",],
aes(year, samples), color = "forestgreen", size = lsize) +
geom_point(data = dff[dff$pname == "HM450K (IDAT)",], aes(year, samples),
color = "forestgreen", size = ptsize, shape = 16) +
geom_line(data = dff[dff$pname == "EPIC (all)",],
aes(year, samples), color = "cyan", size = lsize) +
geom_point(data = dff[dff$pname == "EPIC (all)",], aes(year, samples),
color = "cyan", size = ptsize, shape = 17) +
geom_line(data = dff[dff$pname == "EPIC (IDAT)",],
aes(year, samples), color = "blue", size = lsize) +
geom_point(data = dff[dff$pname == "EPIC (IDAT)",], aes(year, samples),
color = "blue", size = ptsize, shape = 17) +
theme(legend.position = "none", axis.text.x = element_text(angle = 90)) +
scale_x_continuous(labels = yearv,
breaks = yearv)
# get the plot legend
figS1.lform <- ggplot(dff, aes(x = year, y = samples)) + theme_bw() +
xlab("Year") + ylab("Cumulative samples") + xlim(2009, 2018) +
geom_point(aes(colour = pname, shape = pname, size = 4)) +
geom_line(aes(colour = pname, size = 2)) +
scale_colour_manual(values = vv2) + scale_shape_manual(values = sv2) +
guides(colour = FALSE,
shape = guide_legend(override.aes = list(shape = sv2,
colour = vv2, size = 4)),
size = FALSE) +
theme(legend.title = element_blank(), legend.position = c(0.2, 0.6))
figS1.legend <- ggpubr::get_legend(figS1.lform)
# print manuscript plot
# pdf("sfig1_geo-gse-yr.pdf", 5.2, 2.9)
# print(cowplot::ggdraw(figS1) +
# cowplot::draw_plot(figS1.legend, .17, .38, .5, .5))
# dev.off()
#-------------------
# make vignette plot
#-------------------
# make plot
dff$`Platform (Type)` <- dff$plab
lv <- c("HM450K (all)", "HM450K (IDAT)", "EPIC (all)", "EPIC (IDAT)", "HM27K (all)", "HM27K (IDAT)")
vv <- c("green", "forestgreen", "blue", "cyan", "gold", "brown")
sv <- c(17, 17, 16, 16, 18, 18)
lname <- "Platform (Type)"
figS1 <- ggplot(dff, aes(year, samples)) + theme_bw() +
xlab("Year") + ylab("Cumulative Samples") +
geom_point(aes(colour = `Platform (Type)`, shape = platform)) +
guides(shape = FALSE) +
scale_shape_manual(values = sv) +
scale_colour_manual(values = vv) +
guides(colour = guide_legend(override.aes = list(shape = sv, colour = vv))) +
geom_line(data = dff[dff$pname == "HM27K (IDAT)",],
aes(year, samples), color = "gold") +
geom_line(data = dff[dff$pname == "HM27K (all)",],
aes(year, samples), color = "brown") +
geom_line(data = dff[dff$pname == "HM450K (all)",],
aes(year, samples), color = "green") +
geom_line(data = dff[dff$pname == "HM450K (IDAT)",],
aes(year, samples), color = "forestgreen") +
geom_line(data = dff[dff$pname == "EPIC (all)",],
aes(year, samples), color = "blue") +
geom_line(data = dff[dff$pname == "EPIC (IDAT)",],
aes(year, samples), color = "cyan") +
theme(legend.position = c(0.2, 0.6))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.