# knitr::opts_chunk$set(echo = TRUE, cache = TRUE) knitr::opts_chunk$set( # cache = TRUE, dpi = 60, comment = '#>', tidy = FALSE)
library(heatmaply) library(dendextend)
Here eval=FALSE
as we have the final version in the heatmaplyExamples package.
Preparing the data:
# measles # bil gates (20 Feb 2015): https://twitter.com/BillGates/status/568749655112228865 # http://graphics.wsj.com/infectious-diseases-and-vaccines/#b02g20t20w15 # http://www.opiniomics.org/recreating-a-famous-visualisation/ # http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r-2/ # source: http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r/ # measles <- read.csv("https://raw.githubusercontent.com/blmoore/blogR/master/data/measles_incidence.csv", header=T, # skip=2, stringsAsFactors=F) # dir.create("data") # write.csv(measles, file = "data\\measles.csv") # measles[measles == "-"] <- NA measles[,-(1:2)] <- apply(measles[,-(1:2)], 2, as.numeric) dim(measles) # head(measles) d2 <- aggregate(measles, list(measles[, "YEAR"]), "sum", na.rm = T) rownames(measles) <- d2[,1] d2 <- d2[, -c(1:4)] d2 <- t(measles) # head(measles) dim(measles) d2[1:5,1:5] # head(md2) # # # per week # d22 <- aggregate(measles, list(measles$YEAR, measles$WEEK), "sum", na.rm = T) # rownames(measles2) <- paste(measles2[,1], d22[,2], sep = "_") # d22 <- d22[, -c(1:5)] # d22 <- t(measles2) # md22 <- reshape2::melt(measles2) # dim(measles2) # 51 3952
# pander::pander(sqrt(measles)) if(!file.exists("data\\sqrt_d2.csv")) write.csv(sqrt(measles), file = "data\\sqrt_d2.csv")
Let's have measles in a long format.
library(heatmaplyExamples) data(measles) dim(measles) md2 <- reshape2::melt(measles) head(md2)
We can look at the general timeline trend with different transformations.
library(ggplot2) ggplot(md2, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) + geom_smooth(size = 2) ggplot(md2, aes(x = Var2, y = sqrt(value))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2) ggplot(md2, aes(x = Var2, y = log(value+.1))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2) # ggplot(md22, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) # + geom_smooth(size = 2) # # we miss the per state information and the patters of similarity
We can try several heatmaps settings until we get one that works. Notice the effect of the color scheme.
library(gplots) library(viridis) heatmap.2(measles) heatmap.2(measles, margins = c(3, 9)) heatmap.2(measles,trace = "none", margins = c(3, 9)) heatmap.2(measles, trace = "none", margins = c(3, 9), dendrogram = "row", Colv = NULL) # heatmap.2(sqrt(measles), Colv = NULL, trace = "none", margins = c(3, 9)) library(viridis) heatmap.2(measles, Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9)) heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9)) # hist(measles, col = ) library(ggplot2) library(viridis) hp <- qplot(x = as.numeric(measles), fill = ..count.., geom = "histogram") hp + scale_fill_viridis(direction = -1) + xlab("Measles incedence") + theme_bw()
Using scaling doesn't make much sense in this data.
heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9), col = viridis(200), scale = "row" ) heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9), col = viridis(200), scale = "column" )
Decisions for clustering (highlight_branches helps identify the topology of the dendrogram, it colors each branch based on its height):
DATA <- measles d <- dist(sqrt(DATA)) library(dendextend) dend_expend(d)[[3]] # average seems to make the most sense dend_row <- d %>% hclust(method = "average") %>% as.dendrogram dend_row %>% highlight_branches %>% plot
What would be the optimal number of clusters? (it seems that 3 would be good)
library(dendextend) dend_k <- find_k(dend_row) plot(dend_k)
Let's plot it:
Rowv <- dend_row %>% color_branches(k = 3) heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9)) Rowv <- dend_row %>% color_branches(k = 3) %>% seriate_dendrogram(x=d) heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
All of this work (other then finding that the hclust method should be average) can simply be done using the following code to produce an interactive heatmap using heatmaply:
library(heatmaply) heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average", fontsize_row = 8,fontsize_col = 6, k_row = NA, margins = c(60,170, 70,40), xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine" )
We can also use plot_method = "plotly", row_dend_left = TRUE to make the plot more similar to heatmap.2 using the plotly engine (this is a bit more experimental. The default, plot_method = "ggplot", is often safer to use - but it does not yet support row_dend_left = TRUE properly).
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average", fontsize_row = 8,fontsize_col = 6, k_row = NA, margins = c(60,50, 70,90), xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine", plot_method = "plotly", row_dend_left = TRUE )
# save.image("example.Rdata")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.