library(heatmaply)
library(dendextend)
Here eval=FALSE
as we have the final version in the heatmaplyExamples package.
Preparing the data:
# measles
# bil gates (20 Feb 2015): https://twitter.com/BillGates/status/568749655112228865
# http://graphics.wsj.com/infectious-diseases-and-vaccines/#b02g20t20w15
# http://www.opiniomics.org/recreating-a-famous-visualisation/
# http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r-2/
# source: http://www.r-bloggers.com/recreating-the-vaccination-heatmaps-in-r/
# measles <- read.csv("https://raw.githubusercontent.com/blmoore/blogR/master/data/measles_incidence.csv", header=T,
# skip=2, stringsAsFactors=F)
# dir.create("data")
# write.csv(measles, file = "data\\measles.csv")
# measles[measles == "-"] <- NA
measles[,-(1:2)] <- apply(measles[,-(1:2)], 2, as.numeric)
dim(measles)
# head(measles)
d2 <- aggregate(measles, list(measles[, "YEAR"]), "sum", na.rm = T)
rownames(measles) <- d2[,1]
d2 <- d2[, -c(1:4)]
d2 <- t(measles)
# head(measles)
dim(measles)
d2[1:5,1:5]
# head(md2)
#
# # per week
# d22 <- aggregate(measles, list(measles$YEAR, measles$WEEK), "sum", na.rm = T)
# rownames(measles2) <- paste(measles2[,1], d22[,2], sep = "_")
# d22 <- d22[, -c(1:5)]
# d22 <- t(measles2)
# md22 <- reshape2::melt(measles2)
# dim(measles2) # 51 3952
# pander::pander(sqrt(measles))
if(!file.exists("data\\sqrt_d2.csv")) write.csv(sqrt(measles), file = "data\\sqrt_d2.csv")
Let’s have measles in a long format.
library(heatmaplyExamples)
data(measles)
dim(measles)
#> [1] 59 72
md2 <- reshape2::melt(measles)
head(md2)
#> Var1 Var2 value
#> 1 ALABAMA 1930 4389
#> 2 ALASKA 1930 0
#> 3 AMERICAN.SAMOA 1930 0
#> 4 ARIZONA 1930 2107
#> 5 ARKANSAS 1930 996
#> 6 CALIFORNIA 1930 43585
We can look at the general timeline trend with different transformations.
library(ggplot2)
ggplot(md2, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(md2, aes(x = Var2, y = sqrt(value))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(md2, aes(x = Var2, y = log(value+.1))) + geom_line(aes(group = Var1)) + geom_smooth(size = 2)
#> `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# ggplot(md22, aes(x = Var2, y = value)) + geom_line(aes(group = Var1)) # + geom_smooth(size = 2)
#
# we miss the per state information and the patters of similarity
We can try several heatmaps settings until we get one that works. Notice the effect of the color scheme.
library(gplots)
library(viridis)
heatmap.2(measles)
heatmap.2(measles, margins = c(3, 9))
heatmap.2(measles,trace = "none", margins = c(3, 9))
heatmap.2(measles, trace = "none", margins = c(3, 9), dendrogram = "row", Colv = NULL)
# heatmap.2(sqrt(measles), Colv = NULL, trace = "none", margins = c(3, 9))
library(viridis)
heatmap.2(measles, Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(measles, Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.
heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, trace = "none", col =
#> viridis(200), : Discrepancy: Colv is FALSE, while dendrogram is `both'.
#> Omitting column dendogram.
# hist(measles, col = )
library(ggplot2)
library(viridis)
hp <- qplot(x = as.numeric(measles), fill = ..count.., geom = "histogram")
hp + scale_fill_viridis(direction = -1) + xlab("Measles incedence") + theme_bw()
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Using scaling doesn’t make much sense in this data.
heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9),
col = viridis(200),
scale = "row"
)
heatmap.2(measles, dendrogram = "row", Colv = NULL, trace = "none", margins = c(3, 9),
col = viridis(200),
scale = "column"
)
Decisions for clustering (highlight_branches helps identify the topology of the dendrogram, it colors each branch based on its height):
DATA <- measles
d <- dist(sqrt(DATA))
library(dendextend)
dend_expend(d)[[3]]
#> dist_methods hclust_methods optim
#> 1 unknown ward.D 0.8435986
#> 2 unknown ward.D2 0.8499247
#> 3 unknown single 0.8600853
#> 4 unknown complete 0.8656224
#> 5 unknown average 0.8774877
#> 6 unknown mcquitty 0.8447603
#> 7 unknown median 0.8682800
#> 8 unknown centroid 0.8508883
# average seems to make the most sense
dend_row <- d %>% hclust(method = "average") %>% as.dendrogram
dend_row %>% highlight_branches %>% plot
What would be the optimal number of clusters? (it seems that 3 would be good)
library(dendextend)
dend_k <- find_k(dend_row)
plot(dend_k)
Let’s plot it:
Rowv <- dend_row %>% color_branches(k = 3)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.
Rowv <- dend_row %>% color_branches(k = 3) %>% seriate_dendrogram(x=d)
heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace = "none", col = viridis(200), margins = c(3, 9))
#> Warning in heatmap.2(sqrt(measles), Colv = NULL, Rowv = Rowv, trace =
#> "none", : Discrepancy: Colv is FALSE, while dendrogram is `both'. Omitting
#> column dendogram.
All of this work (other then finding that the hclust method should be average) can simply be done using the following code to produce an interactive heatmap using heatmaply:
library(heatmaply)
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average",
fontsize_row = 8,fontsize_col = 6,
k_row = NA, margins = c(60,170, 70,40),
xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine"
)
We can also use plot_method = “plotly”, row_dend_left = TRUE to make the plot more similar to heatmap.2 using the plotly engine (this is a bit more experimental. The default, plot_method = “ggplot”, is often safer to use - but it does not yet support row_dend_left = TRUE properly).
heatmaply(sqrt(measles), Colv = NULL, hclust_method = "average",
fontsize_row = 8,fontsize_col = 6,
k_row = NA, margins = c(60,50, 70,90),
xlab = "Year", ylab = "State", main = "Project Tycho's heatmap visualization \nof the measles vaccine",
plot_method = "plotly", row_dend_left = TRUE
)
# save.image("example.Rdata")
sessionInfo()
#> R version 3.4.1 (2017-06-30)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 7 x64 (build 7601) Service Pack 1
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=Hebrew_Israel.1255 LC_CTYPE=Hebrew_Israel.1255
#> [3] LC_MONETARY=Hebrew_Israel.1255 LC_NUMERIC=C
#> [5] LC_TIME=Hebrew_Israel.1255
#>
#> attached base packages:
#> [1] parallel stats graphics grDevices datasets utils methods
#> [8] base
#>
#> other attached packages:
#> [1] gplots_3.0.1 limma_3.22.7
#> [3] ALL_1.7.1 Biobase_2.26.0
#> [5] BiocGenerics_0.12.1 dendextend_1.6.0
#> [7] glmnet_2.0-10 foreach_1.4.3
#> [9] Matrix_1.2-10 bindrcpp_0.2
#> [11] heatmaplyExamples_0.1.0 heatmaply_0.11.0
#> [13] viridis_0.4.0 viridisLite_0.2.0
#> [15] plotly_4.7.1 ggplot2_2.2.1.9000
#> [17] knitr_1.16 installr_0.19.0
#> [19] stringr_1.2.0
#>
#> loaded via a namespace (and not attached):
#> [1] httr_1.2.1 tidyr_0.6.3 jsonlite_1.5
#> [4] gtools_3.5.0 shiny_1.0.3 assertthat_0.2.0
#> [7] stats4_3.4.1 yaml_2.1.14 robustbase_0.92-7
#> [10] backports_1.1.0 lattice_0.20-35 glue_1.1.1
#> [13] digest_0.6.12 RColorBrewer_1.1-2 colorspace_1.3-2
#> [16] httpuv_1.3.5 htmltools_0.3.6 plyr_1.8.4
#> [19] pkgconfig_2.0.1 devtools_1.13.2 xtable_1.8-2
#> [22] purrr_0.2.2.2 mvtnorm_1.0-6 scales_0.5.0
#> [25] gdata_2.18.0 whisker_0.3-2 tibble_1.3.4
#> [28] mgcv_1.8-17 withr_2.0.0 nnet_7.3-12
#> [31] lazyeval_0.2.0 mime_0.5 magrittr_1.5
#> [34] mclust_5.3 memoise_1.1.0 evaluate_0.10.1
#> [37] nlme_3.1-131 MASS_7.3-47 class_7.3-14
#> [40] tools_3.4.1 registry_0.3 data.table_1.10.4
#> [43] trimcluster_0.1-2 kernlab_0.9-25 munsell_0.4.3
#> [46] cluster_2.0.6 fpc_2.1-10 compiler_3.4.1
#> [49] caTools_1.17.1 rlang_0.1.2 grid_3.4.1
#> [52] iterators_1.0.8 htmlwidgets_0.9 crosstalk_1.0.0
#> [55] labeling_0.3 bitops_1.0-6 rmarkdown_1.6
#> [58] gtable_0.2.0 codetools_0.2-15 flexmix_2.3-14
#> [61] TSP_1.1-5 reshape2_1.4.2 R6_2.2.2
#> [64] seriation_1.2-2 gridExtra_2.2.1 prabclus_2.2-6
#> [67] dplyr_0.7.3.9000 bindr_0.1 rprojroot_1.2
#> [70] KernSmooth_2.23-15 modeltools_0.2-21 stringi_1.1.5
#> [73] Rcpp_0.12.12 gclus_1.3.1 DEoptimR_1.0-8
#> [76] diptest_0.75-7