# Chunk defaults for the report: do not echo code, suppress messages, and
# render figures at 300 dpi.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  dpi = 300
)

# Use the black-and-white theme as the default for every ggplot that follows.
ggplot2::theme_set(ggplot2::theme_bw())
# Attach order matters: plyr comes before dplyr so that dplyr's verbs
# (count, mutate, rename, summarize, ...) mask plyr's same-named functions.
library(practice)
library(plyr)
library(dplyr)
library(ggplot2)
library(scales)

# apparently this IS needed if first call is inline?
data("CRANpractices")
Until `CRANpractices` gets rebuilt with more info about a package's birthdate, most recent update, and number of releases, I'm going to grab what I can from `CRANmetadata`.
# Derive release-history columns from CRANmetadata, one row per package:
#   first_date - earliest date found in the package's timeline
#   pub_date   - latest date found in the package's timeline
#   nrel       - number of timeline entries
date_stuff <- ldply(CRANmetadata, function(x) {
  y <- as.Date(unlist(x$timeline))
  # Base data.frame() replaces the deprecated data_frame(); ldply() binds the
  # per-package pieces into one data frame either way.
  data.frame(first_date = min(y), pub_date = max(y), nrel = length(y))
}, .id = "package")

# Attach the date columns to CRANpractices. The key is named explicitly so the
# join does not silently depend on whichever columns the two tables share.
CRANpractices <- left_join(CRANpractices, date_stuff, by = "package")
Many variables take on a small number of values and we can digest their frequencies en masse.
There are `r nrow(CRANpractices)` packages in this dataset.
## variables that make reasonable factors, w/ a modest number of levels
j_vars <- c(
  "casing", "alphanumeric", "upstream_repo", "versioning",
  "testing", "roxygen", "changelog"
)
# Name the vector by itself so lapply() returns a list keyed by variable name.
j_vars <- setNames(j_vars, j_vars)

# One frequency table per variable, most frequent value first.
# `.data[[x]]` looks the column up by its string name; it replaces the
# deprecated standard-evaluation helper count_().
j_ft <- lapply(j_vars, function(x) {
  CRANpractices %>%
    count(.data[[x]], sort = TRUE)
})

# Bring every table to the same layout (bar, var, val, n) so they can be
# stacked: `var` is the variable name, `val` its value as character, and `bar`
# a "var.val" label.
j_ft2 <- lapply(j_ft, function(x) {
  what <- names(x)[1]
  # The first column holds the counted variable; renaming it in place replaces
  # the deprecated rename_() call.
  names(x)[1] <- "val"
  x %>%
    mutate(
      val = as.character(val),
      var = what,
      bar = paste(var, val, sep = ".")
    ) %>%
    select(bar, var, val, n)
})

j_ft3 <- bind_rows(j_ft2)
# Fix the factor levels to the reverse of the current row order so the plot
# draws the first row at the top of the y axis.
j_ft3$bar <- factor(j_ft3$bar, levels = rev(j_ft3$bar))
# Horizontal bars (drawn as thick segments from x = 0) with the count printed
# next to each one, one facet row per variable. Facet strips and the y-axis
# title are hidden; the x range is widened to 7000 to leave room for labels.
p <- ggplot(data = j_ft3, mapping = aes(x = n, y = bar))
p +
  geom_segment(aes(yend = bar), xend = 0, lwd = 3, colour = "grey50") +
  geom_text(aes(label = n), hjust = -0.1, size = 4) +
  facet_grid(var ~ ., scales = "free_y", space = "free_y") +
  theme(
    strip.background = element_blank(),
    strip.text = element_blank(),
    axis.title.y = element_blank()
  ) +
  expand_limits(x = 7000)
Variables in `CRANpractices` that are not covered above:
# Column names of CRANpractices that are not among the summarised j_vars.
setdiff(x = colnames(CRANpractices), y = j_vars)
# License frequencies, most common first. The factor levels are fixed from the
# full table (reversed row order) before licenses with 4 or fewer packages are
# dropped.
lic_dat <- CRANpractices %>%
  count(license, sort = TRUE) %>%
  mutate(license = factor(license, levels = rev(license))) %>%
  filter(n > 4)

# Same bar-plus-label display as the frequency plot, one bar per license.
p <- ggplot(data = lic_dat, mapping = aes(x = n, y = license))
p +
  geom_segment(aes(yend = license), xend = 0, lwd = 3, colour = "grey50") +
  geom_text(aes(label = n), hjust = -0.1, size = 4) +
  expand_limits(x = 2800)
# Base plot: downloads on a log10 x axis, restricted to packages whose
# `roxygen` value is not missing (the same filter applies to every plot below).
p <- ggplot(CRANpractices %>% filter(!is.na(roxygen)), aes(x = downloads)) +
  scale_x_log10(breaks = 10 ^ (1:5), labels = comma)

# Overall density, then densities split by roxygen, testing and upstream repo.
p + geom_density()
p + geom_density(aes(fill = roxygen), alpha = 0.5)
# Disabled faceted alternative, kept on its own line so it cannot comment out
# the plots that follow it.
#p + geom_density(aes(fill = testing)) + facet_grid(testing ~ .)
p + geom_density(aes(fill = testing), alpha = 0.5)
p + geom_density(aes(fill = upstream_repo), alpha = 0.5)
# Base plot: downloads on a log10 x axis for packages with a non-missing
# `roxygen` value.
p <- ggplot(
  data = CRANpractices %>% filter(!is.na(roxygen)),
  mapping = aes(x = downloads)
) +
  scale_x_log10(breaks = 10 ^ (1:5), labels = comma)

# Jittered points of downloads against each practice in turn.
p + geom_jitter(aes(y = roxygen), alpha = 0.5)
p + geom_jitter(aes(y = testing), alpha = 0.5)
p + geom_jitter(aes(y = upstream_repo), alpha = 0.5)
# Per upstream repo: how many packages have fewer than one download (no_dld)
# and how many have at least one (yes_dld).
CRANpractices %>%
  group_by(upstream_repo) %>%
  summarise(
    no_dld = sum(downloads < 1),
    yes_dld = sum(downloads >= 1)
  )
I find it hard to believe that organic human-driven downloads would hit essentially every single package on CRAN within a month. Are there automated systems that, e.g., download CRAN in its entirety as a matter of policy?
# Frequency of each "vignette_format+vignette_builder" combination, most common
# first. The factor levels are fixed from the full table (reversed row order)
# before combinations that occur only once are dropped.
vig_dat <- CRANpractices %>%
  mutate(bar = paste(vignette_format, vignette_builder, sep = "+")) %>%
  count(bar, sort = TRUE) %>%
  mutate(bar = factor(bar, levels = rev(bar))) %>%
  filter(n > 1)

# Bar-plus-label display, one bar per combination, without a y-axis title.
p <- ggplot(data = vig_dat, mapping = aes(x = n, y = bar))
p +
  geom_segment(aes(yend = bar), xend = 0, lwd = 3, colour = "grey50") +
  geom_text(aes(label = n), hjust = -0.1, size = 4) +
  expand_limits(x = 5500) +
  theme(axis.title.y = element_blank())
# Number of packages for each release count, ordered by release count.
# count()'s sort = TRUE was dropped: arrange(nrel) fully determines the row
# order because each nrel value appears once after counting.
nrel_dat <- CRANpractices %>%
  count(nrel) %>%
  arrange(nrel)

# Horizontal bars from x = 0, with the y axis reversed so that small release
# counts sit at the top.
p <- ggplot(nrel_dat, aes(x = n, y = nrel))
p +
  geom_segment(aes(yend = nrel), xend = 0, lwd = 3, colour = "grey50") +
  # Count labels are disabled; the comment sits on its own line so the layers
  # after it are still added to the plot.
  #geom_text(aes(label = n), hjust = -0.1, size = 4) +
  ylab("Number of releases") +
  scale_y_reverse() +
  expand_limits(x = 1500)
# Strip plot of most-recent-release dates. y is a dummy constant, so the y-axis
# title and tick labels are hidden.
p <- ggplot(CRANpractices, aes(x = pub_date, y = 1))
p +
  geom_jitter(alpha = 0.5) +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank())

# Number of releases against date of first release.
# alpha is set as a fixed parameter, not mapped inside aes(): a constant inside
# aes() is treated as data and passed through the alpha scale, so it does not
# give an opacity of 0.3 and needs its legend suppressed. Setting it directly
# also removes the deprecated guides(alpha = FALSE) call.
p <- ggplot(CRANpractices, aes(x = first_date, y = nrel))
p +
  geom_jitter(alpha = 0.3) +
  ylab("Number of releases")
Left: various approaches to testing against date of most recent version.
Right: Frequency of testing for packages that provide an upstream repo (almost all are GitHub) vs. those that do not. A proportional bar chart would be nice, but I don't have one yet. Having such a repo appears to be strongly correlated with having tests.
# Testing approach against date of most recent release.
# alpha is set as a fixed parameter, not mapped inside aes(): a constant inside
# aes() is passed through the alpha scale and does not give an opacity of 0.3.
# Setting it directly also removes the deprecated guides(alpha = FALSE) call.
p <- ggplot(CRANpractices, aes(x = pub_date, y = testing))
p + geom_jitter(alpha = 0.3)

# Stacked counts of packages with and without tests per upstream repo, limited
# to the three listed repo categories. `testing` is collapsed to a logical:
# TRUE for any value other than "None".
p <- ggplot(
  CRANpractices %>%
    filter(upstream_repo %in% c("GitHub", "None/Other", "R-Forge")) %>%
    mutate(
      # Disabled recode of upstream_repo, kept on its own line so it cannot
      # comment out the `testing` recode below.
      #upstream_repo = upstream_repo != "None/Other",
      testing = testing != "None"
    ),
  aes(x = upstream_repo, fill = testing)
)
p + geom_bar()
links_to, links_from
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.