In Ironholds/practice: Tools for identifying whether R packages follow common software engineering conventions

# Document-wide chunk defaults: hide the code and any messages, and render
# figures at 300 dpi.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  dpi = 300
)

# Every plot below uses ggplot2's black-and-white theme.
ggplot2::theme_set(ggplot2::theme_bw())

library(practice)
library(plyr)
library(dplyr)
library(ggplot2)
library(scales)
# Load the CRANpractices dataset explicitly. Original author's note: this
# appeared to be necessary when the first reference to the data is an inline
# expression -- NOTE(review): the reason has not been confirmed.
data("CRANpractices")

Temporary workaround: date info

Until CRANpractices is rebuilt with more information about each package's first release date, most recent update, and number of releases, I'm going to grab what I can from CRANmetadata.

# Summarise each package's release timeline from CRANmetadata:
#   first_date - earliest date in the timeline
#   pub_date   - latest date in the timeline
#   nrel       - number of timeline entries (releases)
# The names of the CRANmetadata list become the `package` column.
date_stuff <- ldply(CRANmetadata, function(x) {
  y <- as.Date(unlist(x$timeline))
  # Base data.frame() is enough here and avoids the deprecated data_frame().
  data.frame(first_date = min(y),
             pub_date = max(y),
             nrel = length(y))
}, .id = "package")

# ldply() turns the id column into a factor when `.id` is given; use plain
# character keys for the join.
# NOTE(review): assumes CRANpractices$package is character -- confirm.
date_stuff$package <- as.character(date_stuff$package)

# Join on the package name explicitly, so the key cannot silently change if
# CRANpractices later gains columns that date_stuff also has.
CRANpractices <- left_join(CRANpractices, date_stuff, by = "package")

Frequency tables and barcharts

Many variables take on a small number of values and we can digest their frequencies en masse.

There are `r nrow(CRANpractices)` packages in this dataset.

## variables that make reasonable factors, w/ a modest number of levels
j_vars <- c('casing', 'alphanumeric', 'upstream_repo',
            'versioning', 'testing', 'roxygen', 'changelog')
j_vars <- setNames(j_vars, j_vars)
j_ft <- lapply(j_vars, function(x) {
  CRANpractices %>% count_(vars = list(x), sort = TRUE)
})
j_ft2 <- lapply(j_ft, function(x) {
  what <- names(x)[1]
  dots <- list(what)
  x <- x %>%
    rename_(.dots = setNames(dots, "val")) %>% 
    mutate(val = as.character(val),
           var = what,
           bar = paste(var, val, sep = ".")) %>% 
    select(bar, var, val, n)
})
j_ft3 <- bind_rows(j_ft2)
j_ft3$bar <- factor(j_ft3$bar, rev(j_ft3$bar))

# Horizontal bars (thick segments drawn from x = 0) with the count printed
# just past the end of each bar, one facet row per variable.
p <- ggplot(data = j_ft3, mapping = aes(x = n, y = bar))
p +
  geom_segment(mapping = aes(yend = bar), xend = 0, lwd = 3,
               colour = "grey50") +
  geom_text(mapping = aes(label = n), hjust = -0.1, size = 4) +
  facet_grid(var ~ ., scales = "free_y", space = "free_y") +
  theme(
    strip.background = element_blank(),
    strip.text = element_blank(),
    axis.title.y = element_blank()
  ) +
  # Widen the x axis so the count labels are not clipped.
  expand_limits(x = 7000)

Variables in CRANpractices that are not covered above:

# Column names of CRANpractices that are not among the variables in j_vars.
setdiff(x = colnames(CRANpractices), y = j_vars)

License

# Count packages per license, most common first. The factor levels are
# reversed so the most common license plots at the top; licenses with four or
# fewer packages are then dropped.
lic_dat <- count(CRANpractices, license, sort = TRUE)
lic_dat <- mutate(lic_dat, license = factor(license, levels = rev(license)))
lic_dat <- filter(lic_dat, n > 4)

p <- ggplot(data = lic_dat, mapping = aes(x = n, y = license))
p +
  geom_segment(mapping = aes(yend = license), xend = 0, lwd = 3,
               colour = "grey50") +
  geom_text(mapping = aes(label = n), hjust = -0.1, size = 4) +
  # Leave room on the right for the count labels.
  expand_limits(x = 2800)

Downloads

# Base plot: downloads on a log10 x axis, restricted to packages whose
# roxygen status is known.
p <- ggplot(
  filter(CRANpractices, !is.na(roxygen)),
  aes(x = downloads)
) +
  scale_x_log10(breaks = 10 ^ (1:5), labels = comma)

# Overall density, then densities split by individual practices.
p + geom_density()
p + geom_density(mapping = aes(fill = roxygen), alpha = 0.5)
# Alternative kept for reference: one facet per testing approach.
# p + geom_density(aes(fill = testing)) + facet_grid(testing ~ .)
p + geom_density(mapping = aes(fill = testing), alpha = 0.5)
p + geom_density(mapping = aes(fill = upstream_repo), alpha = 0.5)

# Same base plot as for the densities: log10 downloads for packages with a
# known roxygen status.
p <- ggplot(
  filter(CRANpractices, !is.na(roxygen)),
  aes(x = downloads)
) +
  scale_x_log10(breaks = 10 ^ (1:5), labels = comma)

# Jittered points, one plot per practice on the y axis.
p + geom_jitter(mapping = aes(y = roxygen), alpha = 0.5)
p + geom_jitter(mapping = aes(y = testing), alpha = 0.5)
p + geom_jitter(mapping = aes(y = upstream_repo), alpha = 0.5)

# Per upstream repo type: number of packages with fewer than one download
# (no_dld) versus at least one download (yes_dld).
summarise(
  group_by(CRANpractices, upstream_repo),
  no_dld = sum(downloads < 1),
  yes_dld = sum(downloads >= 1)
)

I find it hard to believe that organic human-driven downloads would hit essentially every single package on CRAN within a month. Are there automated systems that, e.g., download CRAN in its entirety as a matter of policy?

Vignettes

# Combine vignette format and builder into one "format+builder" label and
# count the packages per combination. The factor levels are reversed so the
# most common combination plots at the top; combinations that occur only once
# are dropped.
vig_dat <- CRANpractices %>%
  mutate(bar = paste(vignette_format, vignette_builder, sep = "+")) %>%
  count(bar, sort = TRUE) %>%
  mutate(bar = factor(bar, levels = rev(bar))) %>%
  filter(n > 1)

p <- ggplot(data = vig_dat, mapping = aes(x = n, y = bar))
p +
  geom_segment(mapping = aes(yend = bar), xend = 0, lwd = 3,
               colour = "grey50") +
  geom_text(mapping = aes(label = n), hjust = -0.1, size = 4) +
  # Leave room on the right for the count labels.
  expand_limits(x = 5500) +
  theme(axis.title.y = element_blank())

Number of releases

# Number of packages for each release count, ordered by release count.
nrel_dat <- arrange(count(CRANpractices, nrel, sort = TRUE), nrel)

# One horizontal bar per release count; the y axis is reversed so small
# release counts appear at the top.
p <- ggplot(data = nrel_dat, mapping = aes(x = n, y = nrel))
p +
  geom_segment(mapping = aes(yend = nrel), xend = 0, lwd = 3,
               colour = "grey50") +
  # Count labels are switched off for this plot:
  # geom_text(aes(label = n), hjust = -0.1, size = 4) +
  ylab("Number of releases") +
  scale_y_reverse() +
  expand_limits(x = 1500)

Date of first and most recent CRAN version

# Date of the most recent CRAN version, one point per package. The y position
# is a constant that only serves as a base for the vertical jitter, so the
# y axis title and labels are hidden.
p <- ggplot(CRANpractices, aes(x = pub_date, y = 1))
p + geom_jitter(alpha = 0.5) +
  theme(axis.title.y = element_blank(), axis.text.y = element_blank())

# Number of releases against the date of the first CRAN version.
p <- ggplot(CRANpractices, aes(x = first_date, y = nrel))
# alpha is a fixed transparency, so it is set as a layer parameter. Inside
# aes() the constant is treated as data and passed through the alpha scale,
# which does not draw it at 0.3 and adds a legend that then had to be hidden.
p + geom_jitter(alpha = 0.3) +
  ylab("Number of releases")

Practices by date of most recent CRAN version

Left: various approaches to testing against date of most recent version.

Right: Frequency of testing for packages that provide an upstream repo (almost all are GitHub) vs. those that do not. A proportional bar chart would be nice, but I don't have one yet. Having such a repo appears to be strongly correlated with having tests.

# Testing approach against the date of the most recent CRAN version.
p <- ggplot(CRANpractices, aes(x = pub_date, y = testing))
# alpha is a fixed transparency, so it is set as a layer parameter. Inside
# aes() the constant is treated as data and passed through the alpha scale,
# which does not draw it at 0.3 and adds a legend that then had to be hidden.
p + geom_jitter(alpha = 0.3)

# Bar chart of packages per upstream repo type, filled by whether the package
# has any tests (testing is collapsed to a logical: anything but "None").
# Only the three listed repo types are kept.
# Option kept for reference: also collapse upstream_repo to a logical with
#   upstream_repo = upstream_repo != "None/Other"
p <- ggplot(
  CRANpractices %>%
    filter(upstream_repo %in% c("GitHub", "None/Other", "R-Forge")) %>%
    mutate(testing = testing != "None"),
  aes(x = upstream_repo, fill = testing))
p + geom_bar()