library ("pkgstatsAnalyses")
# Project root, and the directory holding cached vignette data.
here <- here::here ()
v_data_dir <- file.path (here, "vignettes", "data")

# Raw input data, cached intermediate results, and the rendered figure.
datafile <- file.path (here, "data-raw", "pkgstats-results.Rds")
f_fig2_top10 <- file.path (v_data_dir, "fig02-top10.Rds")
f_fig2_tr <- file.path (v_data_dir, "fig02-tr.Rds")
fig02_png <- file.path (here, "vignettes", "figures", "_fig02.png")

# A data set is flagged for (re-)calculation only when neither its cached
# file nor the final figure exists.
# NOTE(review): these flags are not referenced again in the visible part of
# this script - confirm where they are consumed.
fig2_calc_top10_data <-
    !(file.exists (f_fig2_top10) | file.exists (fig02_png))
fig2_calc_tr_data <-
    !(file.exists (f_fig2_tr) | file.exists (fig02_png))
# ---- Data for panel B: most frequently listed URL domains per year ----
# Load the full raw results (`latest = FALSE`).
x <- load_pkgstats_data (datafile, raw = TRUE, latest = FALSE)

# For each year: split the `urls` field on commas, semicolons and white
# space, strip the leading "http(s)://" and everything from the first "/"
# onwards, then count the occurrences of each remaining domain.
x <- lapply (split (x, f = as.factor (x$year)), function (i) {
    urls <- unlist (strsplit (i$urls, ",|;|\\n|\\s"))
    urls <- gsub ("^(\\s*?)http(s?)\\:\\/\\/", "", urls [!is.na (urls)])
    urls <- table (gsub ("\\/.*$", "", urls))
    urls <- sort (urls, decreasing = TRUE)
    data.frame (
        year = rep (i$year [1], length (urls)),
        url = names (urls),
        n = as.integer (urls)
    )
})
x <- do.call (rbind, x)
# Splitting on several separators leaves empty strings behind; drop them.
x <- x [x$url != "", ]
rownames (x) <- NULL
x$year <- lubridate::year (paste0 (x$year, "-01-01"))

# Total count of each domain over all years, most frequent first.
top10 <- x |>
    group_by (url) |>
    summarise (n = sum (n)) |>
    arrange (desc (n))
# Exclude CRAN / r-project domains. A logical mask is used instead of
# `-grep ()`, because a negative index built from an empty match vector
# selects zero rows and would silently discard the whole table.
top10 <- top10 [!grepl ("^(cran|www\\.r-project)", top10$url), ]

# `head ()` avoids padding with NA when fewer than 10 domains remain.
x_top10 <- x [x$url %in% head (top10$url, 10), ]
# Counts for "github.com" are divided by 20 (presumably so that they fit on
# the same axis as the other domains - confirm against the figure caption).
is_github <- x_top10$url == "github.com"
x_top10$n [is_github] <- x_top10$n [is_github] / 20

saveRDS (x_top10, f_fig2_top10)
# ---- Data for panel A: translation counts per year ----
x <- load_pkgstats_data (datafile, raw = TRUE, latest = FALSE)
# Literal "NA" strings are converted to real missing values.
is_na_string <- x$translations == "NA"
x$translations [is_na_string] <- NA_character_

# For each year: split the comma-separated `translations` field, trim
# surrounding white space, and count each translation code.
tr <- lapply (split (x, f = as.factor (x$year)), function (i) {
    codes <- unlist (strsplit (i$translations, ","))
    codes <- codes [which (!is.na (codes))]
    codes <- gsub ("^\\s*|\\s*$", "", codes)
    tab <- sort (table (codes), decreasing = TRUE)
    data.frame (
        translation = names (tab),
        n = as.integer (tab)
    )
})
# Keep only years with at least one translation, and label each table with
# its year (taken from the list names produced by `split ()`).
tr <- Filter (function (d) nrow (d) > 0L, tr)
tr <- Map (function (d, yr) {
    d$year <- yr
    d
}, tr, names (tr))
tr <- do.call (rbind, tr)
rownames (tr) <- NULL
tr$year <- lubridate::year (paste0 (tr$year, "-01-01"))

# Total number of distinct packages in each year, matched onto `tr`.
n <- summarise (group_by (x, year), n = length (unique (package)))
tr$ntot <- n$n [match (tr$year, n$year)]

# The ten translations with the highest totals over all years.
tr_summary <- arrange (
    summarise (group_by (tr, translation), n = sum (n)),
    desc (n)
)
top10 <- tr_summary$translation [seq_len (10L)]
tr_top10 <- tr [tr$translation %in% top10, ]

# Yearly totals over all translations, labelled "ALL".
tr_all <- summarise (
    group_by (tr, year),
    n = sum (n),
    ntot = head (ntot, 1)
)
tr_all$translation <- "ALL"

tr_top10 <- bind_rows (tr_top10, tr_all)

saveRDS (tr_top10, f_fig2_tr)
# Reload the cached translation counts and express each count as a
# proportion of the total number of packages in that year.
tr <- readRDS (f_fig2_tr)
tr [["proportion"]] <- tr [["n"]] / tr [["ntot"]]

# Yearly rate of change in the overall ("ALL") translation proportion.
#
# Fits the linear model `proportion ~ year` to the rows of `tr` whose
# `translation` is "ALL" and whose `year` is at least `year`.
#
# tr:   data frame with columns `year`, `translation` and `proportion`.
# year: first year included in the fit (default 2010).
#
# Returns the fitted slope multiplied by 100, as a length-one numeric named
# "year"; negative values indicate a decrease over time.
decr_year <- function (tr, year = 2010) {
    keep <- tr$translation == "ALL" & tr$year >= year
    fit <- lm (proportion ~ year, data = tr [keep, ])
    slope <- coef (fit) [2]
    slope * 100
}
# Rates of change for fits starting in 2008, 2010 and 2015 respectively.
d <- do.call (c, lapply (
    c (2008, 2010, 2015),
    function (start) decr_year (tr, start)
))

# Fixed colour for each translation code. The values were taken from the
# default ggplot2 palette via `ggplot_build (p1)$data`:
# tr |>
#     ggplot (aes (x = year, y = n, color = translation)) +
#     geom_line () -> p1
# dat <- ggplot_build (p1)$data [[1]]
# unique (dat$colour)
cols <- c (
    de = "#F8766D",
    "en@quot" = "#D89000",
    es = "#A3A500",
    fr = "#39B600",
    it = "#00BF7D",
    ja = "#00BFC4",
    ko = "#00B0F6",
    pl = "#9590FF",
    pt_BR = "#E76BF3",
    ru = "#FF62BC"
    # "ALL" = "#111111"
)

# "ALL" is removed before plotting because it doesn't add much.
tr <- filter (tr, translation != "ALL")

# Panel A: yearly proportion for each of the top 10 translations, one line
# per translation, using the fixed colours in `cols`.
p1 <- ggplot (tr, aes (x = year, y = proportion, color = translation)) +
    geom_line () +
    scale_colour_manual (values = cols) +
    ggtitle ("A: Top 10 translations") +
    guides (color = guide_legend (ncol = 2))

# Panel B: yearly counts for the top 10 URL domains, read from the cached
# file, one line per domain.
# Disabled legend placement, kept for reference:
# theme (legend.position = c (0.2, 0.7),
#        legend.background = element_rect(fill='transparent', colour='transparent')) +
p2 <- ggplot (readRDS (f_fig2_top10), aes (x = year, y = n, color = url)) +
    geom_line () +
    ggtitle ("B: Top 10 URLs") +
    guides (color = guide_legend (ncol = 2))

# Render both panels, combined with `/` (presumably patchwork's vertical
# stacking - confirm), to a 7 x 7 inch PNG at 300 dpi.
png (fig02_png, width = 7, height = 7, units = "in", res = 300)
# Print explicitly: a bare `p1 / p2` is only drawn when R auto-prints
# top-level values, which does not happen under `source ()` and would leave
# an empty file. `dev.off ()` in `finally` closes only the device opened
# above (unlike `graphics.off ()`, which closes every open device), and does
# so even if drawing fails.
invisible (tryCatch (print (p1 / p2), finally = dev.off ()))
knitr::include_graphics (fig02_png)


# mpadge/pkgstats-analyses documentation built on Dec. 10, 2022, 4:20 a.m.