In tnewman63/easystats: Jump in the easyverse

easystats

# Default chunk options for every knitr chunk in this document:
# figures are rendered at 600 dpi and written under man/figures/.
knitr::opts_chunk$set(
  fig.path = "man/figures/",
  collapse = TRUE,
  dpi = 600
)

# Session-wide printing options: NA cells in kable tables are shown as an
# empty string, numbers use 4 significant digits, output width is 60.
options(
  width = 60,
  digits = 4,
  knitr.kable.NA = ''
)

# knitr::include_graphics(c("https://github.com/easystats/insight/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/bayestestR/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/parameters/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/performance/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/modelbased/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/correlation/raw/master/man/figures/logo.png",
#                           "https://github.com/easystats/report/raw/master/man/figures/logo.png"))
# 

# library(png)
# library(grid)
# library(gridExtra)
# library(RCurl)
# 
# insight <-  rasterGrob(as.raster(readPNG(getURLContent("https://github.com/easystats/insight/raw/master/man/figures/logo.png"))), interpolate = FALSE)
# bayestestR <-  rasterGrob(as.raster(readPNG(getURLContent("https://github.com/easystats/bayestestR/raw/master/man/figures/logo.png"))), interpolate = FALSE)
# grid.arrange(insight, bayestestR, ncol = 1)

The aim of easystats is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models.

Installation

The whole easystats suite can be installed at once with the following:

# Install devtools only when it is missing, so that re-running this snippet
# does not reinstall it every time.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
# Install the easystats suite from its GitHub repository.
devtools::install_github("easystats/easystats")

# Attach easystats.
library("easystats")

Features

Find an overview of all postings here.

Dependencies

Most easystats packages are very lightweight, i.e., they neither rely on nor import any other packages! This means that you can safely use them as dependencies in your own packages, without the risk of butterfly effects (a small change in a distant downstream dependency with unexpected upstream consequences).

library(tidyverse)
library(devtools)
library(miniCRAN)
library(igraph)
library(ggnetwork)
library(intergraph)


# Description of the package found in the working directory; its name is added
# to the graph as an extra node and drawn in bold.
pkg <- devtools::as.package(".")

# easystats packages whose dependency graph is drawn
dependencies <- c(
  "insight", "performance", "bayestestR", "parameters",
  "correlation", "report", "modelbased", "effectsize"
)

# Build the dependency graph (Suggests and Enhances excluded) and set its class
# to plain "igraph" before applying the igraph operators below.
dependency_graph <- miniCRAN::makeDepGraph(dependencies, suggests = FALSE, enhances = FALSE)
class(dependency_graph) <- "igraph"

# Add this package as a vertex, plus one edge pairing every listed package
# with it; `simplify()` then cleans up the resulting graph.
dependency_graph <- dependency_graph +
  igraph::vertices(pkg$package) +
  igraph::edges(as.vector(rbind(dependencies, pkg$package)))
dependency_graph <- igraph::simplify(dependency_graph)

# `as_edgelist()` replaces the deprecated `get.edgelist()`.
edge_list <- igraph::as_edgelist(dependency_graph)
# NOTE(review): the edge-reversal step below is disabled; confirm that the
# "Upstream"/"Downstream" legend titles still match the edge direction.
# dependency_graph <- igraph::graph(rbind(edge_list[, 2], edge_list[, 1]))  # This fails at my place

# Lay the graph out and convert it to a data frame that ggplot can draw
dependency_graph_df <- ggnetwork::ggnetwork(
  dependency_graph,
  layout = "fruchtermanreingold",
  arrow.gap = 0.015,
  layout.par = list(niter = 5000)
)

dependency_graph_df$package <- dependency_graph_df$vertex.names
dependency_graph_df$face <- ifelse(dependency_graph_df$package == pkg$package, "bold", "plain")

# Edge names have the form "<from>|<to>"; extract them once and reuse them for
# both counts below.
edge_names <- attr(igraph::E(dependency_graph), "vnames")
node_names <- as.character(dependency_graph_df$package)

# Number of edges in which each node is the "<from>" vertex (0 if it never is)
dependency_graph_df$n_dependencies <- as.vector(table(gsub("\\|.+", "", edge_names))[node_names])
dependency_graph_df$n_dependencies[is.na(dependency_graph_df$n_dependencies)] <- 0

# Number of edges in which each node is the "<to>" vertex (0 if it never is)
dependency_graph_df$importance <- as.vector(table(gsub(".+\\|", "", edge_names))[node_names])
dependency_graph_df$importance[is.na(dependency_graph_df$importance)] <- 0

max_downstream_deps <- max(dependency_graph_df$importance)

# Rescale to [0, 1] and invert, so the node with the highest count gets 0;
# the `scale_size()` labels below undo this transformation for the legend.
dependency_graph_df$importance <- dependency_graph_df$importance / max_downstream_deps
dependency_graph_df$importance <- abs(1 - dependency_graph_df$importance)

# Strip attributes from every column so that a plain data frame is plotted
dependency_graph_df <- as.data.frame(lapply(dependency_graph_df, as.vector))

ggplot(dependency_graph_df, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_nodes(aes(color = n_dependencies), size = 6.5, alpha = 0.4) +
  geom_edges(
    arrow = arrow(length = unit(4, "pt"), type = "closed"),
    color = grey(0.4)
  ) +
  geom_nodelabel_repel(
    aes(label = package, fontface = face, color = n_dependencies),
    box.padding = unit(8, "pt")
  ) +
  geom_nodes(aes(color = n_dependencies, size = 7 * importance)) +
  scale_color_distiller(palette = "Spectral") +
  scale_size(
    labels = function(x) abs(max_downstream_deps - ceiling(x / 7 * max_downstream_deps))
  ) +
  theme_blank(legend.position = "top") +
  guides(
    size = guide_legend(
      title = "Downstream dependencies",
      title.position = "top",
      title.hjust = 0.5,
      label.position = "bottom",
      label.hjust = 0.5
    ),
    color = guide_colorbar(
      title = "Upstream dependencies",
      title.position = "top",
      title.hjust = 0.5,
      barwidth = unit(130, "pt"),
      barheight = unit(4, "pt")
    )
  )

There is one exception. The see package is one of our high-level packages that is responsible for plotting and creating figures, relying thus on other packages such as ggplot2, which itself is plugged in the tidyverse, increasing package dependencies by a substantial amount. On the bright side of things, it gives a good overview of the place of easystats in the R ecosystem.

library(tidyverse)
library(devtools)
library(miniCRAN)
library(igraph)
library(ggnetwork)
library(intergraph)


# Description of the package found in the working directory; its name is added
# to the graph as an extra node and drawn in bold.
pkg <- devtools::as.package(".")
# dependencies <- unlist(strsplit(paste(pkg$imports, "\n", pkg$depends), split = "\n"))[-1]
# dependencies <- trimws(gsub("\\n| \\(.+\\)|,", "", dependencies))

# easystats packages whose dependency graph is drawn, this time including `see`
dependencies <- c(
  "insight", "bayestestR", "performance", "parameters", "see",
  "correlation", "modelbased", "report", "effectsize"
)

# Build the dependency graph (Suggests and Enhances excluded) and set its class
# to plain "igraph" before applying the igraph operators below.
dependency_graph <- miniCRAN::makeDepGraph(dependencies, suggests = FALSE, enhances = FALSE)
class(dependency_graph) <- "igraph"

# Add this package as a vertex, plus one edge pairing every listed package
# with it; `simplify()` then cleans up the resulting graph.
dependency_graph <- dependency_graph +
  igraph::vertices(pkg$package) +
  igraph::edges(as.vector(rbind(dependencies, pkg$package)))
dependency_graph <- igraph::simplify(dependency_graph)

# `as_edgelist()` replaces the deprecated `get.edgelist()`.
edge_list <- igraph::as_edgelist(dependency_graph)
# NOTE(review): the edge-reversal step below is disabled; confirm that the
# "Upstream"/"Downstream" legend titles still match the edge direction.
# dependency_graph <- igraph::graph(rbind(edge_list[, 2], edge_list[, 1]))

# Lay the graph out and convert it to a data frame that ggplot can draw
dependency_graph_df <- ggnetwork::ggnetwork(
  dependency_graph,
  layout = "fruchtermanreingold",
  arrow.gap = 0.015,
  layout.par = list(niter = 5000)
)

dependency_graph_df$package <- dependency_graph_df$vertex.names
dependency_graph_df$face <- ifelse(dependency_graph_df$package == pkg$package, "bold", "plain")

# Edge names have the form "<from>|<to>"; extract them once and reuse them for
# both counts below.
edge_names <- attr(igraph::E(dependency_graph), "vnames")
node_names <- as.character(dependency_graph_df$package)

# Number of edges in which each node is the "<from>" vertex (0 if it never is)
dependency_graph_df$n_dependencies <- as.vector(table(gsub("\\|.+", "", edge_names))[node_names])
dependency_graph_df$n_dependencies[is.na(dependency_graph_df$n_dependencies)] <- 0

# Number of edges in which each node is the "<to>" vertex (0 if it never is)
dependency_graph_df$importance <- as.vector(table(gsub(".+\\|", "", edge_names))[node_names])
dependency_graph_df$importance[is.na(dependency_graph_df$importance)] <- 0

max_downstream_deps <- max(dependency_graph_df$importance)

# Rescale to [0, 1] and invert, so the node with the highest count gets 0;
# the `scale_size()` labels below undo this transformation for the legend.
dependency_graph_df$importance <- dependency_graph_df$importance / max_downstream_deps
dependency_graph_df$importance <- abs(1 - dependency_graph_df$importance)

# Strip attributes from every column so that a plain data frame is plotted
dependency_graph_df <- as.data.frame(lapply(dependency_graph_df, as.vector))

ggplot(dependency_graph_df, aes(x = x, y = y, xend = xend, yend = yend)) +
  geom_nodes(aes(color = n_dependencies), size = 6.5, alpha = 0.4) +
  geom_edges(
    arrow = arrow(length = unit(4, "pt"), type = "closed"),
    color = grey(0.4)
  ) +
  geom_nodelabel_repel(
    aes(label = package, fontface = face, color = n_dependencies),
    box.padding = unit(8, "pt")
  ) +
  geom_nodes(aes(color = n_dependencies, size = 7 * importance)) +
  scale_color_distiller(palette = "Spectral") +
  scale_size(
    labels = function(x) abs(max_downstream_deps - ceiling(x / 7 * max_downstream_deps))
  ) +
  theme_blank(legend.position = "top") +
  guides(
    size = guide_legend(
      title = "Downstream dependencies",
      title.position = "top",
      title.hjust = 0.5,
      label.position = "bottom",
      label.hjust = 0.5
    ),
    color = guide_colorbar(
      title = "Upstream dependencies",
      title.position = "top",
      title.hjust = 0.5,
      barwidth = unit(130, "pt"),
      barheight = unit(4, "pt")
    )
  )

Citation

How to reference easystats?

Cite specific packages

The most parsimonious approach is to cite only the particular package that helped you, e.g., "using bayestestR (Makowski, Ben-Shachar, & Lüdecke, 2019)". However, as easystats is meant to be an ecosystem, with different people working on its different aspects (some being more "citeable" than others), please consider including also the "main" publication: not available yet.

Cite the whole ecosystem

Want to really help us boost our h-index? Or simply credit the whole network of interconnected aspects of easystats? This can be done with a sentence like the following:

Data processing was carried out with R (R Core Team, 2019) and the easystats ecosystem (Lüdecke, Waggoner, & Makowski, 2019; Makowski, Ben-Shachar, & Lüdecke, 2019)

The corresponding bibtex entries are the following:

@article{ludecke2019insight,
    journal = {Journal of Open Source Software},
    doi = {10.21105/joss.01412},
    issn = {2475-9066},
    number = {38},
    publisher = {The Open Journal},
    title = {insight: A Unified Interface to Access Information from Model Objects in R},
    url = {http://dx.doi.org/10.21105/joss.01412},
    volume = {4},
    author = {Lüdecke, Daniel and Waggoner, Philip and Makowski, Dominique},
    pages = {1412},
    date = {2019-06-25},
    year = {2019},
    month = {6},
    day = {25},
}


@article{makowski2019bayestestr,
    title = {{bayestestR}: {Describing} {Effects} and their {Uncertainty}, {Existence} and {Significance} within the {Bayesian} {Framework}},
    volume = {4},
    issn = {2475-9066},
    shorttitle = {{bayestestR}},
    url = {https://joss.theoj.org/papers/10.21105/joss.01541},
    doi = {10.21105/joss.01541},
    number = {40},
    urldate = {2019-08-13},
    journal = {Journal of Open Source Software},
    author = {Makowski, Dominique and Ben-Shachar, Mattan and Lüdecke, Daniel},
    month = aug,
    year = {2019},
    pages = {1541}
}

Versioning

Package version numbers indicate the following: MAJOR.MINOR.PATCH.DEVELOPMENT. As long as packages are in a more or less rapidly developing and changing state, the major version number is typically 0. Once we think we will have a stable base that will likely not change dramatically or soon, the major version number will be set to 1, and increased for following major changes that probably break the current API. When new features are added or (re)moved, we typically increase the minor version number. Minimal changes or bug fixes only are indicated by increasing the patch version number. Current development versions of our packages (i.e. master branches from GitHub) additionally have a development version number. You typically won't find packages on CRAN with a development version number.

Downloads

library(tidyverse)
library(zoo)
library(cranlogs)
library(rstanarm)
library(see)
library(ggrepel)

# Packages data
# Fetch the daily CRAN download counts of one package from `start_date`
# onwards and tag every row with the package name.
fetch_downloads <- function(package_name, start_date) {
  daily <- cranlogs::cran_downloads(package = package_name, from = start_date)
  mutate(daily, Package = package_name)
}

downloads_insight     <- fetch_downloads("insight", "2019-02-26")
downloads_bayestestR  <- fetch_downloads("bayestestR", "2019-04-08")
downloads_performance <- fetch_downloads("performance", "2019-04-22")
downloads_see         <- fetch_downloads("see", "2019-05-22")
downloads_parameters  <- fetch_downloads("parameters", "2019-08-26")
downloads_effectsize  <- fetch_downloads("effectsize", "2019-11-15")
downloads_modelbased  <- fetch_downloads("modelbased", "2020-01-12")

# Combine all data
stacked_downloads <- rbind(
  downloads_insight,
  downloads_bayestestR,
  downloads_performance,
  downloads_see,
  downloads_parameters,
  downloads_effectsize,
  downloads_modelbased
)

# Per-package derived columns:
# - cumulative_count: running total of downloads
# - average_count: rolling 10-day maximum plus 10 (a maximum, despite the name)
# - day_num: days elapsed since the package's first recorded date
# - day, month, quarters, month_day: calendar parts of the date
data <- stacked_downloads %>%
  group_by(Package) %>%
  mutate(
    cumulative_count = cumsum(count),
    average_count = zoo::rollmax(count, 10, fill = 0) + 10,
    day_num = as.numeric(date) - min(as.numeric(date)),
    day = weekdays(date),
    month = months(date),
    quarters = quarters(date),
    month_day = lubridate::mday(date)
  ) %>%
  ungroup() %>%
  # Fix the factor level order of the packages
  mutate(
    Package = fct_relevel(
      Package,
      "insight", "bayestestR", "performance", "see",
      "parameters", "effectsize", "modelbased"
    )
  )


# Colours attached to the two kinds of events (CRAN/JOSS entries vs blog posts)
color_CRAN <- "#607D8B"
color_blog <- "#9C27B0"
events <- data.frame()

# Publications
# Three parallel vectors: element i of each describes the same event.
publication_dates <- as.Date(c(
  "2019-03-05",
  "2019-03-29",
  "2019-04-09",
  "2019-04-23",
  "2019-04-24",
  "2019-05-11",
  "2019-05-24",
  "2019-05-29",
  "2019-06-19",
  "2019-06-25",
  "2019-07-01",
  "2019-08-05",
  "2019-08-26",
  "2019-09-26",
  "2019-11-15",
  "2020-01-12"
))
publication_labels <- c(
  "CRAN - insight (0.1.0)",
  "CRAN - insight (0.2.0)",
  "CRAN - bayestestR (0.1.0)",
  "JOSS - insight (submission)",
  "CRAN - performance (0.1.0)",
  "CRAN - insight (0.3.0)",
  "CRAN - see (0.1.0)",
  "CRAN - bayestestR (0.2.0)",
  "CRAN - see (0.2.0)",
  "JOSS - insight (publication)",
  "CRAN - insight (0.4.0)",
  "CRAN - performance (0.3.0)",
  "CRAN - parameters (0.1.0)",
  "CRAN - parameters (0.2.0)",
  "CRAN - effectsize (0.0.1)",
  "CRAN - modelbased (0.1.0)"
)
publication_packages <- c(
  "insight",
  "insight",
  "bayestestR",
  "insight",
  "performance",
  "insight",
  "see",
  "bayestestR",
  "see",
  "insight",
  "insight",
  "performance",
  "parameters",
  "parameters",
  "effectsize",
  "modelbased"
)
events <- rbind(
  events,
  data.frame(
    date = publication_dates,
    event = publication_labels,
    Package = publication_packages,
    color = color_CRAN,
    stringsAsFactors = FALSE
  )
)

# Blogposts
# Read the blog feed and keep the distinct posts published after 2019-03-29.
blog_feed <- tidyRSS::tidyfeed("https://easystats.github.io/blog/categories/r/index.xml")
blogposts <- blog_feed %>%
  filter(item_date_published > "2019-03-29") %>%
  select(
    date = item_date_published,
    event = item_title
  ) %>%
  mutate(color = color_blog) %>%
  distinct()

# Package of each blog post, written in the reverse of the row order of
# `blogposts`. The list is hard-coded, so it needs exactly one entry per row.
blog_packages <- c(
  "insight", "bayestestR", "bayestestR", "performance", "bayestestR",
  "see", "bayestestR", "bayestestR", "performance", "parameters",
  "parameters", "parameters", "bayestestR"
)
blogposts$Package <- rev(blog_packages)


events <- rbind(events, blogposts)

# Full join
# Attach the events to the download rows sharing the same date and package.
data <- full_join(data, events, by = c("date", "Package"))

# For each event, the largest `average_count` among the rows carrying it
event_heights <- data %>%
  group_by(event) %>%
  summarise(event_y = max(average_count))

# Add that value to the events table and drop rows without a date
events <- events %>%
  full_join(event_heights, by = "event") %>%
  filter(!is.na(date))

# Set packages colours
packages_colours <- c(
  "insight"     = unname(see::material_colors("orange")),
  "bayestestR"  = unname(see::material_colors("pink")),
  "performance" = unname(see::material_colors("green")),
  "see"         = unname(see::material_colors("blue")),
  "parameters"  = unname(see::material_colors("purple")),
  "effectsize"  = unname(see::material_colors("brown")),
  "modelbased"  = unname(see::material_colors("lime"))
)

Trend

# TIME TREND
# Daily downloads per package: faint raw series, a loess smoother and a linear
# trend, with the events drawn as repelled text labels.
data %>%
  group_by(Package) %>%
  # Drop the last row of each package to avoid edge issues. `seq_len()` also
  # handles a one-row group, where `1:(n() - 1)` counts down and keeps the row.
  # NOTE(review): the earlier comment said "2 last days" while the code dropped
  # one row; the code's behaviour is kept -- confirm which was intended.
  slice(seq_len(n() - 1)) %>%
  ungroup() %>%
  ggplot(aes(x = date, color = Package)) +
  geom_line(aes(y = count), size = 0.5, alpha = 0.1) +
  geom_smooth(
    aes(y = count),
    method = 'loess',
    size = 1.75,
    se = FALSE
  ) +
  geom_smooth(
    aes(y = count),
    method = 'lm',
    size = 0.75,
    se = FALSE
  ) +
  # geom_linerange(data=events, aes(x = date, ymin = 0, ymax = event_y), colour = events$color, size = .5, linetype="dashed") +
  # geom_label_repel(data=events, aes(label = event, x = date, y = event_y), fill = events$color, colour = "white", segment.colour = events$color) +
  geom_text_repel(
    data = events,
    aes(label = event, x = date, y = event_y),
    colour = events$color,
    segment.colour = events$color,
    size = 3
  ) +
  see::theme_modern() +
  scale_x_date(
    date_breaks = "1 month",
    labels = scales::date_format("%Y-%m")
  ) +
  scale_color_manual(values = packages_colours) +
  xlab("") +
  ylab("Daily CRAN Downloads\n") +
  # `na.rm = TRUE`: `count` is NA on any row that the full join with `events`
  # added without a matching download record, which would make the limit NA.
  coord_cartesian(ylim = c(0, max(data$count, na.rm = TRUE) + 100), expand = FALSE) +
  scale_y_sqrt()

Cumulative downloads

# Label each package's curve once per calendar month, at the first row of that
# month. Grouping uses year and month together: `month` holds only the month
# name, so the same month of two different years would otherwise share a single
# label. Every other row gets NA, so no global de-duplication is needed (which
# would also blank labels whose values coincide across packages).
data <- data %>%
  group_by(Package, label_month = format(date, "%Y-%m")) %>%
  mutate(download_label = replace(cumulative_count, row_number() != 1, NA)) %>%
  ungroup() %>%
  select(-label_month)

# Cumulative downloads per package, with the monthly labels placed by ggrepel
ggplot(data, aes(x = date, y = cumulative_count, label = download_label, color = Package)) +
  geom_line(size = 0.75) +
  geom_label_repel(show.legend = FALSE) +
  labs(x = "", y = "Cumulative CRAN Downloads", label = NULL) +
  see::theme_modern() +
  scale_x_date(date_breaks = "1 month", labels = scales::date_format("%Y-%m")) +
  scale_color_manual(values = packages_colours) +
  scale_y_sqrt()

Convention of code-style

The following conventions apply to the easystats-ecosystem, to ensure that function and argument names as well as element names for return-values follow a consistent pattern across all packages.

Importing other packages

No full import, only selective import of functions
Use base-R wherever possible (reduce dependencies)
Make sure R-version requirements are not too strict

Helper-functions

Own re-implementation of helper-functions, if it's not too much effort (e.g. I typically use my own functions to check if a string starts / ends with a pattern, or if an object (list, data frame) contains an element with a given name, like tibble::has_name()), to reduce dependencies.

Function names

Lower case, underscore separated if more than one verb
Common prefix for functions that focus on specific "tasks" or workflows (e.g. insight, get_*() to get data, find_*() to find information, or performance, performance_*() to compute measures of model quality, check_*() to check model assumptions...)
Internal functions (that are not exported, like the previously mentioned helper-functions) should always start with a . (e.g., .do_some_internal_stuff()).

Argument names

lower case, underscore separated if more than one verb

Element / Column names (for returned data frames)

1) First letter of the column name is capital, unless (6) applies (example: Parameter) 2) First letter of nouns is capital, unless (6) applies (example: ROPE_Percentage, Prior_Scale) 3) Using underscore rather than camelCase to separate words (example: CI_high) 4) Multiple words: common/main part first and adjective/specifier/variational part after (example: Median_standardized, ROPE_percentage) 5) Abbreviations: all uppercase (example: ESS, MCSE, ROPE) 6) Keep conventions for reserved words (example: p, pd, Rhat) 7) Adjectives / verbs: all lower case, unless (1) applies (example: high or low in CI_high or CI_low)

List of functions

# it would be cool to add the title / description for all functions
library(insight)
library(bayestestR)
library(parameters)
library(performance)
library(correlation)
library(modelbased)
library(effectsize)
library(report)
# `see` is listed below as well; `ls("package:see")` fails unless the package
# is attached, so attach it here instead of relying on an earlier chunk.
library(see)

# Packages whose objects are listed, in order of precedence: when the same
# name appears in several packages, the first package in this vector wins.
easystats_packages <- c(
  "insight", "bayestestR", "parameters", "performance", "effectsize",
  "correlation", "modelbased", "report", "see"
)

# One data frame per package, bound once at the end instead of growing a data
# frame inside a loop. Each row holds a markdown link to the package's
# reference index, the package name and the bare object name.
function_tables <- lapply(easystats_packages, function(package) {
  name <- ls(paste0("package:", package))

  data.frame(
    "Functions" = paste0(
      "[**`",
      name,
      "`**](https://easystats.github.io/",
      package,
      "/reference/index.html)",
      " *(",
      package,
      ")*"
    ),
    "Package" = package,
    "Name" = name,
    stringsAsFactors = FALSE
  )
})
all_funs <- do.call(rbind, function_tables)

# Keep the first occurrence of every name, then print the links as a sorted
# markdown bullet list.
all_funs <- all_funs[!duplicated(all_funs$Name), ]
all_funs <- sort(as.character(all_funs$Functions))
cat(paste0(c("", all_funs), collapse = "\n- "))