# Chunk defaults: do not echo code, messages or warnings in the output.
knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
options(readr.num_columns = 0, knitr.table.format = "markdown")

# Report parameters supplied through `params`.
org_name <- params$org_name
report_start_date <- params$report_start_date
report_end_date <- params$report_end_date
bot_gh_users <- params$bot_gh_users

# Parsed content of the duplicates YAML file (path resolved with here::here()).
people_w_multiple_gh_users <- yaml::yaml.load_file(
  here::here(params$duplicates_yml_path)
)

# Output folder "results/<org>_<start>_<end>"; created if it does not exist.
result_folder <- here::here(
  "results",
  glue::glue("{org_name}_{report_start_date}_{report_end_date}")
)
dir.create(result_folder, showWarnings = FALSE, recursive = TRUE)

# Read the commit history CSV whose file name is derived from the report end
# date, then drop commits whose author login is one of the bot accounts.
historical_commits <- dplyr::filter(
  readr::read_csv(
    get_ropensci_commit_history_filename(report_end_date),
    show_col_types = FALSE
  ),
  !author_login %in% bot_gh_users
)

Potential duplicates (one person with multiple GitHub logins) are identified based on commit author name and email. Based on `r params$duplicates_yml_path`, the rows where the `will_be_collapsed` column below is `TRUE` will be collapsed.

# Add a normalized author name: lower-cased, with every run of spaces,
# underscores and hyphens removed.
standardized_commits <- mutate(
  historical_commits,
  standardized_author_name = gsub("[ _-]+", "", tolower(commit_author_name))
)

potential_duplicates <- report_potential_duplicates(
  standardized_commits,
  people_w_multiple_gh_users
)

# Render the candidates as a table, leaving out the author_login_2 column.
potential_duplicates %>%
  select(-author_login_2) %>%
  knitr::kable()

Commit data is cleaned: when a GitHub login is not available, commits are identified based on the commit author name or email. For each GitHub login, the most frequently used author name is used.

# Only the duplicate candidates flagged with will_be_collapsed are collapsed.
duplicates_to_collapse <- filter(potential_duplicates, will_be_collapsed == TRUE)

# Cleaning pipeline; commits that still have no author login after the
# identification step are dropped.
cleaned_commits <- standardized_commits %>%
  collapse_duplicates(duplicates_to_collapse) %>%
  identify_unknown_authors() %>%
  dplyr::filter(!is.na(author_login)) %>%
  unify_names_for_login()

# Report how many commits were lost during cleaning and the date span of the
# input data.
# NOTE(review): the chunk option message = FALSE is set for this report, so
# this message is presumably not shown in the rendered output - confirm.
n_historical_commits <- nrow(historical_commits)
n_discarded_commits <- n_historical_commits - nrow(cleaned_commits)
first_commit_day <- as.Date(min(historical_commits$commit_date, na.rm = TRUE))
last_commit_day <- as.Date(max(historical_commits$commit_date, na.rm = TRUE))

message(
  n_discarded_commits, " out of ", n_historical_commits,
  " commits were discarded due to being unidentifiable\nbetween ",
  first_commit_day, " and ", last_commit_day, "."
)

Contributors in the report period

# Split the cleaned commits at the report start date.
# NOTE(review): no upper bound on report_end_date is applied here; presumably
# the commit history file already ends at the report end date - confirm.
commits_in_report_period <- filter(
  cleaned_commits,
  as.Date(commit_date) >= report_start_date
)
past_commits <- filter(
  cleaned_commits,
  as.Date(commit_date) < report_start_date
)

# New contributors: logins with commits in the report period and none before.
new_contributors <- anti_join(
  unique(select(commits_in_report_period, author_login, commit_author_name)),
  unique(select(past_commits, author_login)),
  by = "author_login"
)
readr::write_csv(
  new_contributors,
  file.path(result_folder, "new_contributors.csv")
)

# Distinct login / display-name pairs seen in the report period.
authors <- unique(
  select(
    commits_in_report_period,
    author_login,
    author_name = commit_author_name
  )
)

# Per-login commit and repository counts, sorted by repositories and then
# commits, both descending.
author_summary <- commits_in_report_period %>%
  group_by(author_login) %>%
  summarise(n_commit = n(), n_repo = n_distinct(repo)) %>%
  ungroup() %>%
  inner_join(authors, by = "author_login") %>%
  select(author_login, author_name, n_commit, n_repo) %>%
  arrange(-n_repo, -n_commit)
readr::write_csv(author_summary, file.path(result_folder, "authors.csv"))

Total number of contributors in the report period: `r nrow(author_summary)`. Of these, `r nrow(new_contributors)` contributed for the first time in the report period.

Top contributors

# Table of authors whose commit count times repository count is at least 1000.
knitr::kable(
  dplyr::filter(author_summary, n_commit * n_repo >= 1000)
)

Authors by number of repositories they contributed to

# Number of authors per repository-count bucket; the result is written to
# authors_by_num_repo.csv and rendered as a table.
# The last break is Inf so that the open-ended "50+" bucket has no upper
# limit: with a finite cap, any larger count falls outside every interval and
# cut() returns NA for it.
author_summary %>%
  mutate(n_repo = cut(
    n_repo,
    breaks = c(0:5, 10, 50, Inf),
    labels = c(as.character(1:5), "6-10", "11-50", "50+")
  )) %>%
  group_by(n_repo) %>%
  summarize(n_author = n()) %T>%
  readr::write_csv(file.path(result_folder, "authors_by_num_repo.csv")) %>%
  knitr::kable()

Authors by number of commits they made

# Number of authors per commit-count bucket; the result is written to
# authors_by_num_commit.csv and rendered as a table.
# The last break is Inf so that the open-ended "1000+" bucket has no upper
# limit: with a finite cap, any larger count falls outside every interval and
# cut() returns NA for it.
author_summary %>%
  mutate(n_commit = cut(
    n_commit,
    breaks = c(0, 1, 5, 10, 100, 1000, Inf),
    labels = c("1", "2-5", "6-10", "11-100", "101-1000", "1000+")
  )) %>%
  group_by(n_commit) %>%
  summarize(n_author = n()) %T>%
  readr::write_csv(file.path(result_folder, "authors_by_num_commit.csv")) %>%
  knitr::kable()


czeildi/ghcontributors documentation built on June 13, 2022, 2:34 a.m.