# Chunk defaults for the whole report: do not echo code, and keep messages
# and warnings out of the rendered output.
knitr::opts_chunk$set(
  echo = FALSE,
  message = FALSE,
  warning = FALSE
)

# Session-wide options: readr.num_columns = 0 (presumably to silence readr's
# column-specification printout) and markdown as the knitr table format.
options(
  readr.num_columns = 0,
  knitr.table.format = "markdown"
)
# Copy the report parameters into plain variables used by the rest of the
# document.
org_name <- params$org_name
report_start_date <- params$report_start_date
report_end_date <- params$report_end_date
bot_gh_users <- params$bot_gh_users

# Known people with several GitHub logins, read from the YAML file whose
# path (resolved through here::here()) is given in the report parameters.
people_w_multiple_gh_users <- yaml::yaml.load_file(
  here::here(params$duplicates_yml_path)
)
`r org_name`.
`r report_start_date` to `r report_end_date`.
`r paste(bot_gh_users, collapse = ", ")`
# Output folder for every CSV written by this report:
# results/<org_name>_<report_start_date>_<report_end_date> under the
# project root resolved by here::here().
# The variable must be named `result_folder` (no leading dot): dir.create()
# below and the later write_csv() calls all refer to that name.
result_folder <- here::here(
  "results",
  glue::glue("{org_name}_{report_start_date}_{report_end_date}")
)

# Create the folder (and any missing parents); stay quiet if it already exists.
dir.create(result_folder, recursive = TRUE, showWarnings = FALSE)
# Read the commit history CSV whose file name is derived from the report end
# date, then drop every commit whose author login is one of the bot accounts.
historical_commits <- readr::read_csv(
  get_ropensci_commit_history_filename(report_end_date),
  show_col_types = FALSE
) %>%
  dplyr::filter(!(author_login %in% bot_gh_users))
Potential duplicates (one person with multiple GitHub logins) are identified based on commit author name and email.
Based on `r params$duplicates_yml_path`, those will be collapsed where the `will_be_collapsed` column below is `TRUE`.
# Add a normalised author name: lower-cased, with every run of spaces,
# underscores and hyphens removed.
standardized_commits <- mutate(
  historical_commits,
  standardized_author_name = gsub("[ _-]+", "", tolower(commit_author_name))
)

# Build the table of candidate duplicate identities from the commits and the
# known multi-login people.
potential_duplicates <- report_potential_duplicates(
  standardized_commits,
  people_w_multiple_gh_users
)

# Show the candidates in the report, leaving out the author_login_2 column.
potential_duplicates %>%
  select(-author_login_2) %>%
  knitr::kable()
Commit data is cleaned: when GitHub login is not available, commits are identified based on commit author name or email. For each GitHub login, the most frequently used author name is used.
# Only the candidate duplicates flagged with will_be_collapsed are merged.
duplicates_to_collapse <- filter(
  potential_duplicates,
  will_be_collapsed == TRUE
)

# Cleaning pipeline: collapse the flagged duplicates, try to identify authors
# without a login, drop commits that still have no login, then unify the
# author name per login.
cleaned_commits <- standardized_commits %>%
  collapse_duplicates(duplicates_to_collapse) %>%
  identify_unknown_authors() %>%
  dplyr::filter(!is.na(author_login)) %>%
  unify_names_for_login()

# Report how many rows were lost between the raw and the cleaned commit
# tables, together with the date range covered by the raw data.
message(
  nrow(historical_commits) - nrow(cleaned_commits),
  " out of ",
  nrow(historical_commits),
  " commits were discarded due to being unidentifiable\nbetween ",
  as.Date(min(historical_commits$commit_date, na.rm = TRUE)),
  " and ",
  as.Date(max(historical_commits$commit_date, na.rm = TRUE)),
  "."
)
# Split the cleaned commits at the report start date.
commits_in_report_period <- filter(
  cleaned_commits,
  as.Date(commit_date) >= report_start_date
)
past_commits <- filter(
  cleaned_commits,
  as.Date(commit_date) < report_start_date
)

# Distinct (login, name) pairs seen in the report period, and distinct logins
# seen before it.
current_authors <- unique(
  select(commits_in_report_period, author_login, commit_author_name)
)
previous_logins <- unique(select(past_commits, author_login))

# New contributors are the logins active in the report period that never
# appear in the earlier commits.
new_contributors <- anti_join(
  current_authors,
  previous_logins,
  by = "author_login"
)
readr::write_csv(
  new_contributors,
  file.path(result_folder, "new_contributors.csv")
)
# Distinct (login, display name) pairs for the report period.
authors <- unique(
  select(
    commits_in_report_period,
    author_login,
    author_name = commit_author_name
  )
)

# Per-login totals: number of commits and number of distinct repositories,
# joined with the display name and ordered by repositories, then commits,
# both descending.
author_summary <- commits_in_report_period %>%
  group_by(author_login) %>%
  summarise(
    n_commit = n(),
    n_repo = n_distinct(repo)
  ) %>%
  ungroup() %>%
  inner_join(authors, by = "author_login") %>%
  select(author_login, author_name, n_commit, n_repo) %>%
  arrange(desc(n_repo), desc(n_commit))

# Persist the summary next to the other report outputs.
readr::write_csv(author_summary, file.path(result_folder, "authors.csv"))
Total number of contributors in the report period: `r nrow(author_summary)`.
Of these, `r nrow(new_contributors)` contributed for the first time in the report period.
# Table of the authors whose commit count multiplied by repository count is
# at least 1000.
knitr::kable(
  dplyr::filter(author_summary, n_commit * n_repo >= 1000)
)
# Distribution of authors by the number of repositories they committed to.
# Bins are right-closed: 1, 2, 3, 4, 5, 6-10, 11-50 and an open-ended top bin.
# The top break is Inf (it used to be 1000) so that an author with more than
# 1000 repositories lands in "50+" instead of becoming NA. Labels are
# unchanged, so the CSV stays compatible with earlier runs.
author_summary %>%
  mutate(
    n_repo = cut(
      n_repo,
      breaks = c(0:5, 10, 50, Inf),
      labels = c(as.character(1:5), "6-10", "11-50", "50+")
    )
  ) %>%
  group_by(n_repo) %>%
  summarize(n_author = n()) %T>%
  # Tee pipe: write the CSV as a side effect and pass the table on to kable().
  readr::write_csv(file.path(result_folder, "authors_by_num_repo.csv")) %>%
  knitr::kable()
# Distribution of authors by their number of commits in the report period.
# Bins are right-closed: 1, 2-5, 6-10, 11-100, 101-1000 and an open-ended top
# bin. The top break is Inf (it used to be 10000) so that an author with more
# than 10000 commits lands in "1000+" instead of becoming NA. Labels are
# unchanged, so the CSV stays compatible with earlier runs.
author_summary %>%
  mutate(
    n_commit = cut(
      n_commit,
      breaks = c(0, 1, 5, 10, 100, 1000, Inf),
      labels = c("1", "2-5", "6-10", "11-100", "101-1000", "1000+")
    )
  ) %>%
  group_by(n_commit) %>%
  summarize(n_author = n()) %T>%
  # Tee pipe: write the CSV as a side effect and pass the table on to kable().
  readr::write_csv(file.path(result_folder, "authors_by_num_commit.csv")) %>%
  knitr::kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.