## MAIN FUNCTION -------------------------------------------------------------------------
#' Pull repository statistics for specified GitHub repository. Specifically repositories set up to host a Jekyll/GitHub Pages site.
#'
#' @param owner The repository name of the relevant GitHub repo.
#' @param repo The owner of the relevant GitHub repo.
#' @param dateRange A vector of two dates.
#' @param useCache TRUE/FALSE.
#'
#' @return a dataframe of metrics for the date range specified.
#'
#' @export
#' @import lubridate
githubMetrics <- function(owner,
repo,
dateRange,
useCache) {
# get metrics
contributor <- contributorMetrics(owner = owner,
repo = repo,
dateRange = dateRange,
useCache = useCache)
commit <- commitMetrics(owner = owner,
repo = repo,
dateRange = dateRange)
post <- postMetrics(owner = owner,
repo = repo,
dateRange = dateRange)
# merge
githubData <- left_join(contributor, commit, by = "month") %>%
left_join(., post, by = "month") %>%
select(month, numCommits, numPostTotal, numNewPosts, totalContributors, numNewContributors, handles)
return(githubData)
}
## COMMITS -------------------------------------------------------------------------------
#' Pull and calculate commit metric (commits per month over a time frame)
#' @param owner The GitHub repository owner
#' @param repo The GitHub repository name
#' @param dateRange A vector of two dates in yyyy-mm-dd format. Can be strings or date objects. Order doesn't matter, the max and min will be used.
#'
#' @export
commitMetrics <- function(owner,
repo,
dateRange) {
# modify dateRange format
dateRange <- dateRangeToStartEnd(dateRange)
# pull commits over dateRange
commitObj <- getCommits(owner,
repo,
start = dateRange$start,
end = dateRange$end)
# create dataframe aggregating commits by month
commitTbl <- calcCommitNum(commitObj)
return(commitTbl)
}
## POSTS ---------------------------------------------------------------------------------
#' Pull and calculate posts metric (posts per month over a time frame)
#' @param owner The GitHub repository owner
#' @param repo The GitHub repository name
#' @param dateRange A vector of two dates in yyyy-mm-dd format. Can be strings or date objects. Order doesn't matter, the max and min will be used.
#'
#' @export
postMetrics <- function(owner,
repo,
dateRange) {
# get _post directory contents
posts <- getDirContents(owner = owner,
repo = repo,
dir = "_posts",
fullPath = FALSE)
# calculate posts/month using post file name
# Jekyll posts always have yyyy-mm-dd in post title
postTbl <- calcPostNumJekyll(posts,
dateRange)
return(postTbl)
}
## CONTRIBUTORS ---------------------------------------------------------------------------
#' This function calculates the number of new contributors and total contributors by month
#'
#' @param owner The owner of the repository that the blog lives in.
#' @param repo The repository name.
#' @param dateRange A vector of two dates in yyyy-mm-dd format. Can be strings or date objects. Order doesn't matter, the max and min will be used.
#' @param useCache TRUE/FALSE. When TRUE, use cached data. When FALSE, pull all data. Using cached data will make the function run faster.
#'
#' @return
#'
#' @export
contributorMetrics <- function(owner,
repo,
dateRange,
useCache = FALSE) {
dateRangeObj <- dateRangeToStartEnd(dateRange)
# if useCache is TRUE, only pull new contributors, will merge with cached
# if useCache is FALSE, pull all contributor data
contributorData <- getContributorDataJekyll(owner = owner,
repo = repo,
useCache = useCache)
# check cacheExists
cacheExists <- file.exists(here::here("R/sysdata.rda"))
# if useCache is TRUE and cacheExists is FALSE stop script. Recommend createCache
if (useCache == TRUE && cacheExists == FALSE) {
stop("useCache = TRUE but no cache is found. Use createCache() to create store your data locally.")
}
# if useCache is TRUE
# load cache
# merge with the contributor data of new contributors
if (useCache & cacheExists) {
load("R/sysdata.rda")
contributorData <- suppressMessages(full_join(contributorDataCache, contributorData))
}
contributorsByMonthYear <- contributorData %>%
mutate(month = floor_date(commitDate, unit = "month")) %>%
aggregateContributorData() %>%
completeMonths(dateCol = "month") %>%
mutate_at("numNewContributors", ~replace(., is.na(.), 0)) %>%
mutate(totalContributors = cumsum(numNewContributors)) %>%
filter(month >= dateRangeObj$start & month <= dateRangeObj$end)
return(contributorsByMonthYear)
}
## HELPER FUNCTIONS FOR COMMIT METRICS ---------------------------------------------------
#' Lists commits for a specified repository over a date range.
#' @param owner The GitHub repository owner.
#' @param repo The GitHub repository name.
#' @param start Beginning of date range in yyyy-mm-dd format. Can be strings or date objects.
#' @param end End of date range in yyyy-mm-dd format. Can be strings or date objects.
#'
#' @return A list where each element contains information about a single commit including author, date, and commit message.
getCommits <- function(owner,
repo,
start,
end) {
commitObj <- NULL
commitObjList <- list()
i <- 1
while (length(commitObj) == 100 | is.null(commitObj)) {
commitObj <- gh("GET /repos/:owner/:repo/commits",
owner = owner,
repo = repo,
since = start,
until = end,
per_page = 100,
page = i)
commitObjList <- append(commitObjList, commitObj)
i <- i + 1
}
return(commitObjList)
}
#' Calculates the number of commits/month to a given repository over a specified date range.
#'
#' @param commitObjList The result from `getCommits()` or `gh("GET /repos/:owner/:repo/commits, ... )`. A list where each element contains all info relevant to a single commit.
#'
#' @return a tibble that shows the number of commits to a repository by month. Tibble has two columns: `month`, `numCommits`.
calcCommitNum <- function(commitObjList) {
# unnest commitDates from list of lists
# flatten to a vector
# coerce to date
dates <- commitObjList %>%
purrr::map("commit") %>%
purrr::map("committer") %>%
purrr::map("date") %>%
purrr::flatten_chr() %>%
lubridate::as_date()
commitTbl <- tibble(commitDate = dates) %>%
dplyr::mutate(month = lubridate::floor_date(commitDate, unit = "month")) %>%
dplyr::group_by(month) %>%
dplyr::summarise(numCommits = n())
return(commitTbl)
}
## HELPER FUNCTIONS FOR CONTRIBUTOR PULL -------------------------------------------------
#' Pulls contributor paths from a static Jekyll blog hosted on GitHub pages.
#' @param owner The owner of the repository to pull file paths from.
#' @param repo The name of the repository to pull file paths from.
#' @param useCache If TRUE, only returns new, uncached contributors. If FALSE, returns all contributors.
#'
#' @export
getContributorPathsJekyll <- function(owner,
repo,
onlyUncached) {
# pull current contributor filepaths
contributorPaths <- getDirContents(owner = owner,
repo = repo,
dir = "_contributors",
fullPath = TRUE)
# check that cache exists
cacheExist <- file.exists(here::here("./R/sysdata.rda"))
# if cache exists, load and setdiff with knownContributors
# only pull info for new contributors and merge with known.
if (onlyUncached & cacheExist) {
message("loading cached data")
load(here::here("./R/sysdata.rda"))
message(paste0("data last cached on ", cacheDate))
contributorPaths <- setdiff(contributorPaths, contributorDataCache$path)
} else if (!onlyUncached) {
message("Loading all contributors.")
} else if (!cacheExist & onlyUncached) {
message("onlyUncached = TRUE but no cache found. Loading all contributors.")
}
return(contributorPaths)
}
#' Gets oldest commit date and handle for provided contributor paths for a Jekyll/Github Pages repository.
#' @param owner The owner of the repository to pull file paths from.
#' @param repo The name of the repository to pull file paths from.
#' @param useCache If TRUE, only returns new, uncached contributors. If FALSE, returns all contributors.
#'
#' @return A tibble with three columns: `path` contains the relative path to the contributor .md file, `commitDate` contains the earliest commit date for each contributor file, and `handle` has the contributors handle (the filename without .md on the end)
#' @export
getContributorDataJekyll <- function(owner,
repo,
useCache) {
# pull all contributor paths from the repo
paths <- getDirContents(owner = owner,
repo = repo,
dir = "_contributors",
fullPath = TRUE)
# if cache exists and useCache = TRUE, load cache and subset paths so only uncached
# contributors are pulled
cacheExist <- file.exists(here::here("./R/sysdata.rda"))
if (cacheExist & useCache) {
message("loading cached data")
load(here::here("./R/sysdata.rda"))
message(paste0("data last cached on ", cacheDate))
paths <- setdiff(paths, contributorDataCache$path)
}
# if there are paths to pull commit date for, do that!
if (length(paths) > 0) {
contributorDates <- paths %>%
map_dbl(~ filepathToOldestCommitDate(owner = owner,
repo = repo,
filepath = .x)) %>%
as_date()
contributorData <- tibble(path = paths,
commitDate = contributorDates,
handle = pathToContributor(paths))
} else {
contributorData <- NULL
}
return(contributorData)
}
#' Gets oldest commit date and handle for provided contributor paths for a Jekyll/Github Pages repository.
#' @param contributorData takes the output of `getContributorDataJekyll()`. See docs for description of output.
aggregateContributorData <- function(contributorData) {
contributorDataAgg <- contributorData %>%
mutate(month = floor_date(commitDate, unit = "month")) %>%
group_by(month) %>%
summarise(numNewContributors = n(),
handles = paste0(handle, collapse = ", ")) %>%
mutate(totalContributors = cumsum(numNewContributors)) %>%
select(month, totalContributors, numNewContributors, handles)
# fill in missing months
contributorTbl <- completeMonths(contributorDataAgg, "month")
return(contributorTbl)
}
## HELPER FUNCTIONS FOR POSTS ------------------------------------------------------------
#' Calculate the number of posts per month in a Jekyll GitHub pages repository.
#'
#' @param posts a vector of post file names.
#' @param dateRange A vector of two dates in yyyy-mm-dd format. Can be strings or date objects. Order doesn't matter, the max and min will be used.
#'
#' @return a dataframe of number of posts per month
calcPostNumJekyll <- function(posts,
dateRange) {
dateRange <- dateRangeToStartEnd(dateRange)
# get filenames from "_posts" directory
# use dates in filenames to parse by dateRange
postTbl <- as_date(str_sub(posts, start = 1, end = 10)) %>%
tibble(post = posts, date = .) %>%
mutate(month = floor_date(date, unit = "month"))%>%
group_by(month) %>%
summarise(numNewPosts = n()) %>%
mutate_at("numNewPosts", ~replace(., is.na(.), 0)) %>%
mutate(numPostTotal = cumsum(numNewPosts))%>%
filter(month >= dateRange$start & month <= dateRange$end)
return(postTbl)
}
## OTHER HELPER FUNCTIONS ----------------------------------------------------------------
#' Get the names of files in a specified directory in a github repo
#'
#' Uses the `gh` package to pull the file names from a specified directory.
#'
#' @param owner the GitHub repository owner
#' @param repo The GitHub repository name
#' @param dir The directory name or path to directory
#' @param fullPath TRUE returns the full filepath, FALSE returns filename
#'
#' @return a vector of files located in the specified directory.
#'
#' @import gh
getDirContents <- function(owner,
repo,
dir,
fullPath = TRUE) {
fileList <- gh("GET /repos/:owner/:repo/contents/:path",
owner = owner,
repo = repo,
path = dir)
fileNames <- fileList %>% map_chr("name")
#return full path?
if (fullPath) {
fileNames <- file.path(dir, fileNames)
}
return(fileNames)
}
#' Given a filepath from the `_contributors` directory this function will return just the contributor ID.
#'
#' @param contributorPaths A vector of paths from the `_contributors` directory.
#' @return A vector of contributor IDs from the `_contributors` directory.
#'
pathToContributor <- function(contributorPaths) {
id <- gsub("_contributors\\/|.md", "", contributorPaths)
return(id)
}
#' This function pulls the the oldest commit date for the specified filepath. This can be used to determine when a file was first pushed to the repository.
#'
#' @param owner the GitHub repository owner
#' @param repo The GitHub repository name
#' @param filepath Path to the file to return first commit date from
#'
#' @return a date object of the date that the specified file was first commited to the repository.
#'
#' @export
#' @import lubridate
filepathToOldestCommitDate <- function(owner,
repo,
filepath) {
# check that only 1 filepath is provided
if (length(filepath) != 1) {
stop("This function can only take a single filepath")
}
commitObj <- gh("GET /repos/:owner/:repo/commits",
owner = owner,
repo = repo,
path = filepath)
# check that commitObj returned something
if (length(commitObj) == 0) {
stop("path returns no commits")
}
commitNum <- length(commitObj)
# most recent commit is always the first row. Oldest commit is last.
oldestCommitDate <- as_date(commitObj[[max(commitNum)]]$commit$author$date)
return(oldestCommitDate)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.