README.md

GithubMetrics

It's now recommended to use this package instead: https://github.com/r-world-devs/GitStats

R-CMD-check Codecov test
coverage HitCount

The aim of this package is to provide a wrapper on gh to quickly get you the key Github repo information you need. The code here is used within Roche to quickly answer simple questions such as the ones worked through below.

Installation

You can install the released version of GithubMetrics from CRAN with:

install.packages("GithubMetrics")

Setup

library(GithubMetrics)
library(tidyverse)
library(glue)

organisation <- "openpharma"

Repos in an org

Pull all the repos present within an org (that I can see).

# Fetch the raw repository listing for the organisation, then tidy it
# into one row per repository.
repos_raw <- gh_repos_get(org = organisation)
repos_clean <- repos_raw %>%
  gh_repos_clean()

glimpse(repos_clean)
#> Rows: 14
#> Columns: 7
#> $ name           <chr> "BBS-causality-training", "GithubMetrics", "facetsr", …
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/Githu…
#> $ size           <int> 27, 118, 2163, 5435, 87, 939, 1817, 79487, 329, 0, 482…
#> $ updated_at     <chr> "2021-01-29T18:01:35Z", "2021-02-03T07:07:43Z", "2020-…
#> $ default_branch <chr> "main", "master", "master", "master", "master", "maste…
#> $ language       <chr> "R", "R", "R", "Unsure", "Python", "R", "C", "R", "R",…
#> $ MB             <dbl> 0.0, 0.1, 2.1, 5.3, 0.1, 0.9, 1.8, 77.6, 0.3, 0.0, 0.5…

Realistically, research code is likely to be on Github Enterprise, so the .api_url and .token parameters can be passed through to gh(). Commented code below shows how you can use an on-premise Github server.

# repos_raw <- gh_repos_get(
#   org = organisation,
#   .api_url = "https://github.roche.com/api/v3",
#   .token = Sys.getenv("GITHUB_PAT_ROCHE")
#   )

Commits

Get every commit for all the repos in this organisation.

# Only query repositories with a non-zero size, looking back ten years
# (expressed in days).
repos_with_content <- repos_clean %>%
  filter(size > 0) %>%
  pull(full_name)

repo_all_commits <- gh_commits_get(
  repos_with_content,
  days_back = 365 * 10
)

glimpse(repo_all_commits)
#> Rows: 1,762
#> Columns: 5
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/BBS-c…
#> $ author         <chr> "heinzmann537", "heinzmann537", "heinzmann537", "epiji…
#> $ datetime       <chr> "2021-01-29T18:00:10Z", "2021-01-29T12:55:54Z", "2021-…
#> $ sha            <chr> "5ac98df2a99db3b50abae114e37c00e433903094", "059569252…
#> $ commit_message <chr> "Update variable naming ADALM", "Small change", "First…

People

Pull all the people that have committed in the organisation.

# Tally commits per author, leaving out two author labels that should not
# be listed as contributors.
excluded_authors <- c(".gitconfig missing email","actions-user")

contributors <- repo_all_commits %>%
  count(author, name = "commits") %>%
  filter(!author %in% excluded_authors)

# Look up the profile of every remaining author and attach it to the tally.
contributor_profiles <- gh_user_get(contributors$author)

contributors <- left_join(
  contributors,
  contributor_profiles,
  by = c("author" = "username")
)

# Render the contributor table, most commits first.
contributors %>%
  arrange(desc(commits)) %>%
  # Turn the last-active date into a "days ago" difference and prefix the
  # author name with their avatar image.
  mutate(
    last_active = Sys.Date() - last_active,
    contributor = glue('<img src="{avatar}" alt="" height="30"> {author}')
  ) %>%
  # Show non-empty blog URLs as a short "link" anchor; keep empty ones blank.
  mutate(
    blog = case_when(
      blog == "" ~ "",
      TRUE ~ as.character(glue('<a href="{blog}">link</a>'))
    )
  ) %>%
  select(contributor, commits, name, last_active, company, location, blog) %>%
  knitr::kable()

| contributor | commits | name | last_active | company | location | blog |
| :--- | ---: | :--- | :--- | :--- | :--- | :--- |
| evanmiller | 936 | Evan Miller | 17 days | NA | Chicago, IL | link |
| SHAESEN2 | 127 | Steven Haesendonckx | 20 days | NA | NA | |
| diego-s | 122 | Diego S | 255 days | NA | NA | |
| bailliem | 109 | Mark Baillie | 0 days | NA | Basel, CH | link |
| epijim | 89 | James Black | 5 days | Roche | Basel, Switzerland | link |
| jaredhobbs | 70 | Jared Hobbs | 89 days | YearEnd, Inc. | Salt Lake City, UT | link |
| kalimu | 42 | Kamil Wais | 9 days | 7N / Roche | Rzeszów | link |
| Jonnie-Bevan | 28 | NA | 63 days | NA | NA | |
| cschaerfe | 21 | Charlotta | 118 days | NA | NA | |
| davidanthoff | 12 | David Anthoff | 1 days | University of California, Berkeley | Berkeley, CA | link |
| jar1karp | 12 | Jari Karppinen | 154 days | NA | NA | link |
| mikmart | 12 | Mikko Marttila | 2 days | NA | NA | link |
| reikoch | 8 | NA | 6 days | NA | NA | |
| afeld | 6 | Aidan Feldman | 0 days | @GSA and personal projects | Brooklyn, NY | link |
| erblast | 6 | Björn Oettinghaus | 22 days | NA | Switzerland | link |
| lionel- | 6 | Lionel Henry | 70 days | @rstudio | NA | |
| bpfoley | 5 | Brian Foley | 94 days | NA | Seattle, Washington | |
| rebecca-albrecht | 4 | NA | 5 days | NA | NA | |
| dazim | 3 | Tim Treis | 23 days | NA | Heidelberg, Germany | |
| heinzmann537 | 3 | NA | 5 days | NA | NA | |
| kentm4 | 3 | Matt Kent | 2 days | Genesis Research | NA | |
| PaulJordan57 | 3 | NA | 19 days | NA | NA | |
| galachad | 2 | Adam Foryś | 20 days | @Roche | Warsaw, Poland | link |
| gerph | 2 | Charles Ferguson | 8 days | NA | NA | |
| hadley | 2 | Hadley Wickham | 0 days | @rstudio | Houston, TX | link |
| kawap | 2 | NA | 289 days | Roche / 7N | NA | |
| kleschenko | 2 | Kostya Leschenko | 5 days | @datarobot | Lviv, Ukraine | |
| kshedden | 2 | Kerby Shedden | 1 days | NA | NA | |
| kurt-vd | 2 | Kurt Van Dijck | 63 days | NA | NA | |
| mrocklin | 2 | Matthew Rocklin | 2 days | @coiled | San Juan Capistrano, CA | link |
| thomas-neitmann | 2 | Thomas Neitmann | 1 days | Roche | Basel, Switzerland | link |
| waddella | 2 | Adrian Waddell | 27 days | NA | NA | link |
| ararslan | 1 | Alex Arslan | 0 days | Beacon Biosignals | Seattle, WA | |
| ginberg | 1 | NA | 14 days | NA | Remote | link |
| ivarref | 1 | Ivar Refsdal | 13 days | NA | Bergen, Norway | |
| jonathon-love | 1 | Jonathon Love | 1 days | NA | NA | link |
| Karissa | 1 | NA | 363 days | NA | NA | |
| thanos-siadimas | 1 | NA | 1 days | NA | NA | |

Files

Pull a specific file using gh_file_get().

# Download the raw DESCRIPTION file from the repository ...
description_text <- gh_file_get(
  repo = "GithubMetrics",
  org = "OpenPharma",
  file = "DESCRIPTION"
)

# ... and parse it into a description object.
desc_formatted <- desc::desc(text = description_text)

# Print a few selected fields as a name/value table.
desc_fields <- desc_formatted$get(c("Package","Title","Version"))
knitr::kable(tibble::enframe(desc_fields))

| name | value |
| :--- | :--- |
| Package | GithubMetrics |
| Title | Quickly get key metrics on Github repositaries |
| Version | 0.1.0 |

Get all of the files present in the last commit of all the repos using gh_repo_files_get().

# List the files in the latest commit of each repository, using the commit
# table gathered earlier.
repo_files <- repo_all_commits %>%
  gh_repo_files_get(repo_commits = ., only_last_commit = TRUE)
#> Pulling files in latest commit from 13 repos

glimpse(repo_files)
#> Rows: 1,311
#> Columns: 6
#> $ repo       <chr> "openpharma/visR-docs", "openpharma/visR-docs", "openpharm…
#> $ file       <chr> "readme.md", "docs", "docs/404.html", "docs/code_of_conduc…
#> $ sha_repo   <chr> "5b35fdbc39b87a154c9426e363c8f5a2c83d66b0", "5b35fdbc39b87…
#> $ sha_commit <chr> "642856728e165746076a17c6522b9264f693f37d", "642856728e165…
#> $ extension  <chr> "md", "docs", "html", "html", "html", "html", "png", "png"…
#> $ lang       <chr> "Markdown", NA, "HTML", "HTML", "HTML", "HTML", NA, NA, NA…

# Languages that count towards the "Python files" column.
python_langs <- c("Python","Jupyter Notebook")

# Per repository: total number of files, plus how many are R or Python.
# `%in%` is used so that files with no detected language (NA) count as FALSE.
file_type_counts <- repo_files %>%
  group_by(repo) %>%
  summarise(
    Files = n(),
    `R files` = sum(lang %in% "R"),
    `Python files` = sum(lang %in% python_langs)
  )

knitr::kable(
  file_type_counts,
  caption = "Types of files in the organisation"
)

| repo | Files | R files | Python files |
| :--- | ---: | ---: | ---: |
| openpharma/BBS-causality-training | 4 | 2 | 0 |
| openpharma/CTP | 100 | 30 | 0 |
| openpharma/facetsr | 63 | 13 | 0 |
| openpharma/GithubMetrics | 43 | 22 | 0 |
| openpharma/openpharma.github.io | 76 | 1 | 0 |
| openpharma/pypharma_nlp | 131 | 0 | 49 |
| openpharma/RDO | 105 | 11 | 0 |
| openpharma/ReadStat | 207 | 0 | 0 |
| openpharma/sas7bdat | 8 | 0 | 2 |
| openpharma/simaerep | 145 | 32 | 0 |
| openpharma/syntrial | 67 | 24 | 0 |
| openpharma/visR | 177 | 81 | 0 |
| openpharma/visR-docs | 185 | 0 | 0 |

Types of files in the organisation

# Search the organisation's code for the term "tidyverse".
search_term <- "tidyverse"
results <- gh_repo_search(code = search_term, organisation = organisation)

glimpse(results)
#> Rows: 12
#> Columns: 7
#> $ full_name <chr> "openpharma/GithubMetrics", "openpharma/GithubMetrics", "op…
#> $ name      <chr> "GithubMetrics", "GithubMetrics", "GithubMetrics", "GithubM…
#> $ file_name <chr> "README.md", "README.Rmd", "DESCRIPTION", "test-gh_repos_XX…
#> $ path      <chr> "README.md", "README.Rmd", "DESCRIPTION", "tests/testthat/t…
#> $ url       <chr> "https://github.com/openpharma/GithubMetrics/blob/fa7764869…
#> $ score     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#> $ lang      <chr> "Markdown", "R", NA, "R", "Markdown", "R", "Markdown", "R",…
# Count how many repositories in an organisation mention a search term.
#
# x:   search term, passed to gh_repo_search() as `code` (a package name in
#      the examples that follow).
# org: organisation to search within.
#
# Returns a one-row summary with columns Organisation, Package and Repos
# (the number of distinct repositories matched), or NULL when the search
# did not return a data frame, so that map_df()/bind_rows() skip that term.
helper_gh_repo_search <- function(x, org = "openpharma") {

  ## Slow it down! Search has a rate limit of 30 calls a minute.
  ## On an on-premise server the search rate limit is higher, so this is
  ## usually not needed there.
  if (interactive()) {
    message("Wait 5 seconds")
  }
  Sys.sleep(5)
  ## End slow down

  results <- gh_repo_search(
    code = x,
    organisation = org
  )

  # A search with no hits does not give back a table of matches (the previous
  # version of this check tested the result with is.na()). Calling is.na() on
  # a data frame yields one value per cell, which `if` rejects as an error on
  # R >= 4.2, so test the type of the result instead and bail out early.
  if (!is.data.frame(results)) {
    return(NULL)
  }

  results %>%
    mutate(Package = x, Organisation = org) %>%
    group_by(Organisation, Package) %>%
    summarise(
      Repos = n_distinct(full_name), .groups = "drop"
    )
}

# Packages to look for ...
packages <- c(
  "tidyverse", "pkgdown", "dplyr", "data.table"
)

# ... and the organisations to look in, searched in this order.
pharma_orgs <- c(
  "PHCAnalytics", "openpharma", "AstraZeneca",
  "Roche", "Genentech", "Novartis"
)

# Run one search per organisation/package pair. The inner map_df() row-binds
# the per-package summaries for one organisation; the outer map_df() stacks
# the organisations, giving the same rows in the same order as writing out
# one pipeline per organisation by hand.
package_use <- pharma_orgs %>%
  map_df(function(org) {
    map_df(packages, helper_gh_repo_search, org = org)
  })
#> pkgdown does not appear in PHCAnalytics.
#> query = 'pkgdown in:file  user:PHCAnalytics'
#> tidyverse does not appear in AstraZeneca.
#> query = 'tidyverse in:file  user:AstraZeneca'
#> pkgdown does not appear in AstraZeneca.
#> query = 'pkgdown in:file  user:AstraZeneca'
#> data.table does not appear in AstraZeneca.
#> query = 'data.table in:file  user:AstraZeneca'


# Reshape to one row per organisation with one column per package.
package_use_wide <- pivot_wider(
  package_use,
  names_from = "Package",
  values_from = "Repos"
)

# Sum the package columns (everything except the first column, which holds
# the organisation), ignoring packages that were not found, then list the
# organisations from highest to lowest total.
package_use_wide %>%
  mutate(Total = rowSums(package_use_wide[, -1], na.rm = TRUE)) %>%
  arrange(desc(Total)) %>%
  knitr::kable(
    caption = "Package use detected within repositaries in Pharma orgs"
  )

| Organisation | tidyverse | dplyr | data.table | pkgdown | Total |
| :--- | ---: | ---: | ---: | ---: | ---: |
| Novartis | 4 | 10 | 12 | 6 | 32 |
| openpharma | 4 | 6 | 2 | 6 | 18 |
| Roche | 3 | 2 | 3 | 3 | 11 |
| Genentech | 3 | 3 | 3 | 2 | 11 |
| PHCAnalytics | 2 | 4 | 4 | NA | 10 |
| AstraZeneca | NA | 1 | NA | NA | 1 |

Package use detected within repositaries in Pharma orgs



openpharma/GithubMetrics documentation built on Dec. 1, 2023, 12:52 a.m.