Nothing
#' @title searchForIndustryRelevantGitHubProjects
#' @description Function searches for industry relevant software projects available from GitHub. The function was used to deliver data set of software projects in an NCBiR project. More details are described in a report: Lech Madeyski, “Training data preparation method,” tech. rep., code quest (research project NCBiR POIR.01.01.01-00-0792/16), 2019, as well as a paper: Tomasz Lewowski and Lech Madeyski, "Creating evolving project data sets in software engineering", 2019.
#' If you use this function or the returned data set then please cite:
#' Tomasz Lewowski and Lech Madeyski, "Creating evolving project data sets in software engineering", 2019
#' @details The function queries the GitHub GraphQL API for Java repositories
#' owned by a fixed list of company/organisation accounts, page by page.
#' From all fetched repositories it keeps those that are not archived, were
#' created before \code{latestCreationDate}, were pushed after
#' \code{earliestPushDate}, and are strictly above the first quartile (computed
#' over all fetched repositories) in number of watchers, stargazers, Java
#' bytes, commits on the default branch, and forks.
#'
#' Side effects: progress information is printed, and three Excel files are
#' written to the current working directory: \code{githubAllProjects.xlsx},
#' \code{githubSelectedProjects.xlsx} and
#' \code{githubSshPaths2SelectedProjects.xlsx}.
#' @author Lech Madeyski and Tomasz Lewowski
#' @export searchForIndustryRelevantGitHubProjects
#' @param myToken A private token used to access GitHub
#' @param earliestPushDate Only repositories which were pushed after this date will be included in the results (i.e., repositories for which the latest push was before this date will not be included in the results). The date is compared as text in the "YYYY-MM-DD" format.
#' @param latestCreationDate Only repositories which were created before this date will be included in the results (i.e., repositories created after this date will not be included in the results). The date is compared as text in the "YYYY-MM-DD" format.
#' @return selected GitHub projects (a data frame with one row per selected repository)
#' @examples
#' # to run this function you need to use your own token as a parameter of the function
#' # use your own token as the first parameter of the function
#' # searchForIndustryRelevantGitHubProjects("...", "2019-03-01", "2018-08-01")
#' @importFrom dplyr %>%
searchForIndustryRelevantGitHubProjects <- function(myToken,
                                                    earliestPushDate,
                                                    latestCreationDate) {
  # Fail early with a readable message instead of a cryptic error deep inside
  # the paging loop.
  stopifnot(
    is.character(myToken), length(myToken) == 1L,
    !is.na(myToken), nzchar(myToken),
    length(earliestPushDate) == 1L, !is.na(earliestPushDate),
    length(latestCreationDate) == 1L, !is.na(latestCreationDate)
  )

  language <- "Java"
  repoCount <- 50 # page size of a single search request
  languageCount <- 1 # only the dominant language of a repository is fetched
  listOfRepos <- NULL

  companies <- c(
    "microsoft", "google", "redhat-developer", "IBM", "Intel", "amzn", "SAP",
    "thoughtworks", "alibaba", "github", "facebook", "Tencent", "Pivotal",
    "epam", "baidu", "mozilla", "oracle", "Unity-Technologies", "uber",
    "yandex", "Shopify", "linkedin", "SUSE", "Esri", "Apple", "salesforce",
    "vmware", "adobe", "andela", "cisco"
  )
  # Who really contributes to open source https://www.infoworld.com/article/3253948/open-source-tools/who-really-contributes-to-open-source.html
  AmazonGitHubAccountsFrom_amzn.github.io <- c(
    "alexa", "aws", "awsdocs", "awslabs", "aws-quickstart", "aws-samples",
    "blox", "boto", "carbonado", "c9", "ajaxorg", "cloud9ide", "goodreads",
    "gluon-api", "IvonaSoftware", "twitchtv", "TwitchScience", "justintv",
    "Zappos", "amazon-archives"
  )
  # Pivotal also keep projects under project name, not company (vide https://pivotal.io/open-source )
  # Pivotal is involved in or supports some Apache projects...
  OtherPivotalGitHubAccounts <- c(
    "pivotalsoftware", "Pivotal-Open-Source-Hub", "kubernetes",
    "spring-projects", "cloudfoundry", "reactor", "openservicebrokerapi",
    "greenplum-db", "rabbitmq", "jasmine", "robolectric", "pivotal-sprout"
  )
  # Apache and Eclipse projects
  Apache <- "apache"
  Eclipse <- "eclipse"
  # unique(): an account listed twice would be queried twice and its
  # repositories would be duplicated in the data set (skewing the quartiles).
  companies <- unique(c(
    companies,
    AmazonGitHubAccountsFrom_amzn.github.io,
    OtherPivotalGitHubAccounts,
    Apache,
    Eclipse
  ))

  for (company in companies) {
    currentPage <- 0
    afterClause <- ""
    hasNextPage <- TRUE
    repoCountTotal <- 0
    searchQuery <-
      GetoptLong::qq("\\\"user:@{company} language:@{language}\\\"")
    print(searchQuery)

    while (hasNextPage) {
      # The request body is JSON, so the multi-line GraphQL text is collapsed
      # onto one line (raw newlines are not allowed inside a JSON string).
      query <- gsub(
        "\n",
        " ",
        GetoptLong::qq(
          '{"query": "query {
viewer {
login
}
rateLimit {
limit
cost
remaining
resetAt
}
search(query:@{searchQuery}, type: REPOSITORY, first: @{repoCount}@{afterClause}){
repositoryCount
pageInfo {
endCursor
hasNextPage
}
nodes {
... on Repository {
id
nameWithOwner
createdAt
updatedAt
pushedAt
diskUsage
forkCount
isArchived
isFork
isLocked
isMirror
isPrivate
sshUrl
licenseInfo {
name
}
defaultBranchRef {
target {
... on Commit {
oid
history {
totalCount
}
}
}
}
stargazers {
totalCount
}
collaborators {
totalCount
}
watchers {
totalCount
}
languages(first: @{languageCount}, orderBy:{direction:DESC, field: SIZE}) {
totalSize
edges {
size
node {
name
}
}
}
}
}
}
}","variables": {}}
'
        )
      )
      result <- httr::POST(
        url = "https://api.github.com/graphql",
        httr::add_headers(
          Authorization = paste0("Bearer ", myToken),
          Accept = "application/vnd.github.machine-man-preview+json"
        ),
        body = query
      )
      # Surface HTTP-level failures (e.g. an invalid token) right away.
      httr::stop_for_status(result, task = "query the GitHub GraphQL API")

      jsonResult <- httr::content(result)
      repos <-
        jsonlite::fromJSON(jsonlite::toJSON(jsonResult), flatten = TRUE)

      # A response may carry an "errors" entry next to usable data, so only a
      # response without any search data is treated as fatal.
      if (is.null(repos$data$search)) {
        apiErrors <- paste(unlist(repos$errors$message), collapse = "; ")
        failure <- paste0(
          "GitHub GraphQL search ", searchQuery, " returned no data"
        )
        if (nzchar(apiErrors)) {
          failure <- paste0(failure, ": ", apiErrors)
        }
        stop(failure, call. = FALSE)
      }

      rows <- repos$data$search$nodes
      # For an empty page this yields an entry without "id"; such entries are
      # removed by drop_na("id") after all pages have been fetched.
      rows["searchQuery"] <- searchQuery
      listOfRepos <- dplyr::bind_rows(listOfRepos, rows)
      repoCountTotal <- repos$data$search$repositoryCount
      # isTRUE() ends the loop cleanly if the flag is missing or malformed.
      hasNextPage <-
        isTRUE(as.logical(unlist(repos$data$search$pageInfo$hasNextPage)))
      print(
        GetoptLong::qq(
          "Page: @{currentPage + 1} ends at cursor: @{repos$data$search$pageInfo$endCursor}. Has next page? @{hasNextPage}"
        )
      )
      print(
        GetoptLong::qq(
          "In total there are: @{repos$data$search$repositoryCount} repositories. This query contained: @{repoCount} starting from @{currentPage * repoCount}"
        )
      )
      print(
        GetoptLong::qq(
          "Query cost: @{repos$data$rateLimit$cost}, remaining: @{repos$data$rateLimit$remaining}/@{repos$data$rateLimit$limit} until: @{repos$data$rateLimit$resetAt} for @{repos$data$viewer$login}"
        )
      )
      # Cursor of the page just read; only used if another page is requested.
      afterClause <-
        GetoptLong::qq(", after: \\\"@{repos$data$search$pageInfo$endCursor}\\\"")
      currentPage <- currentPage + 1
    }
    print(
      GetoptLong::qq(
        "Finished fetching after @{currentPage} pages. GitHub reported @{repoCountTotal} matching repositories for @{company}."
      )
    )
  }

  if (is.null(listOfRepos) || !("id" %in% names(listOfRepos))) {
    stop(
      "GitHub returned no repositories for any of the searched accounts.",
      call. = FALSE
    )
  }

  allGithubProjects <- listOfRepos %>%
    tidyr::drop_na("id") %>%
    tibble::rowid_to_column("rowID") %>%
    dplyr::rename(
      commitSHA = "defaultBranchRef.target.oid",
      sshUrlOfRepository = "sshUrl"
    )

  # split column "languages.edges" (including, e.g., "1104419, Java") into two columns "Java.byte.count" (including, e.g., "1104419") and "Language" (e.g., "Java")
  allGithubProjects <-
    tidyr::separate(
      allGithubProjects,
      col = "languages.edges",
      into = c("Java.byte.count", "Language"),
      convert = TRUE,
      sep = "\\,"
    )
  allGithubProjects$Java.byte.count <-
    readr::parse_number(allGithubProjects$Java.byte.count)
  # Keep only the text between the first pair of double quotes.
  # https://stackoverflow.com/questions/29508943/r-regular-expression-isolate-a-string-between-quotes
  allGithubProjects$Language <-
    sub('[^\"]+\"([^\"]+).*', "\\1", allGithubProjects$Language)

  # TRUE where a value is strictly above the first quartile of its column.
  # unlist() because the columns may be list columns after the JSON round trip.
  aboveFirstQuartile <- function(values) {
    values > stats::quantile(unlist(values), 1 / 4, na.rm = TRUE)
  }

  # `== FALSE` (rather than `!`) is kept on purpose: it also works if
  # isArchived is a list column.
  isSelected <-
    stringr::str_sub(allGithubProjects$createdAt, 1, 10) < latestCreationDate &
    stringr::str_sub(allGithubProjects$pushedAt, 1, 10) > earliestPushDate &
    allGithubProjects$isArchived == FALSE &
    aboveFirstQuartile(allGithubProjects$watchers.totalCount) &
    aboveFirstQuartile(allGithubProjects$stargazers.totalCount) &
    aboveFirstQuartile(allGithubProjects$Java.byte.count) &
    aboveFirstQuartile(
      allGithubProjects$defaultBranchRef.target.history.totalCount
    ) &
    aboveFirstQuartile(allGithubProjects$forkCount)
  # Rows whose criteria cannot be evaluated (NA) are not selected.
  isSelected <- isSelected & !is.na(isSelected)

  selectedGithubProjects <- allGithubProjects[isSelected, , drop = FALSE]
  sshPaths2SelectedProjects <- tibble::rowid_to_column(
    selectedGithubProjects[
      ,
      c("rowID", "sshUrlOfRepository", "commitSHA"),
      drop = FALSE
    ],
    "rowNr"
  )

  openxlsx::write.xlsx(
    as.data.frame(allGithubProjects),
    file = "githubAllProjects.xlsx",
    colNames = TRUE,
    borders = "columns"
  )
  openxlsx::write.xlsx(
    as.data.frame(selectedGithubProjects),
    file = "githubSelectedProjects.xlsx",
    colNames = TRUE,
    borders = "columns"
  )
  openxlsx::write.xlsx(
    as.data.frame(sshPaths2SelectedProjects),
    file = "githubSshPaths2SelectedProjects.xlsx",
    colNames = TRUE,
    borders = "columns"
  )

  selectedGithubProjects
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.