# Kaiaulu - https://github.com/sailuh/kaiaulu
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

############## Parsers ##############

#' Parse GitHub Issue and Pull Request Comments
#'
#' Parses the Issues, Pull Requests, and Comments endpoints' JSON into a single reply table.
#' See example usage in the download_github_comments.Rmd vignette.
#'
#' @param issues_json_folder_path The path to the downloaded issues JSON. See \code{\link{github_api_project_issue}}.
#' @param pull_requests_json_folder_path The path to the downloaded pull requests JSON. See \code{\link{github_api_project_pull_request}}.
#' @param comments_json_folder_path The path to the downloaded comments JSON. See \code{\link{github_api_project_issue_or_pr_comments}}.
#' @param commit_json_folder_path The path to the downloaded commits JSON (used to map github username to the git log). See \code{\link{github_api_project_commits}}.
#' @return A single reply table which combines the communication from the three JSON sources.
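#' @examples
#' \dontrun{
#' # A minimal usage sketch; the folder paths below are hypothetical and should
#' # point at folders populated by the corresponding download functions.
#' replies <- parse_github_replies(
#'   issues_json_folder_path = "../rawdata/github/kaiaulu/issue/",
#'   pull_requests_json_folder_path = "../rawdata/github/kaiaulu/pull_request/",
#'   comments_json_folder_path = "../rawdata/github/kaiaulu/issue_or_pr_comment/",
#'   commit_json_folder_path = "../rawdata/github/kaiaulu/commit/")
#' }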
#' @export
parse_github_replies <- function(issues_json_folder_path,
                                 pull_requests_json_folder_path,
                                 comments_json_folder_path,
                                 commit_json_folder_path){

  # Tabulate Issues
  all_issue <- lapply(list.files(issues_json_folder_path,
                                 full.names = TRUE),jsonlite::read_json)
  all_issue <- lapply(all_issue,
                      github_parse_project_issue)
  all_issue <- rbindlist(all_issue,fill=TRUE)

  # Tabulate PRs
  all_pr <- lapply(list.files(pull_requests_json_folder_path,
                              full.names = TRUE),jsonlite::read_json)
  all_pr <- lapply(all_pr,
                   github_parse_project_pull_request)
  all_pr <- rbindlist(all_pr,fill=TRUE)

  # Tabulate Comments
  all_issue_or_pr_comments <- lapply(list.files(comments_json_folder_path,
                                                full.names = TRUE),jsonlite::read_json)
  all_issue_or_pr_comments <- lapply(all_issue_or_pr_comments,
                                     github_parse_project_issue_or_pr_comments)
  all_issue_or_pr_comments <- rbindlist(all_issue_or_pr_comments,fill=TRUE)


  all_issue <- all_issue[,.(reply_id=issue_id,
                            in_reply_to_id=NA_character_,
                            reply_datetimetz=created_at,
                            reply_from=issue_user_login,
                            reply_to=NA_character_,
                            reply_cc=NA_character_,
                            reply_subject=issue_number,
                            reply_body=body)]

  # Note: because the GitHub API treats PRs as Issues, pr_number <=> issue_number
  all_pr <- all_pr[,.(reply_id=pr_id,
                      in_reply_to_id=NA_character_,
                      reply_datetimetz=created_at,
                      reply_from=pr_user_login,
                      reply_to=NA_character_,
                      reply_cc=NA_character_,
                      reply_subject=pr_number,
                      reply_body=body)]

  all_issue_or_pr_comments <- all_issue_or_pr_comments[,.(reply_id=comment_id,
                                                          in_reply_to_id=NA_character_,
                                                          reply_datetimetz=created_at,
                                                          reply_from=comment_user_login,
                                                          reply_to=NA_character_,
                                                          reply_cc=NA_character_,
                                                          reply_subject=issue_url,
                                                          reply_body=body)]

  # The comment's reply_subject currently holds its issue_url; the 8th element of
  # the URL split on "/" is the issue (or PR) number, which matches the
  # reply_subject used by the issue and PR tables above.
  issue_or_pr_comments_reply_subject <- stringi::stri_split_regex(all_issue_or_pr_comments$reply_subject,
                                                                  "/")
  all_issue_or_pr_comments$reply_subject <- sapply(issue_or_pr_comments_reply_subject,"[[",8)

  replies <- rbind(all_issue,
                   all_pr,
                   all_issue_or_pr_comments)

  # We can then parse the commit messages, and format them so we have a look-up
  # table of author and committer names, e-mails, and GitHub IDs:

  all_commits <- lapply(list.files(commit_json_folder_path,
                                   full.names = TRUE),jsonlite::read_json)
  all_commits <- lapply(all_commits,
                        github_parse_project_commits)
  all_commits <- rbindlist(all_commits,fill=TRUE)

  all_github_authors <- all_commits[,.(github_login=author_login,
                                       name_email = stringi::stri_c(commit_author_name,
                                                                    " ",
                                                                    commit_author_email))]

  all_github_committers <- all_commits[,.(github_login=committer_login,
                                          name_email = stringi::stri_c(commit_committer_name,
                                                                       " ",
                                                                       commit_committer_email))]

  all_github_developers <- rbind(all_github_authors,all_github_committers)

  # For simplicity here, when the same GitHub id contains
  # multiple e-mails, we choose one. In the future, we will
  # consider including all e-mails.
  all_github_developers <- all_github_developers[,.(name_email=name_email[1]),by="github_login"]

  # Replace `reply_from` with name<space>email when the information is available
  # (i.e. the GitHub id authored or committed at least once in the git log).
  replies <- merge(replies,all_github_developers,
                   all.x=TRUE,
                   by.x="reply_from",
                   by.y="github_login")

  replies[!is.na(name_email), reply_from := name_email]
  replies[,name_email:=NULL]

  return(replies)
}

############## Downloader ##############

#' Download Project Issue Events
#'
#' Download issue events from the "GET /repos/{owner}/{repo}/issues/events" endpoint.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @references For details, see \url{https://docs.github.com/en/free-pro-team@latest/rest/reference/issues#events}
#' and \url{https://docs.github.com/en/free-pro-team@latest/developers/webhooks-and-events/issue-event-types}.
#' @export
github_api_project_issue_events <- function(owner,repo,token){
  gh::gh("GET /repos/{owner}/{repo}/issues/events",
         owner=owner,
         repo=repo,
         type="IssuesEvent",
         page=1,
         per_page=100,
         .token=token)
}


#' Parse Issue Events JSON to Table
#'
#' Note not all columns available in the downloaded json are parsed.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
github_parse_project_issue_events <- function(api_responses){
  parse_response <- function(api_response){
    parsed_response <- list()
    parsed_response[["id"]] <- api_response[["id"]]
    parsed_response[["created_at"]] <- api_response[["created_at"]]
    parsed_response[["commit_id"]] <- ifelse(length(api_response[["commit_id"]]) == 0,
                                             NA,
                                             api_response[["commit_id"]])
    parsed_response[["event"]] <- api_response[["event"]]
    parsed_response[["actor_login"]] <- api_response[["actor"]][["login"]]
    parsed_response[["actor_id"]] <- api_response[["actor"]][["id"]]
    parsed_response[["actor_type"]] <- api_response[["actor"]][["type"]]
    parsed_response[["issue_number"]] <- api_response[["issue"]][["number"]]
    parsed_response[["issue_title"]] <- api_response[["issue"]][["title"]]
    parsed_response[["issue_user_login"]] <- api_response[["issue"]][["user"]][["login"]]
    parsed_response[["issue_user_id"]] <- api_response[["issue"]][["user"]][["id"]]
    parsed_response[["issue_user_site_admin"]] <- api_response[["issue"]][["user"]][["site_admin"]]
    parsed_response[["issue_state"]] <- api_response[["issue"]][["state"]]
    parsed_response[["issue_author_association"]] <- api_response[["issue"]][["author_association"]]
    parsed_response[["issue_body"]] <- api_response[["issue"]][["body"]]
    parsed_response[["issue_assignee_login"]] <- api_response[["issue"]][["assignee"]][["login"]]
    parsed_response[["issue_assignee_id"]] <- api_response[["issue"]][["assignee"]][["id"]]
    parsed_response[["issue_body"]] <- api_response[["issue"]][["body"]]

    assignees_list <- api_response[["issue"]][["assignees"]]
    assignees_list <- lapply(assignees_list,function(x){
      data.table(data.frame(issue_assignees_login=x[["login"]],issue_assignees_id=x[["id"]]))
    })
    assignees_list <- rbindlist(assignees_list,fill=TRUE)
    parsed_response[["issue_assignees_login"]] <- stringi::stri_c(assignees_list$issue_assignees_login,collapse = ";")
    parsed_response[["issue_assignees_id"]] <- stringi::stri_c(assignees_list$issue_assignees_id,collapse = ";")

    parsed_response <- as.data.table(parsed_response)

    return(parsed_response)
  }
  rbindlist(lapply(api_responses,parse_response),fill=TRUE)
}

#' Download Project Issues
#'
#' Download Issues from the "GET /repos/{owner}/{repo}/issues" endpoint.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
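#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder for your own GitHub API token.
#' gh_response <- github_api_project_issue("sailuh", "kaiaulu", token)
#' issues <- github_parse_project_issue(gh_response)
#' }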
#' @export
#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}.
github_api_project_issue <- function(owner,repo,token){
  gh::gh("GET /repos/{owner}/{repo}/issues",
         owner=owner,
         repo=repo,
         state="all",
         page=1,
         per_page=100,
         .token=token)
}

#' Parse Issues JSON to Table
#'
#' Note not all columns available in the downloaded json are parsed.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
github_parse_project_issue <- function(api_responses){
  parse_response <- function(api_response){
    parsed_response <- list()
    parsed_response[["issue_id"]] <- api_response[["id"]]
    parsed_response[["issue_number"]] <- api_response[["number"]]
    parsed_response[["html_url"]] <- api_response[["html_url"]]
    parsed_response[["url"]] <- api_response[["url"]]
    parsed_response[["created_at"]] <- api_response[["created_at"]]
    parsed_response[["updated_at"]] <- api_response[["updated_at"]]
    parsed_response[["state"]] <- api_response[["state"]]
    parsed_response[["issue_user_login"]] <- api_response[["user"]][["login"]]
    parsed_response[["author_association"]] <- api_response[["author_association"]]
    parsed_response[["title"]] <- api_response[["title"]]
    parsed_response[["body"]] <- api_response[["body"]]

    parsed_response[["labels"]] <- api_response[["labels"]]
    if(length(parsed_response[["labels"]]) > 0){
      parsed_response[["labels"]] <- stringi::stri_c(sapply(parsed_response[["labels"]],"[[","name"),collapse = ",")
    }else{
      parsed_response[["labels"]] <- NA_character_
    }

    parsed_response <- as.data.table(parsed_response)

    return(parsed_response)
  }
  rbindlist(lapply(api_responses,parse_response),fill=TRUE)
}

#' Download Project Pull Requests
#'
#' Download Pull Requests from the "GET /repos/{owner}/{repo}/pulls" endpoint.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @export
#' @references For details, see \url{https://docs.github.com/en/rest/reference/pulls#list-pull-requests}.
github_api_project_pull_request <- function(owner,repo,token){
  gh::gh("GET /repos/{owner}/{repo}/pulls",
         owner=owner,
         repo=repo,
         state="all",
         page=1,
         per_page=100,
         .token=token)
}

#' Parse Pull Requests JSON to Table
#'
#' Note not all columns available in the downloaded json are parsed.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
github_parse_project_pull_request <- function(api_responses){
  parse_response <- function(api_response){
    parsed_response <- list()
    parsed_response[["pr_id"]] <- api_response[["id"]]
    parsed_response[["pr_number"]] <- api_response[["number"]]
    parsed_response[["html_url"]] <- api_response[["html_url"]]
    parsed_response[["url"]] <- api_response[["url"]]
    parsed_response[["created_at"]] <- api_response[["created_at"]]
    parsed_response[["updated_at"]] <- api_response[["updated_at"]]
    parsed_response[["state"]] <- api_response[["state"]]
    parsed_response[["pr_user_login"]] <- api_response[["user"]][["login"]]
    parsed_response[["author_association"]] <- api_response[["author_association"]]
    parsed_response[["title"]] <- api_response[["title"]]
    parsed_response[["body"]] <- api_response[["body"]]

    parsed_response[["labels"]] <- api_response[["labels"]]
    if(length(parsed_response[["labels"]]) > 0){
      parsed_response[["labels"]] <- stringi::stri_c(sapply(parsed_response[["labels"]],"[[","name"),collapse = ",")
    }else{
      parsed_response[["labels"]] <- NA_character_
    }

    parsed_response <- as.data.table(parsed_response)

    return(parsed_response)
  }
  rbindlist(lapply(api_responses,parse_response),fill=TRUE)
}

#' Download Project Issue or Pull Request Comments
#'
#' Download issue or pull request comments from the "GET /repos/{owner}/{repo}/issues/comments" endpoint.
#' The optional parameter `since` is used to download comments updated after the specified date.
#' If `since` is NULL, it is not passed to the API call and all comments are downloaded.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param since Optional parameter to specify pulling only comments updated after this date
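#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder. Download the first page of
#' # comments updated on or after the given date.
#' gh_response <- github_api_project_issue_or_pr_comments("sailuh", "kaiaulu",
#'                                                        token,
#'                                                        since = "2023-01-01T00:00:00Z")
#' }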
#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-issue-comments-for-a-repository} and
#' \url{https://docs.github.com/en/rest/guides/working-with-comments#pull-request-comments}.
#' @export
github_api_project_issue_or_pr_comments <- function(owner,repo,token,since=NULL){
  if (!is.null(since)){
    gh::gh("GET /repos/{owner}/{repo}/issues/comments",
           owner=owner,
           repo=repo,
           page=1,
           per_page=100,
           .token=token,
           since=since)
  } else {
    gh::gh("GET /repos/{owner}/{repo}/issues/comments",
           owner=owner,
           repo=repo,
           page=1,
           per_page=100,
           .token=token)
  }
}


#' Parse Issues' or Pull Requests' Comments JSON to Table
#'
#' Note not all columns available in the downloaded json are parsed.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
github_parse_project_issue_or_pr_comments <- function(api_responses){
  parse_response <- function(api_response){
    parsed_response <- list()
    parsed_response[["comment_id"]] <- api_response[["id"]]
    parsed_response[["html_url"]] <- api_response[["html_url"]]
    parsed_response[["issue_url"]] <- api_response[["issue_url"]]
    parsed_response[["created_at"]] <- api_response[["created_at"]]
    parsed_response[["updated_at"]] <- api_response[["updated_at"]]
    parsed_response[["comment_user_login"]] <- api_response[["user"]][["login"]]
    parsed_response[["author_association"]] <- api_response[["author_association"]]
    parsed_response[["body"]] <- api_response[["body"]]

    parsed_response <- as.data.table(parsed_response)

    return(parsed_response)
  }
  rbindlist(lapply(api_responses,parse_response),fill=TRUE)
}

#' Download Project Commits
#'
#' Download commits from the "GET /repos/{owner}/{repo}/commits" endpoint.
#' Unlike parsing commits from a git clone of the repository, this JSON provides
#' the GitHub user id, which allows linking file changes and issue events by the
#' same author without relying on identity matching heuristics.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
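#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder.
#' gh_response <- github_api_project_commits("sailuh", "kaiaulu", token)
#' commits <- github_parse_project_commits(gh_response)
#' }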
#' @references For details, see \url{https://docs.github.com/en/rest/reference/repos#commits}.
#' @export
github_api_project_commits <- function(owner,repo,token){
  gh::gh("GET /repos/{owner}/{repo}/commits",
         owner=owner,
         repo=repo,
         page=1,
         per_page=100,
         .token=token)
}

#' Parse Commits JSON to Table
#'
#' Note not all columns available in the downloaded json are parsed.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
github_parse_project_commits <- function(api_responses){
  parse_response <- function(api_response){
    parsed_response <- list()
    parsed_response[["author_login"]] <- api_response[["author"]][["login"]]
    parsed_response[["commit_author_name"]] <- api_response[["commit"]][["author"]][["name"]]
    parsed_response[["commit_author_email"]] <- api_response[["commit"]][["author"]][["email"]]
    parsed_response[["committer_login"]] <- api_response[["committer"]][["login"]]
    parsed_response[["commit_committer_name"]] <- api_response[["commit"]][["committer"]][["name"]]
    parsed_response[["commit_committer_email"]] <- api_response[["commit"]][["committer"]][["email"]]
    parsed_response[["commit_message"]] <- api_response[["commit"]][["message"]]

    parsed_response <- as.data.table(parsed_response)

    return(parsed_response)
  }
  rbindlist(lapply(api_responses,parse_response),fill=TRUE)
}


#' Download Project Contributors
#'
#' Download project contributors from the "GET /repos/{owner}/{repo}/contributors" endpoint.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @references For more details see \url{https://docs.github.com/en/free-pro-team@latest/rest/reference/repos#list-repository-contributors}.
#' @export
github_api_project_contributors <- function(owner,repo,token){
  gh::gh("GET /repos/{owner}/{repo}/contributors",
         owner=owner,
         repo=repo,
         page=1,
         per_page=100,
         .token=token)
}

#' Returns the token's remaining available requests.
#' @param token Your GitHub API token
#' @references For more details see \url{https://docs.github.com/en/free-pro-team@latest/rest/overview/resources-in-the-rest-api#rate-limiting}.
#' @export
github_api_rate_limit <- function(token){
  gh::gh_rate_limit(
    response = NULL,
    .token = token,
    .api_url = NULL,
    .send_headers = NULL
  )
}

#' Obtain the next GitHub response page.
#' @param gh_response A response returned by any GitHub endpoint which is paginated (e.g. \code{\link{github_api_project_commits}}).
#' @export
#' @keywords internal
github_api_page_next <- function(gh_response){
  gh::gh_next(gh_response)
}

#' Obtain the previous GitHub response page.
#' @param gh_response A response returned by any GitHub endpoint which is paginated (e.g. \code{\link{github_api_project_commits}}).
#' @export
#' @keywords internal
github_api_page_prev <- function(gh_response){
  gh::gh_prev(gh_response)
}

#' Obtain the first GitHub response page.
#' @param gh_response A response returned by any GitHub endpoint which is paginated (e.g. \code{\link{github_api_project_commits}}).
#' @export
#' @keywords internal
github_api_page_first <- function(gh_response){
  gh::gh_first(gh_response)
}


#' Obtain the last GitHub response page.
#' @param gh_response A response returned by any GitHub endpoint which is paginated (e.g. \code{\link{github_api_project_commits}}).
#' @export
#' @keywords internal
github_api_page_last <- function(gh_response){
  gh::gh_last(gh_response)
}

#' GitHub Page Iterator
#'
#' GitHub API endpoints return data in pages, each containing by default 100 entries.
#' This iterator downloads each subsequent page, so all the project's data available
#' from the endpoint can be saved (up to a user-defined maximum or the remaining
#' available requests on the user's token). The function distinguishes data downloaded
#' from the search endpoint, in which the issues are nested differently, and commit
#' data, which uses yet another level of nesting. This distinction is needed to extract
#' the minimum and maximum created time of each page, which are used in the file
#' naming convention prefix_(min time)_(max time).json.
#'
#' @param token Your GitHub API token
#' @param gh_response A response returned by any GitHub endpoint which is paginated (e.g. \code{\link{github_api_project_commits}}).
#' @param save_folder_path A folder path to save the downloaded json pages "as-is".
#' @param prefix Prefix to be added to every json file name (e.g. "owner_repo")
#' @param max_pages The maximum number of pages to download. Defaults to the token's remaining available requests
#' @param verbose Boolean value that prints operating messages when set to TRUE and is silent when FALSE.
#' Operating messages may be details about certain parts of the code correctly executing or printing names
#' of files created, etc.
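#' @examples
#' \dontrun{
#' # A minimal sketch; `token` and the save folder are placeholders. Download
#' # the first page, then let the iterator save this and all following pages.
#' gh_response <- github_api_project_commits("sailuh", "kaiaulu", token)
#' github_api_iterate_pages(token, gh_response,
#'                          save_folder_path = "../rawdata/github/kaiaulu/commit/",
#'                          prefix = "sailuh_kaiaulu")
#' }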
#' @references For details see \url{https://docs.github.com/en/free-pro-team@latest/rest/guides/traversing-with-pagination}.
#' @export
#' @keywords internal
github_api_iterate_pages <- function(token,gh_response,save_folder_path,prefix=NA,max_pages=NA,verbose=TRUE){
  page_number <- 1

  data_exists <- TRUE
  # Set the max_pages to your api limit unless specified
  if(is.na(max_pages)){
    max_pages <- github_api_rate_limit(token)$remaining
  }

  # Determine whether the passed data came from the search ("refresh") endpoint:
  # search responses carry a top-level 'total_count' and nest their issues under 'items'.
  json_string <- jsonlite::toJSON(gh_response, pretty = TRUE, auto_unbox = TRUE)
  json_data <- jsonlite::fromJSON(json_string, simplifyVector = TRUE)
  is_issue_refresh <- "total_count" %in% names(json_data)

  # Get the most and least recent 'created_at' date in unixtime for each page
  while(!is.null(gh_response) && page_number <= max_pages){

    # Start the file name with the save folder path; prefix and dates are appended below
    file_name <- save_folder_path
    if(length(gh_response) > 0) {
      # Extract 'created_at' dates; the nesting level differs between refresh and non-refresh data
      if (!is_issue_refresh){
        # Make list of all created_dates
        created_dates <- sapply(gh_response, function(issue) issue$created_at)
        # Remove NULL entries from the list. The result will be empty if this page holds commit data
        created_dates <- Filter(Negate(is.null), created_dates)

        # Check if the list is NULL, signifying this is commit data
        if (length(created_dates)==0){
          created_dates <- sapply(gh_response, function(issue) {
            if (!is.null(issue$commit) && !is.null(issue$commit$author) && !is.null(issue$commit$author$date)) {
              return(issue$commit$author$date)
            } else {
              return(NA) # Return NA if the path does not exist
            }
          })
        }

        # Run this code if it is for issue refresh
      } else {
        # Make list of all created dates
        created_dates <- sapply(gh_response$items, function(issue) issue$created_at)
        # End the loop if there is no usable data
        if (length(created_dates)==0){
          if(verbose){
            message("Nothing left to download")
          }
          break
        }
      }
      # Convert to POSIXct date objects
      date_objects <- as.POSIXct(created_dates, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")

      # Find the greatest and smallest date
      latest_date <- max(date_objects)
      latest_date_unix <- as.numeric(latest_date)
      oldest_date <- min(date_objects)
      oldest_date_unix <- as.numeric(oldest_date)

      # Append the prefix (if given) and the oldest and latest dates to the file name
      if(!is.na(prefix)){
        file_name <- paste0(file_name, prefix, "_")
      }
      file_name <- paste0(file_name, oldest_date_unix, "_", latest_date_unix, ".json")

      # Print the latest and oldest dates and the file name
      if (verbose){
        message("Latest date: ", latest_date_unix)
        message("Oldest date: ", oldest_date_unix)
        message("File name: ", file_name)
        message("Extracted dates for page ", page_number)
      }
    } else {
      data_exists <- FALSE
      if(verbose){
        message("Nothing to download")
      }
    }

    # Save the page to file. Note `owner` and `repo` are not in scope in this
    # function; the owner_repo portion of the naming convention comes from `prefix`.
    if (data_exists == TRUE){
      # Write to file
      jsonlite::write_json(gh_response,file_name,
                           pretty=TRUE,auto_unbox=TRUE)
      if (verbose){
        message("Written to file: ", file_name)
      }
    }
    # increment the page number
    page_number <- page_number + 1
    res <- try(
      {
        gh_response <- github_api_page_next(gh_response)
      },silent=TRUE)
    if(inherits(res,"try-error")) {
      gh_response <- NULL
    }
  }
}


#' Download Project Issues Refresh
#'
#' Uses the file name convention adopted by \code{\link{github_api_iterate_pages}} to identify
#' the latest downloaded GitHub created_at date among the files in the "issue_search" directory.
#' It returns the first page of the GitHub query for issues created after this date by calling
#' \code{\link{github_api_project_issue_search}}.
#'
#' If the issue directory is empty, the created query is not appended to the API call
#' and the first page of a query retrieving all issues is returned. This function can therefore
#' be used on the specified folder to continuously refresh the available issues
#' data.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param save_path_issue_refresh The folder path that the refresh downloader downloads to
#' @param issue_or_pr This specifies whether issues or pull requests are being searched for.
#' Acceptable inputs are "is:issue" or "is:pull-request".
#' @param verbose A boolean value that prints operational messages when set to TRUE.
#' These may include announcing successful execution of code, API queries, files saved, etc.
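#' @examples
#' \dontrun{
#' # A minimal sketch; `token` and the folder path are placeholders.
#' gh_response <- github_api_project_issue_refresh("sailuh", "kaiaulu", token,
#'                                                 save_path_issue_refresh = "../rawdata/github/kaiaulu/issue_search/",
#'                                                 issue_or_pr = "is:issue",
#'                                                 verbose = TRUE)
#' }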
#' @export
#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}.
#' @references For details on timestamps, see \url{https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests#search-by-when-an-issue-or-pull-request-was-created-or-last-updated}
#' @seealso \code{\link{github_api_project_issue}} to download all issue data
#' @seealso \code{\link{format_created_at_from_file}} for the function that iterates through
#' a .json file and returns the greatest 'created_at' value
#' @seealso \code{\link{github_api_iterate_pages}} to write data returned by this function to file as .json
github_api_project_issue_refresh <- function(owner,
                                             repo,
                                             token,
                                             save_path_issue_refresh,
                                             issue_or_pr,
                                             verbose){


  # Check if refresh folder is empty
  contents_refresh <- list.files(path = save_path_issue_refresh)

  # If the folder is empty, download all issues
  if(length(contents_refresh) == 0) {
    if(verbose){
      message("No files exist in directory. Downloading all files")
    }
    query <- NULL
    gh_response <- github_api_project_issue_search(owner, repo, token, query, issue_or_pr, verbose=verbose)
    return(gh_response)
  } else {
    # Get the name of the file with the most recent date in the refresh folder
    latest_created_issue_refresh <- paste0(save_path_issue_refresh, parse_jira_latest_date(save_path_issue_refresh))
    latest_created_issue_refresh <- head(latest_created_issue_refresh,1)
    # Get the greatest created_at value among issues in that file
    created_refresh <- format_created_at_from_file(latest_created_issue_refresh, item_path="items")
    if(verbose){
      message("Greatest created value from issue_search folder: ", created_refresh)
    }

    # construct the query
    query <- paste0("repo:",owner,"/",repo," ", issue_or_pr," created:>",created_refresh)

    if (verbose){
      message("Github API query: ",query)
    }
    # Call the API function
    gh_response <- github_api_project_issue_search(owner, repo, token, query, issue_or_pr, verbose=verbose)
    return(gh_response)
  }
}

#' Download Project Issue or PR Comments After a Certain Date
#'
#' Uses the file name convention adopted by \code{\link{github_api_iterate_pages}} to identify
#' the latest downloaded GitHub created_at date among the files in the comments directory.
#' It uses this date to construct a query and calls \code{\link{github_api_project_issue_or_pr_comments}}.
#'
#' If no files exist in the file_save_path, \code{\link{github_api_project_issue_or_pr_comments}}
#' is called with no additional query and all comments are downloaded.
#'
#' Because the endpoint this function relies on filters by the updated timestamp, running the
#' refresher downloads the most recent version of each changed comment, not every historical copy.
#' However, if the same comment is modified again before the next refresh call, executing the
#' refresher again will result in two rows with the same comment id in the parsed table. This can
#' be addressed by grouping over comment\_id in the generated parsed table and selecting the
#' max(updated_at) row, yielding a table with only the most recent comment version as of the
#' latest time the refresher was executed.
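#'
#' A sketch of that deduplication, assuming a table parsed by
#' \code{\link{github_parse_project_issue_or_pr_comments}} (ISO 8601 timestamp
#' strings sort chronologically, so no date parsing is needed):
#'
#' @examples
#' \dontrun{
#' # Keep only the most recent version of each comment: sort by comment and
#' # descending update time, then keep the first row per comment id.
#' data.table::setorder(comments, comment_id, -updated_at)
#' comments <- unique(comments, by = "comment_id")
#' }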
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param file_save_path The save path for the issue comments folder
#' @param verbose boolean value. When set to true, it prints operational messages including
#' greatest dates and the file name that contains the greatest date.
#' @export
#' @references For details, see \url{https://docs.github.com/en/rest/reference/issues#list-repository-issues}.
#' @seealso \code{\link{github_api_project_issue_or_pr_comments}} to download all comment data
#' @seealso \code{\link{format_created_at_from_file}} for the function that iterates through
#' a .json file and returns the greatest 'created_at' value
#' @seealso \code{\link{github_api_iterate_pages}} to write data returned by this function to file as .json
github_api_project_issue_or_pr_comment_refresh <- function(owner,repo,token,file_save_path,verbose=TRUE){
  # List all files in the comments directory to check whether it is empty
  contents <- list.files(path = file_save_path)

  # If the folder is empty, download all comments
  if(length(contents) == 0) {
    # Run the regular downloader
    comments <- github_api_project_issue_or_pr_comments(owner,repo,token)
    return(comments)
  } else {
    # Get the name of the file with the most recent date
    latest_updated_issue_or_pr_comment <- paste0(file_save_path, parse_jira_latest_date(file_save_path))
    latest_updated_issue_or_pr_comment <- head(latest_updated_issue_or_pr_comment,1)
    # Get the greatest created_at value in that file
    created <- format_created_at_from_file(latest_updated_issue_or_pr_comment, item_path="")

    # Convert the string to a POSIXct object
    time_value <- as.POSIXct(created, format="%Y-%m-%dT%H:%M:%SZ", tz="UTC")

    # Add one second so the already-downloaded comment is not re-fetched
    new_time_value <- time_value + 1

    # Format the new time value back into the original string format
    formatted_new_time_value <- format(new_time_value, "%Y-%m-%dT%H:%M:%SZ")

    if(verbose){
      message("file name with greatest date: ",latest_updated_issue_or_pr_comment)
      message("Latest date: ",formatted_new_time_value)
    }
    # Make the API call
    gh_response <- github_api_project_issue_or_pr_comments(owner,repo,token,formatted_new_time_value)
    return(gh_response)
  } # end if/else
}

#' Retrieve greatest 'created_at' value from file
#'
#' Reads a JSON file along a path and returns the greatest 'created_at'
#' value among its items. Note that where 'created_at' is nested differs
#' by endpoint: it is at the top level of data returned by the issue
#' endpoint, but one level deeper in data returned by the search endpoint.
#' The item_path parameter therefore specifies the level of nesting.
#'
#' @param file_name the path and the file name. For example:
#' ../../rawdata/github/kaiaulu/issue_or_pr_comment/sailuh_kaiaulu_issue_or_pr_comment_1701216000_1701261374.json
#' @param item_path specifies the level of nesting to look for the created_at value. This was
#' implemented because the results of the search endpoint are nested differently than others.
#'
#' @seealso \code{\link{github_api_project_issue_or_pr_comment_refresh}} to refresh comment data
#' @seealso \code{\link{github_api_project_issue_refresh}} to refresh issue data
#' @keywords internal
format_created_at_from_file <- function(file_name,item_path) {
  # Read the JSON file
  json_data <- jsonlite::fromJSON(txt = file_name, simplifyVector = FALSE)

  # Navigate to the correct level in the JSON structure based on the item_path
  data_to_process <- if (item_path != "") {
    json_data[[item_path]]
  } else {
    json_data
  }

  # Initialize a variable to keep track of the greatest date
  greatest_date <- as.POSIXct("1970-01-01T00:00:00Z", format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")

  # Iterate through each element in the data_to_process
  for (item in data_to_process) {
    # Extract 'created_at' date and convert to POSIXct
    current_date <- as.POSIXct(item$created_at, format = "%Y-%m-%dT%H:%M:%SZ", tz = "UTC")

    # Update greatest_date if the current item's date is later
    if (current_date > greatest_date) {
      greatest_date <- current_date
    }
  }

  # Format the greatest date found
  formatted_greatest_date <- format(greatest_date, "%Y-%m-%dT%H:%M:%SZ")

  # Return the latest 'created_at' value
  return(formatted_greatest_date)
}

#' Parse Issues JSON from refresh to Table
#'
#' Note not all columns available in the downloaded json are parsed. This parser
#' is adapted from \code{\link{github_parse_project_issue}} to parse data
#' from the refresh_issue folder. This data is downloaded from the GitHub API
#' search endpoint and has a different level of nesting than the original data.
#'
#' @param api_responses API response obtained from github_api_* function.
#' @export
#' @seealso \code{\link{github_api_project_issue_refresh}} to refresh issue data
github_parse_search_issues_refresh <- function(api_responses) {
  # Helper function to parse each issue
  parse_response <- function(api_response) {
    parsed_response <- list()
    parsed_response[["issue_id"]] <- api_response[["id"]]
    parsed_response[["issue_number"]] <- api_response[["number"]]
    parsed_response[["html_url"]] <- api_response[["html_url"]]
    parsed_response[["url"]] <- api_response[["url"]]
    parsed_response[["created_at"]] <- api_response[["created_at"]]
    parsed_response[["updated_at"]] <- api_response[["updated_at"]]
    parsed_response[["state"]] <- api_response[["state"]]
    parsed_response[["issue_user_login"]] <- api_response[["user"]][["login"]]
    parsed_response[["author_association"]] <- api_response[["author_association"]]
    parsed_response[["title"]] <- api_response[["title"]]
    parsed_response[["body"]] <- api_response[["body"]]

    # Parsing labels
    parsed_response[["labels"]] <- api_response[["labels"]]
    if(length(parsed_response[["labels"]]) > 0) {
      parsed_response[["labels"]] <- stringi::stri_c(sapply(parsed_response[["labels"]], "[[", "name"), collapse = ",")
    } else {
      parsed_response[["labels"]] <- NA_character_
    }

    parsed_response <- as.data.table(parsed_response)
    return(parsed_response)
  }

  # The search endpoint nests the issues under 'items'
  all_issues <- lapply(api_responses[["items"]], parse_response)
  return(rbindlist(all_issues, fill = TRUE))
}


#' Download GitHub Comment Data by Date
#'
#' Appends a 'since' query to the issue/comments API request and returns the first page of the result.
#'
#' Acceptable formats for `since` are:
#'
#' * "YYYY-MM-DD"
#' * "YYYY-MM-DDTHH:MM"
#' * "YYYY-MM-DDTHH:MM:SS"
#' * "YYYY-MM-DDTHH:MM:SSZ"
#' * "YYYY-MM-DDTHH:MM:SS+00:00"
#'
#' For example: `since="2020-07-04"` (a comment occurring at the exact specified time will also be downloaded).
#'
#' For further details on the `since` Query see [the associated Github API documentation](https://docs.github.com/en/rest/issues/comments?apiVersion=2022-11-28#:~:text=asc%2C%20desc-,since,-string).
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param since The lower bound. Comments created and/or updated after this date will be retrieved.
#' @param verbose boolean value. When set to true, it prints operational messages including
#' greatest dates and the file name that contains the greatest date.
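#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder.
#' gh_response <- github_api_project_issue_or_pr_comments_by_date(
#'   "sailuh", "kaiaulu", token, since = "2020-07-04", verbose = TRUE)
#' }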
#' @export
#' @seealso \code{\link{github_api_project_issue_or_pr_comment_refresh}} to refresh comment data
#' @seealso \code{\link{github_api_project_issue_refresh}} to refresh issue data
#' @seealso \code{\link{github_api_project_issue_or_pr_comments}} to call the issue/comments endpoint
github_api_project_issue_or_pr_comments_by_date <- function(owner,
                                                            repo,
                                                            token,
                                                            since,
                                                            verbose = FALSE) {
  if (is.null(since)) {
    stop("The `since` lower bound parameter must be provided.")
  }
  if(verbose){
    message("Downloading comments updated/created after: ", since)
  }
  # Make the API call
  gh_response <- github_api_project_issue_or_pr_comments(owner,repo,token,since)
  return(gh_response)
}

#' Download Github Issue Data by Date
#'
#' Appends a 'created' qualifier to a GitHub search query and returns the first page of the response.
#'
#' Acceptable formats for `date_lower_bound` and `date_upper_bound` are:
#'
#' * "YYYY-MM-DD"
#' * "YYYY-MM-DDTHH:MM"
#' * "YYYY-MM-DDTHH:MM:SS"
#' * "YYYY-MM-DDTHH:MM:SSZ"
#' * "YYYY-MM-DDTHH:MM:SS+00:00"
#' * NULL
#'
#' For example: `date_lower_bound="2020-07-04"` (an issue occurring at the exact specified time will also be downloaded).
#'
#' For further details on the `created` Query see [the associated Github API documentation](https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests#search-by-when-an-issue-or-pull-request-was-created-or-last-updated).
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param date_lower_bound Optional. Specify the lower bound date time (e.g. "2023-11-16T21:00")
#' @param date_upper_bound Optional. Specify the upper bound date time (e.g. "2023-11-16T21:00")
#' @param issue_or_pr This specifies whether issues or pull requests are being searched for.
#' Acceptable inputs are "is:issue" or "is:pull-request".
#' @param verbose boolean value. When set to true, it prints operational messages including
#' greatest dates and the file name that contains the greatest date.
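#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder.
#' gh_response <- github_api_project_issue_by_date("sailuh", "kaiaulu", token,
#'                                                 date_lower_bound = "2023-11-16",
#'                                                 date_upper_bound = "2023-12-01",
#'                                                 issue_or_pr = "is:issue")
#' }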
#' @export
#' @references For details on is:issue or is:pull-request see \url{https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28}
#' @references For details on timestamps, see \url{https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests#search-by-when-an-issue-or-pull-request-was-created-or-last-updated}
#' @seealso \code{\link{github_api_project_issue_or_pr_comment_refresh}} to refresh comment data
#' @seealso \code{\link{github_api_project_issue_or_pr_comments}} to download comment data
github_api_project_issue_by_date <- function(owner,
                                             repo,
                                             token,
                                             date_lower_bound = NULL,
                                             date_upper_bound = NULL,
                                             issue_or_pr,
                                             verbose = FALSE) {
  # Require at least one date bound before constructing the query
  if (is.null(date_lower_bound) && is.null(date_upper_bound)) {
    stop("At least one of 'date_lower_bound' or 'date_upper_bound' must be provided.
         If you have provided at least one, it may be improperly formatted.")
  }

  # Base query to include repository and issue filter
  query <- paste0("repo:",owner, "/", repo, " ", issue_or_pr)
  if(verbose){
    message(query)
  }

  # Add date filters to the query if provided
  if (!is.null(date_lower_bound) && !is.null(date_upper_bound)) {
    query <- sprintf("%s created:%s..%s", query, date_lower_bound, date_upper_bound)
    if(verbose){
      message("Downloading issue data created between ", date_lower_bound, " and ", date_upper_bound, ".")
    }
  } else if (!is.null(date_lower_bound)) {
    query <- sprintf("%s created:>=%s", query, date_lower_bound)
  } else if (!is.null(date_upper_bound)) {
    query <- sprintf("%s created:<=%s", query, date_upper_bound)
  }

  # Perform the API call using the constructed query
  gh_response <- github_api_project_issue_search(owner, repo, token, query, issue_or_pr, verbose=verbose)

  return(gh_response)
}

#' Download Project Issues via Search
#'
#' Download issues from the "GET /search/issues" endpoint.
#' This search endpoint allows for an optional query parameter. Potential queries are found
#' [here](https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28). The query parameter
#' assumes that repo:owner/repo is already prepended to the query. If no query is passed to the function,
#' it will construct one from owner, repo, and issue_or_pr.
#'
#' @param owner GitHub's repository owner (e.g. sailuh)
#' @param repo GitHub's repository name (e.g. kaiaulu)
#' @param token Your GitHub API token
#' @param query Optional query to append to search api
#' @param issue_or_pr This specifies whether issues or pull requests are being searched for.
#' Acceptable inputs are "is:issue" or "is:pull-request".
#' @param verbose Prints operational messages when set to TRUE, such as stating the search query.
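#' @examples
#' \dontrun{
#' # A minimal sketch; `token` is a placeholder. Search for issues created
#' # after a given date.
#' query <- "repo:sailuh/kaiaulu is:issue created:>2023-01-01"
#' gh_response <- github_api_project_issue_search("sailuh", "kaiaulu", token,
#'                                                query, issue_or_pr = "is:issue")
#' }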
#' @references For details, see \url{https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28}.
#' @references For details on timestamps, see \url{https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests#search-by-when-an-issue-or-pull-request-was-created-or-last-updated}
#' @export
github_api_project_issue_search <- function(owner, repo, token, query = NULL, issue_or_pr, verbose=TRUE) {
  # Use the supplied query, or construct one from owner, repo, and issue_or_pr
  if (!is.null(query)){
    search_query <- query
  } else {
    search_query <- paste0("repo:",owner,"/",repo," ", issue_or_pr)
  }

  if(verbose){
    message("Search query: ", search_query)
  }

  # Perform the GitHub API call
  gh_response <- gh::gh("/search/issues",
                        q = search_query,
                        state = 'all',
                        page = 1,
                        per_page = 100,
                        .token = token)
  return(gh_response)
}