R/git_utils.R

Defines functions .checkout_branch_and_commit .insert_username_and_password_to_repo_url .insert_token_to_repo_url .run_clone_command .clone_command_for_codecommit_https .clone_command_for_codecommit .clone_command_for_github_like_https_2fa_enabled .clone_command_for_github_like_https_2fa_disabled .clone_command_for_ssh .clone_command_for_github_like .generate_and_run_clone_command .valid_git_config git_clone_repo

Documented in git_clone_repo

# NOTE: This code has been modified from AWS Sagemaker Python:
# https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/git_utils.py

#' @include utils.R
#' @include r_utils.R
#' @include error.R

#' @importFrom processx run
#' @importFrom utils URLencode
#' @importFrom fs is_file

#' @title Clone Sagemaker repositories by calling git
#' @description Git clone repo containing the training code and serving code. This method
#'              also validate ``git_config``, and set ``entry_point``, ``source_dir`` and
#'              ``dependencies`` to the right file or directory in the repo cloned.
#' @param git_config (dict[str, str]): Git configurations used for cloning files,
#'              including ``repo``, ``branch``, ``commit``, ``2FA_enabled``,
#'              ``username``, ``password`` and ``token``. The ``repo`` field is
#'              required. All other fields are optional. ``repo`` specifies the Git
#'              repository where your training script is stored. If you don't
#'              provide ``branch``, the default value 'master' is used. If you don't
#'              provide ``commit``, the latest commit in the specified branch is
#'              used. ``2FA_enabled``, ``username``, ``password`` and ``token`` are
#'              for authentication purpose. If ``2FA_enabled`` is not provided, we
#'              consider 2FA as disabled.
#'              For GitHub and GitHub-like repos, when SSH URLs are provided, it
#'              doesn't matter whether 2FA is enabled or disabled; you should either
#'              have no passphrase for the SSH key pairs, or have the ssh-agent
#'              configured so that you will not be prompted for SSH passphrase when
#'              you do 'git clone' command with SSH URLs. When https URLs are
#'              provided: if 2FA is disabled, then either token or username+password
#'              will be used for authentication if provided (token prioritized); if
#'              2FA is enabled, only token will be used for authentication if
#'              provided. If required authentication info is not provided, python
#'              SDK will try to use local credentials storage to authenticate. If
#'              that fails either, an error message will be thrown.
#'              For CodeCommit repos, 2FA is not supported, so '2FA_enabled' should
#'              not be provided. There is no token in CodeCommit, so 'token' should
#'              not be provided too. When 'repo' is an SSH URL, the requirements are
#'              the same as GitHub-like repos. When 'repo' is an https URL,
#'              username+password will be used for authentication if they are
#'              provided; otherwise, python SDK will try to use either CodeCommit
#'              credential helper or local credential storage for authentication.
#' @param entry_point (str): A relative location to the Python source file which
#'              should be executed as the entry point to training or model hosting
#'              in the Git repo.
#' @param source_dir (str): A relative location to a directory with other training
#'              or model hosting source code dependencies aside from the entry point
#'              file in the Git repo (default: None). Structure within this
#'              directory are preserved when training on Amazon SageMaker.
#' @param dependencies (list[str]): A list of relative locations to directories
#'              with any additional libraries that will be exported to the container
#'              in the Git repo (default: []).
#' @return dict: A dict that contains the updated values of entry_point, source_dir
#'              and dependencies.
#' @family git_config
#' @export
git_clone_repo <- function(git_config,
                           entry_point,
                           source_dir=NULL,
                           dependencies=NULL){
  if(missing(entry_point))
    ValueError$new("Please provide an entry point.")

  .valid_git_config(git_config)
  dest_dir = fs::file_temp()
  .generate_and_run_clone_command(git_config, dest_dir)

  .checkout_branch_and_commit(git_config, dest_dir)

  updated_paths = list(
    "entry_point"= entry_point,
    "source_dir"= source_dir,
    "dependencies"= dependencies)

  # check if the cloned repo contains entry point, source directory and dependencies
  if (!is.null(source_dir)){
    if (!fs::is_dir(fs::path(dest_dir, source_dir)))
      ValueError$new("Source directory does not exist in the repo.")
    if (!fs::is_file(fs::path(dest_dir, source_dir, entry_point)))
      ValueError$new("Entry point does not exist in the repo.")
    updated_paths$source_dir = file.path(dest_dir, source_dir)
  } else {
    if (fs::is_file(fs::path(dest_dir, entry_point)))
      updated_paths$entry_point = file.path(dest_dir, entry_point)
    else
      ValueError$new("Entry point does not exist in the repo.")
  }

  if (!islistempty(dependencies)) {
    updated_paths$dependencies = list()
    for (path in dependencies){
      if (fs::dir_exists(fs::path(dest_dir, path)))
        updated_paths$dependencies = c(updated_paths$dependencies, file.path(dest_dir, path))
      else
        ValueError$new(sprintf("Dependency %s does not exist in the repo.",path))
    }
  }
  return(updated_paths)
}

# Validates the git configuration.
# Checks all configuration values except 2FA_enabled are string types. The
# 2FA_enabled configuration should be a boolean.
# Args:
#   git_config: The configuration to validate.
.valid_git_config <- function(git_config){
  if (!("repo" %in% names(git_config)))
    ValueError$new("Please provide a repo for git_config.")
  for (key in names(git_config)){
    if (key == "2FA_enabled"){
      if (!inherits(git_config[["2FA_enabled"]], "logical")){
        ValueError$new("Please enter a logical type for 2FA_enabled'.")}
    } else if(!inherits(git_config[[key]], "character")){
      ValueError$new(sprintf("'%s' must be a string.",key))}
  }
}


# check if a git_config param is valid, if it is, create the command to git
# clone the repo, and run it.
# Args:
#   git_config ((dict[str, str]): Git configurations used for cloning files,
#               including ``repo``, ``branch`` and ``commit``.
#               dest_dir (str): The local directory to clone the Git repo into.
.generate_and_run_clone_command <- function(git_config,
                                            dest_dir){
  if (startsWith(git_config$repo, "https://git-codecommit")
      || startsWith(git_config$repo, "ssh://git-codecommit")){
    .clone_command_for_codecommit(git_config, dest_dir)
  } else {
    .clone_command_for_github_like(git_config, dest_dir)}
}

# check if a git_config param representing a GitHub (or like) repo is
# valid, if it is, create the command to git clone the repo, and run it.
# Args:
#   git_config ((dict[str, str]): Git configurations used for cloning files,
#               including ``repo``, ``branch`` and ``commit``.
#               dest_dir (str): The local directory to clone the Git repo into.
.clone_command_for_github_like <- function(git_config,
                                           dest_dir){
  is_https = startsWith(git_config$repo,"https://")
  is_ssh = startsWith(git_config$repo,"git@")
  if (!is_https && !is_ssh)
    ValueError$new("Invalid Git url provided.")
  if (is_ssh)
    .clone_command_for_ssh(git_config, dest_dir)
  else if ("2FA_enabled" %in% names(git_config) && git_config[["2FA_enabled"]])
    .clone_command_for_github_like_https_2fa_enabled(git_config, dest_dir)
  else
    .clone_command_for_github_like_https_2fa_disabled(git_config, dest_dir)
}

.clone_command_for_ssh <- function(git_config, dest_dir){
  if ("username" %in% names(git_config) || "password" %in% names(git_config) || "token" %in% names(git_config))
    warning("SSH cloning, authentication information in git config will be ignored.")
  .run_clone_command(git_config$repo, dest_dir)
}

.clone_command_for_github_like_https_2fa_disabled <- function(git_config, dest_dir){
  updated_url = git_config$repo
  if ("token" %in% names(git_config)){
    if ("username" %in% names(git_config) || "password" %in% names(git_config))
      warning("Using token for authentication, other credentials will be ignored.")
    updated_url = .insert_token_to_repo_url(url=git_config$repo, token=git_config$token)
  } else if ("username" %in% names(git_config) && "password" %in% names(git_config)){
    updated_url = .insert_username_and_password_to_repo_url(
      url=git_config$repo, username=git_config$username, password=git_config$password)
  } else if ("username" %in% names(git_config) || "password" %in% names(git_config))
    warning("Credentials provided in git config will be ignored.")
  .run_clone_command(updated_url, dest_dir)
}

.clone_command_for_github_like_https_2fa_enabled <- function(git_config, dest_dir){
  updated_url = git_config$repo
  if ("token" %in% names(git_config)){
    if ("username" %in% names(git_config) || "password" %in% names(git_config))
      warning("Using token for authentication, other credentials will be ignored.")
    updated_url = .insert_token_to_repo_url(url=git_config$repo, token=git_config$token)}
  .run_clone_command(updated_url, dest_dir)
}

# check if a git_config param representing a CodeCommit repo is valid, if
# it is, create the command to git clone the repo, and run it.
# Args:
#   git_config ((dict[str, str]): Git configurations used for cloning files,
#               including ``repo``, ``branch`` and ``commit``.
#               dest_dir (str): The local directory to clone the Git repo into.
#               Raises:
#                 ValueError: If git_config['repo'] is in the wrong format.
#               CalledProcessError: If failed to clone git repo.
.clone_command_for_codecommit <- function(git_config, dest_dir){
  is_https = startsWith(git_config$repo,"https://git-codecommit")
  is_ssh = startsWith(git_config$repo,"ssh://git-codecommit")
  if (!is_https && !is_ssh)
    ValueError$new("Invalid Git url provided.")
  if ("2FA_enabled" %in% names(git_config))
    warning("CodeCommit does not support 2FA, '2FA_enabled' will be ignored.")
  if ("token" %in% names(git_config))
    warning("There are no tokens in CodeCommit, the token provided will be ignored.")
  if (is_ssh)
    .clone_command_for_ssh(git_config, dest_dir)
  else
    .clone_command_for_codecommit_https(git_config, dest_dir)
}

# Invoke the clone command for codecommit.
# Args:
#   git_config: The git configuration.
# dest_dir: The destination directory for the clone.
.clone_command_for_codecommit_https <- function(git_config, dest_dir){
  updated_url = git_config$repo
  if ("username" %in% names(git_config) && "password" %in% names(git_config))
    updated_url = .insert_username_and_password_to_repo_url(
      url=git_config$repo, username=git_config$username, password=git_config$password)
  else if ("username" %in% names(git_config) || "password" %in% names(git_config))
    warning("Credentials provided in git config will be ignored.")
  .run_clone_command(updated_url, dest_dir)
}


# Run the 'git clone' command with the repo url and the directory to clone the repo into.
# Args:
#   repo_url (str): Git repo url to be cloned.
# dest_dir: (str): Local path where the repo should be cloned into.
# Raises:
#   CalledProcessError: If failed to clone git repo.
.run_clone_command <-function(repo_url, dest_dir){
  if (startsWith(repo_url, "https://")){
    processx::run("git", args = c("clone", repo_url, dest_dir), env = c("current", GIT_TERMINAL_PROMPT = "0"))
  } else if (startsWith(repo_url, "git@")){
    sshnoprompt = fs::file_temp()
    on.exit(unlink(sshnoprompt))
    writeLines("ssh -oBatchMode=yes $@", sshnoprompt)
    fs::file_chmod(sshnoprompt, mode = "511")
    processx::run("git", args = c("clone", repo_url, dest_dir), env = c("current", GIT_SSH = sshnoprompt))
  }
}

# Insert the token to the Git repo url, to make a component of the git
# clone command. This method can only be called when repo_url is an https url.
# Args:
#   url (str): Git repo url where the token should be inserted into.
# token (str): Token to be inserted.
# Returns:
#   str: the component needed fot the git clone command.
.insert_token_to_repo_url <- function(url, token){
  index = nchar("https://")
  if (gregexpr(token, url)[[1]][1] == index)
    return(url)
  return(gsub("https://", paste0("https://",token, "@"), url))
}

# Insert the username and the password to the Git repo url, to make a
# component of the git clone command. This method can only be called when
# repo_url is an https url.
# Args:
#   url (str): Git repo url where the token should be inserted into.
# username (str): Username to be inserted.
# password (str): Password to be inserted.
# Returns:
#   str: the component needed for the git clone command.
.insert_username_and_password_to_repo_url <- function(url, username, password){
  password = utils::URLencode(password, reserved = T)
  index = nchar("https://")
  pt1 = substring(url, 1, index)
  pt4 = substring(url, index+1, nchar(url))
  return(paste0(pt1, username, ":", password, "@", pt4))
}

# Checkout the required branch and commit.
# Args:
#   git_config (dict[str, str]): Git configurations used for cloning files,
# including ``repo``, ``branch`` and ``commit``.
# dest_dir (str): the directory where the repo is cloned
.checkout_branch_and_commit <- function(git_config, dest_dir){
  dest_dir = as.character(dest_dir)
  if ("branch" %in% names(git_config)){
    processx::run("git", args = c("checkout", git_config$branch), wd = dest_dir)
  }
  if ("commit" %in% names(git_config)){
    processx::run("git", args = c("checkout", git_config$commit), wd = dest_dir)
  }
}
DyfanJones/sagemaker-r-local documentation built on June 14, 2022, 10:32 p.m.