R/legacy_kaggle.R

Defines functions board_wait_create kaggle_resource_exists board_pin_versions.pins_board_kaggle board_browse.pins_board_kaggle board_pin_remove.pins_board_kaggle board_pin_get.pins_board_kaggle kaggle_competition_files board_pin_find.pins_board_kaggle board_pin_search_kaggle board_pin_create.pins_board_kaggle kaggle_create_bundle kaggle_create_resource kaggle_upload_resource kaggle_qualify_name kaggle_auth kaggle_auth_info board_register_kaggle legacy_kaggle

Documented in board_register_kaggle legacy_kaggle

#' Kaggle board (legacy API)
#'
#' These functions are no longer supported because of changes to the Kaggle
#' API and will be removed in a future version of pins. We recommend that you
#' use the [Kaggle CLI](https://www.kaggle.com/docs/api) instead.
#'
#' To use a Kaggle board, you need to first download a token file from
#' your account.
#'
#' # Sharing
#'
#' When working in teams, you might want to share your pins with others. For
#' You can do by adding users or making the dataset public on Kaggle's website.
#'
#' Once you share with specific users, they can follow the same steps to
#' register a Kaggle board which allows them to download and upload pins
#'
#' @param token The Kaggle token as a path to the `kaggle.json` file, can
#'   be `NULL` if the `~/.kaggle/kaggle.json` file already exists.
#' @inheritParams new_board
#' @keywords internal
#' @examples
#' \dontrun{
#' # the following example requires a Kaggle API token
#' board <- legacy_kaggle(token = "path/to/kaggle.json")
#'
#' pin_find("crowdflower", board = board)
#'
#' # names starting with c/ are competitions
#' pin_get("c/crowdflower-weather-twitter", board = board)
#' }
#' @export
legacy_kaggle <- function(token = NULL, name = "kaggle", ...) {
  token <- token %||% "~/.kaggle/kaggle.json"
  if (!file.exists(token)) {
    stop("Kaggle token file '", token, "' does not exist.", call. = FALSE)
  }

  board <- new_board_v0(
    "pins_board_kaggle",
    name = name,
    token = token,
    ...
  )

  board
}

#' @rdname legacy_kaggle
#' @export
board_register_kaggle <- function(name = "kaggle",
                                  token = NULL,
                                  cache = NULL,
                                  ...) {

  cache <- cache %||% board_cache_path(name)

  board <- legacy_kaggle("kaggle",
    name = name,
    token = token,
    cache = cache,
    ...
  )
  board_register2(board)
}


kaggle_auth_info <- function(board) {
  jsonlite::read_json(board$token)
}

kaggle_auth <- function(board) {
  kaggle_keys <- kaggle_auth_info(board)

  httr::authenticate(
    kaggle_keys$username,
    kaggle_keys$key
  )
}

kaggle_qualify_name <- function(name, board) {
  qualified <- name
  if (!grepl("/", qualified)) qualified <- paste0(kaggle_auth_info(board)$username, "/", name)

  qualified
}

kaggle_upload_resource <- function(path, board) {
  path <- normalizePath(path)
  if (!file.exists(path)) stop("Invalid path: ", path)

  content_length <- file.info(path)$size
  modified <- as.integer(file.info(path)$mtime)

  url <- paste0("https://www.kaggle.com/api/v1/datasets/upload/file/", content_length, "/", modified)

  results <- httr::POST(url, body = list(fileName = basename(path)), kaggle_auth(board))

  if (httr::http_error(results)) stop("Upload registration failed with status ", httr::status_code(results))

  parsed <- httr::content(results, encoding = "UTF-8")

  if (!identical(parsed$error, NULL)) stop("Upload registration failed: ", parsed$error)

  upload_url <- parsed$createUrl
  token <- parsed$token

  results <- httr::PUT(upload_url,
    body = httr::upload_file(normalizePath(path)), kaggle_auth(board),
    http_utils_progress("up", size = file.info(normalizePath(path))$size)
  )

  if (httr::http_error(results)) stop("Upload failed with status ", httr::status_code(results))

  parsed <- httr::content(results, encoding = "UTF-8")

  if (!identical(parsed$error, NULL)) stop("Upload failed: ", parsed$error)

  token
}

kaggle_create_resource <- function(name, description, token, type, metadata, notes, board) {
  if (kaggle_resource_exists(board, name)) {
    owner <- kaggle_auth_info(board)$username
    url <- paste("https://www.kaggle.com/api/v1/datasets/create/version", owner, name, sep = "/")

    if (is.null(notes)) notes <- paste("Updated version")

    body <- list(
      convertToCsv = jsonlite::unbox(FALSE),
      files = data.frame(
        token = token
      ),
      versionNotes = jsonlite::unbox(notes),
      title = jsonlite::unbox(description),
      deleteOldVersions = jsonlite::unbox(identical(board$versions, FALSE))
    )
  }
  else {
    url <- "https://www.kaggle.com/api/v1/datasets/create/new"

    body <- list(
      convertToCsv = jsonlite::unbox(FALSE),
      files = data.frame(
        token = token
      ),
      isPrivate = jsonlite::unbox(TRUE),
      licenseName = jsonlite::unbox("CC0-1.0"),
      ownerSlug = jsonlite::unbox(kaggle_auth_info(board)$username),
      slug = jsonlite::unbox(name),
      title = jsonlite::unbox(description),
      categories = list()
    )
  }

  results <- httr::POST(url, body = body, kaggle_auth(board), encode = "json")

  if (httr::http_error(results)) stop("Resource creation failed with status ", httr::status_code(results))

  parsed <- httr::content(results, encoding = "UTF-8")

  if (!identical(parsed$error, NULL)) stop("Resource creation failed: ", parsed$error)

  parsed$url
}

kaggle_create_bundle <- function(path, type, description) {
  check_installed("zip")
  bundle_path <- tempfile()
  dir.create(bundle_path)
  on.exit(unlink(bundle_path, recursive = TRUE))

  if (dir.exists(path)) {
    file.copy(file.path(path, dir(path)), bundle_path, recursive = TRUE)
  }
  else {
    file.copy(path, bundle_path)
  }

  bundle_file <- tempfile(fileext = ".zip")
  withr::with_dir(
    bundle_path,
    zip::zipr(bundle_file, dir(bundle_path))
  )

  bundle_file
}

#' @export
board_pin_create.pins_board_kaggle <- function(board, path, name, metadata, notes = NULL, ...) {
  description <- metadata$description
  type <- metadata$type

  if (is.null(description) || nchar(description) == 0) description <- paste("A pin for the", gsub("-pin$", "", name), "dataset")
  if (!file.exists(path)) stop("File does not exist: ", path)

  if (identical(list(...)$use_zip, TRUE)) {
    temp_bundle <- kaggle_create_bundle(path, type, description)
    on.exit(unlink(temp_bundle))

    token <- kaggle_upload_resource(temp_bundle, board)
  }
  else {
    token <- list()

    if (dir.exists(path)) {
      upload_files <- file.path(path, dir(path))
    }
    else {
      upload_files <- path
    }

    for (upload_file in upload_files) {
      token[[length(token) + 1]] <- kaggle_upload_resource(upload_file, board)
    }

    token <- unlist(token)
  }

  kaggle_create_resource(name, description, token, type, metadata, notes, board)

  qualified_name <- paste0(kaggle_auth_info(board)$username, "/", name)

  board_wait_create(board, qualified_name)
}

board_pin_search_kaggle <- function(board, text = NULL, base_url = "https://www.kaggle.com/api/v1/datasets/list?") {
  if (identical(text, NULL) || length(text) == 0 || nchar(text) == 0) {
    params <- "group=my"
  }
  else {
    params <- paste0("search=", text)
  }

  url <- utils::URLencode(paste0(base_url, params))

  results <- httr::GET(url, kaggle_auth(board))
  if (httr::http_error(results)) stop("Finding pin failed with status ", httr::status_code(results))

  httr::content(results, encoding = "UTF-8")
}

#' @export
board_pin_find.pins_board_kaggle <- function(board, text, extended = FALSE, ...) {
  if (is.null(text)) text <- ""

  # clear name searches
  text <- gsub("^[^/]+/", "", text)

  # search private dataserts first since they won't search by default
  results <- board_pin_search_kaggle(board)
  reults <- Filter(function(e) grepl(text, e$ref), results)

  results <- c(results, board_pin_search_kaggle(board, text))

  if (nchar(text) > 0) {
    competitions <- board_pin_search_kaggle(board, text, base_url = "https://www.kaggle.com/api/v1/competitions/list?") %>%
      lapply(function(e) {
        e$ref <- paste0("c/", e$ref)
        e
      })
    results <- c(results, competitions)
  }

  results <- pin_entries_to_dataframe(results)

  if (identical(extended, TRUE)) {
    results$name <- results$ref
    results$description <- results$title
    results$ref <- NULL
    results$title <- NULL
    return(results)
  }

  if (length(results) == 0) {
    return(board_empty_results())
  }

  data.frame(
    name = as.character(results$ref),
    description = as.character(results$title),
    type = "files",
    metadata = rep(as.character(jsonlite::toJSON(list(extension = "zip"), auto_unbox = TRUE)), length(results$ref)),
    stringsAsFactors = FALSE
  ) %>%
    unique()
}

kaggle_competition_files <- function(board, name) {
  url <- paste0("https://www.kaggle.com/api/v1/competitions/data/list/", gsub("^c/", "", name))

  results <- httr::GET(url, kaggle_auth(board))
  if (httr::http_error(results)) stop("Finding pin failed with status ", httr::status_code(results))

  httr::content(results, encoding = "UTF-8")
}

#' @export
board_pin_get.pins_board_kaggle <- function(board, name, extract = NULL, version = NULL, ...) {
  if (grepl("^c/", name)) {
    competition_files <- name <- gsub("^c/", "", name)
    if (!grepl("/", name)) {
      files <- kaggle_competition_files(board, name)
      competition_files <- paste0(name, "/", sapply(files, function(e) e$ref))
    }
    url <- paste0("https://www.kaggle.com/api/v1/competitions/data/download/", competition_files)
    extract <- FALSE

    etag <- max(sapply(files, function(e) e$creationDate))
    content_length <- sum(sapply(files, function(e) e$totalBytes))
  }
  else {
    if (!grepl("/", name)) name <- paste(kaggle_auth_info(board)$username, name, sep = "/")
    url <- paste0("https://www.kaggle.com/api/v1/datasets/download/", name)

    extended <- pin_find(name = name, board = board, extended = TRUE)

    etag <- if (is.null(extended$lastUpdated)) "" else as.character(extended$lastUpdated)
    content_length <- if (is.null(extended$totalBytes)) 0 else as.integer(extended$totalBytes)
  }

  subpath <- name
  if (!is.null(version)) {
    url <- paste0(url, "?datasetVersionNumber=", version)
    subpath <- file.path(name, pin_versions_path_name(), version)
    etag <- NULL
  }

  local_path <- pin_download_files(url,
    name,
    board,
    config = kaggle_auth(board),
    custom_etag = etag,
    extract = !identical(extract, FALSE),
    content_length = content_length,
    subpath = subpath
  )

  local_path
}

#' @export
board_pin_remove.pins_board_kaggle <- function(board, name, ...) {
  qualified <- kaggle_qualify_name(name, board)
  stop("Please remove dataset from: https://www.kaggle.com/", qualified, "/settings")
}

#' @export
board_browse.pins_board_kaggle <- function(board, ...) {
  utils::browseURL("https://www.kaggle.com/datasets?tab=my")
}

#' @export
board_pin_versions.pins_board_kaggle <- function(board, name, ...) {
  if (!grepl("/", name)) name <- paste(kaggle_auth_info(board)$username, name, sep = "/")

  url <- paste0("https://www.kaggle.com/api/v1/datasets/view/", name)

  response <- httr::GET(url, kaggle_auth(board))

  if (httr::http_error(response)) stop("Failed to view dataset with status ", httr::status_code(response))

  parsed <- httr::content(response, encoding = "UTF-8")

  if (httr::http_error(response)) {
    stop("Failed to retrieve commits from ", board$repo, ": ", parsed$message)
  }

  tibble::tibble(
    version = sapply(parsed$versions, function(e) e$versionNumber),
    created = sapply(parsed$versions, function(e) e$creationDate),
    author = sapply(parsed$versions, function(e) e$creatorName),
    message = sapply(parsed$versions, function(e) e$versionNotes),
  )
}

kaggle_resource_exists <- function(board, name) {
  if (!grepl("/", name)) name <- paste(kaggle_auth_info(board)$username, name, sep = "/")

  url <- paste0("https://www.kaggle.com/api/v1/datasets/view/", name)

  response <- httr::GET(url, kaggle_auth(board))

  !httr::http_error(response)
}


board_wait_create <- function(board, name) {
  retrieved <- NULL
  retries <- 10
  while (retries > 0 && is.null(retrieved)) {
    retrieved <- suppressWarnings(tryCatch(
      {
        pin_get(name, board)
      },
      error = function(e) NULL
    ))

    Sys.sleep(1)
    retries <- retries - 1
  }
}

Try the pins package in your browser

Any scripts or data that you put into this service are public.

pins documentation built on Nov. 10, 2023, 1:06 a.m.