cranlift: Serve a 'CRAN'-Like Repository as a RESTful API

Documented in serve

#' Serve a CRAN-like Package Repository
#'
#' Starts a local web server to serve packages from the repository.
#'
#' @param repo The location of the package repository.
#' @param repo_name The name of the repository, written to \code{DESCRIPTION}
#'   files.
#' @param host An IPv4 address owned by the server. Defaults to localhost.
#' @param port The port to run the server on.
#' @param detach Whether the server should run in the foreground or return
#'   immediately and run in the background.
#' @param use_archive Whether old source packages should be moved to an Archive.
#' @param fields Metadata for each package to add to the index. When
#'   \code{NULL}, use the defaults. See \code{\link[tools]{write_PACKAGES}}.
#'
#' @return
#'
#' When \code{detach = TRUE}, this function returns a handle that can be passed
#' to \code{\link{stopServer}}. Otherwise it will not return at all, and must
#' be interrupted from the console.
#'
#' @export
#' @importFrom utils contrib.url
serve <- function(repo, repo_name = "Cranlift", host = "127.0.0.1", port = 8000,
                  detach = FALSE, use_archive = TRUE, fields = NULL) {
  config <- list(
    use_archive = use_archive %||% TRUE,
    fields = fields %||% required_fields,
    repo_name = repo_name
  )

  # Handle correct repository initialization.
  repo <- repository(repo, fields = config$fields)

  # Keep the package index in memory.
  env <- new.env(FALSE, size = 1L)
  env$index <- lapply(repo$contrib_urls, function(url) {
    readRDS(file.path(url, "PACKAGES.rds"))
  })
  names(env$index) <- repo$contrib_urls

  if (!detach) {
    httpuv::runServer(host, port, list(
      call = router(repo$path, config, env)
    ))
  } else {
    httpuv::startServer(host, port, list(
      call = router(repo$path, config, env)
    ))
  }
}

router <- function(repo, config, env) {
  function(req) {
    path <- httpuv::decodeURIComponent(req$PATH_INFO)
    Encoding(path) <- "UTF-8"

    if (path == "/_ping") {
      return(ping())
    }

    # Serve the index out of memory so that we can be sure it is always up-to-
    # date vis-a-vis this server.
    if (req$REQUEST_METHOD %in% c("GET", "HEAD") &&
          grepl("PACKAGES.rds$", path)) {
      location <- file.path(repo, sub("^/", "", path))
      if (!dirname(location) %in% names(env$index)) {
        return(not_found())
      }
      if (req$REQUEST_METHOD == "GET") {
        # NOTE: We're not using the traditional compression here.
        body <- serialize(env$index[[dirname(location)]], connection = NULL)
      } else {
        body <- raw(0)
      }
      return(list(
        status = 200L,
        headers = list(
          "Content-Type" = "application/octet-stream",
          "Content-Length" = length(body)
        ),
        body = body
      ))
    }

    # No support for pre-3.0 clients, i.e. raw PACKAGES or PACKAGES.gz files.
    if (req$REQUEST_METHOD %in% c("GET", "HEAD") &&
          grepl("PACKAGES(|.gz)$", path)) {
      return(bad_request("Pre-3.0 clients are not supported."))
    }

    # Here's a nickel kid, use a real web server instead.
    if (req$REQUEST_METHOD %in% c("GET", "HEAD")) {
      location <- file.path(repo, sub("^/", "", path))
      res <- if (file.exists(location)) {
        size <- file.info(location)[, "size"]
        if (req$REQUEST_METHOD == "GET") {
          con <- file(location, "rb", raw = TRUE)
          on.exit(close(con))
          body <- readBin(con, "raw", n = size)
        } else {
          body <- raw(0)
        }
        list(
          status = 200L,
          headers = list(
            # This is what CRAN uses as the mime type.
            "Content-Type" = if (endsWith(location, "zip")) "zip" else "x-gzip",
            # Send the real content length for HEAD requests.
            # See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.13
            "Content-Length" = size
          ),
          body = body
        )
      } else {
        not_found()
      }
      return(res)
    }

    if (req$REQUEST_METHOD == "POST" && path == "/") {
      parsed <- webutils::parse_http(req$rook.input$read(), req$CONTENT_TYPE)
      if (!"file" %in% names(parsed)) {
        return(bad_request("Request must contain a 'file' in the form."))
      }

      # TODO: Have a per-session upload directory. Note that we need the
      # temporary filename to be package-like, otherwise desc::description$new()
      # won't handle it correctly.
      temp_file <- file.path(tempdir(), parsed$file$filename)
      con <- file(temp_file, open = "wb", raw = TRUE)
      on.exit(close(con))
      writeBin(parsed$file$value, con)
      # Make sure we wrote the whole file (even if it is large) before we
      # attempt to extract metadata from it.
      flush(con)

      # TODO: The only reason we do this at the moment is to "validate" the
      # package. We don't do anything with the result as of yet. Should we?
      pkg <- try({
        desc <- desc::description$new(temp_file)
        desc$get(config$fields)
      })
      if (inherits(pkg, "try-error")) {
        # TODO: Log the error.
        return(bad_request("Invalid package bundle."))
      }
      pkg <- as.data.frame(t(pkg), stringsAsFactors = FALSE)

      bundle <- sprintf("%s_%s.tar.gz", pkg$Package, pkg$Version)
      location <- file.path(contrib.url(repo, type = "source"), bundle)
      res <- if (file.exists(location)) {
        list(
          status = 409L,
          headers = list(
            "Content-Type" = "text/plain; charset=utf-8",
            "Location" = paste0("/src/contrib/", bundle)
          ),
          body = "Package already exists on the server. Use PUT to replace it."
        )
      } else {
        # FIXME: Handle potential copying errors.
        file.copy(temp_file, location)
        cranlike::add_PACKAGES(
          basename(location), dir = dirname(location), fields = config$fields
        )
        # Update the in-memory representation.
        env$index[[dirname(location)]] <- readRDS(
          file.path(dirname(location), "PACKAGES.rds")
        )
        list(
          status = 201L,
          headers = list(
            "Content-Type" = "text/plain; charset=utf-8",
            "Location" = paste0("/src/contrib/", bundle)
          ),
          body = ""
        )
      }
      return(res)
    }

    if (req$REQUEST_METHOD == "PUT") {
      location <- file.path(repo, sub("^/", "", path))
      if (!dirname(location) %in% names(env$index)) {
        # TODO: Allow for creating valid new contrib URLs with an empty index.
        return(bad_request("URI does not match the repository structure."))
      }

      parsed <- webutils::parse_http(req$rook.input$read(), req$CONTENT_TYPE)
      if (!"file" %in% names(parsed)) {
        return(bad_request("Request must contain a 'file' in the form."))
      }

      temp_file <- file.path(tempdir(), parsed$file$filename)
      con <- file(temp_file, open = "wb", raw = TRUE)
      on.exit(close(con))
      writeBin(parsed$file$value, con)
      # Make sure we wrote the whole file (even if it is large) before we
      # attempt to extract metadata from it.
      flush(con)

      pkg <- try({
        desc <- desc::description$new(temp_file)
        desc$get(config$fields)
      })
      if (inherits(pkg, "try-error")) {
        # TODO: Log the error.
        return(bad_request("Invalid package bundle."))
      }
      pkg <- as.data.frame(t(pkg), stringsAsFactors = FALSE)

      bundle <- sprintf(
        "%s_%s.%s", pkg$Package, pkg$Version,
        # Match the existing extension.
        sub("(.*)\\.(tar\\.gz|zip|tgz)", "\\2", parsed$file$filename)
      )
      if (basename(location) != bundle) {
        return(bad_request("URI does not match the upload contents."))
      }

      status <- if (file.exists(location)) 200L else 201L
      # FIXME: Handle potential copying errors.
      file.copy(temp_file, location)
      cranlike::add_PACKAGES(
        basename(location), dir = dirname(location), fields = config$fields
      )
      # Update the in-memory representation.
      env$index[[dirname(location)]] <- readRDS(
        file.path(dirname(location), "PACKAGES.rds")
      )
      return(list(
        status = status,
        headers = list(
          "Content-Type" = "text/plain; charset=utf-8",
          "Location" = paste0("/src/contrib/", bundle)
        ),
        body = raw(0)
      ))
    }

    if (req$REQUEST_METHOD == "DELETE") {
      location <- file.path(repo, sub("^/", "", path))

      if (grepl("PACKAGES", location, fixed = TRUE)) {
        return(bad_request("Package indices cannot be deleted."))
      }

      res <- if (file.exists(location)) {
        if (config$use_archive) {
          # TODO: Is there a more cannonical regex we can use?
          regexp <- "([^_]+)_([0-9\\.]+)\\.(tar\\.gz|zip|tgz)"
          if (!grepl(regexp, basename(location))) {
            stop("Unexpected package path format: ", basename(location))
          }
          name <- sub(regexp, "\\1", basename(location))
          archive <- file.path(dirname(location), "Archive", name)
          dir.create(archive, showWarnings = FALSE, recursive = TRUE)
          file.copy(
            location, file.path(archive, basename(location)), copy.date = TRUE
          )
        }

        # Files can be deleted from the archive by default.
        if (grepl("Archive", location)) {
          unlink(location)
        } else {
          cranlike::remove_PACKAGES(basename(location), dirname(location))
          # Update the in-memory representation.
          env$index[[dirname(location)]] <- readRDS(
            file.path(dirname(location), "PACKAGES.rds")
          )
        }

        list(
          status = 200L,
          headers = list(
            "Content-Type" = "text/plain; charset=utf-8",
            "Location" = path
          ),
          body = raw(0)
        )
      } else {
        not_found()
      }
      return(res)
    }

    not_found()
  }
}

ping <- function() {
  list(status = 200L, body = "", headers = list(
    "Content-Type" = "text/plain; charset=utf-8"
  ))
}

not_found <- function() {
  list(status = 404L, body = "", headers = list(
    "Content-Type" = "text/plain; charset=utf-8"
  ))
}

bad_request <- function(msg) {
  list(status = 400L, body = msg, headers = list(
    "Content-Type" = "text/plain; charset=utf-8"
  ))
}

date <- function() {
  format.POSIXlt(
    as.POSIXlt(Sys.time(), tz = "GMT"), format = "%a, %d %b %Y %H:%M:%S %Z",
    usetz = FALSE
  )
}

`%||%` <- function(lhs, rhs) if (!is.null(lhs)) lhs else rhs

# From tools:::.get_standard_repository_db_fields().
required_fields <- c(
  "Package", "Version", "Priority", "Depends", "Imports", "LinkingTo",
  "Suggests", "Enhances", "License", "License_is_FOSS", "License_restricts_use",
  "OS_type", "Archs", "MD5sum", "NeedsCompilation"
)