```r
knitr::opts_chunk$set(echo = TRUE)
```
This vignette emulates the Google Cloud blog post "Train and deploy ML models with R and plumber on Vertex AI" and the related notebook `get_started_vertex_training_r_using_r_kernel.ipynb` from the GoogleCloudPlatform/vertex-ai-samples repository.
Core concept: create a `TrainingPipeline` that runs a `CustomJob` and imports the resulting artifacts as a `Model`.
Run the following chunk to install `googleCloudVertexAIR` and the other R packages required to complete this tutorial (each package is checked first and installed only if it is missing):
```r
required_packages <- c("remotes", "googleAuthR", "withr", "glue", "jsonlite")
missing_packages <- required_packages[!(required_packages %in% installed.packages()[, "Package"])]
if (length(missing_packages)) install.packages(missing_packages)

# remotes::install_github("justinjm/googleCloudVertexAIR") # run first time
# options(googleAuthR.verbose = 0) # set when debugging
```
```r
library(withr) # for gcloud sdk: https://stackoverflow.com/a/64381848/1812363
library(glue)

# Helper to run gcloud/gsutil commands: glues R values into the command string,
# runs it with the Cloud SDK on the PATH, and either prints or returns the output.
sh <- function(cmd, args = c(), intern = FALSE) {
  with_path(path.expand("~/google-cloud-sdk/bin/"), {
    if (is.null(args)) {
      cmd <- glue(cmd)
      s <- strsplit(cmd, " ")[[1]]
      cmd <- s[1]
      args <- s[2:length(s)]
    }
    ret <- system2(cmd, args, stdout = TRUE, stderr = TRUE)
    if ("errmsg" %in% attributes(attributes(ret))$names) cat(attr(ret, "errmsg"), "\n")
    if (intern) return(ret) else cat(paste(ret, collapse = "\n"))
  })
}
```
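For example, assuming the Google Cloud SDK is installed under `~/google-cloud-sdk/` (the path the helper prepends), you can quickly check that the helper can reach it:

```r
# print the installed SDK version
sh("gcloud --version")

# capture output as a character vector instead of printing it
active_account <- sh("gcloud config get-value account", intern = TRUE)
```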
Create a file called `.Renviron` in your project's working directory and set the following environment variables:

- `GAR_SERVICE_JSON` - path to the service account (JSON) keyfile you downloaded and copied earlier
- `GCVA_DEFAULT_PROJECT_ID` - ID of the GCP project you configured earlier
- `GCVA_DEFAULT_REGION` - region of the GCP resources, one of: `"us-central1"` or `"eu"`

For example, your `.Renviron` should look like:
```
# .Renviron
GAR_SERVICE_JSON="/Users/me/auth/auth.json"
GCVA_DEFAULT_PROJECT_ID="my-project"
GCVA_DEFAULT_REGION="us-central1"
```
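After restarting R (or reloading the file), you can optionally confirm the values are visible to the session; this sanity check is not part of the original post:

```r
# reload .Renviron from the working directory and inspect the three variables
readRenviron(".Renviron")
Sys.getenv(c("GAR_SERVICE_JSON", "GCVA_DEFAULT_PROJECT_ID", "GCVA_DEFAULT_REGION"))
```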
TODO
```r
library(googleAuthR)
library(googleCloudVertexAIR)

options(googleAuthR.scopes.selected = "https://www.googleapis.com/auth/cloud-platform")

gar_auth_service(json_file = Sys.getenv("GAR_SERVICE_JSON"))
```
```r
gcva_region_set(region = "us-central1")
gcva_project_set(projectId = Sys.getenv("GCVA_DEFAULT_PROJECT_ID"))

timestamp <- strftime(Sys.time(), "%Y%m%d%H%M%S")
paste0("timestamp: ", timestamp)

datasetDisplayName <- sprintf("california-housing-%s", timestamp)
paste0("datasetDisplayName: ", datasetDisplayName)

bucket_name <- paste0(gcva_project_get(), "-aip-", timestamp)
bucket_uri <- paste0("gs://", bucket_name)
paste0("bucket_name: ", bucket_name)
paste0("bucket_uri: ", bucket_uri)

private_repo <- "my-docker-repo"
image_name <- "vertex-r"
image_tag <- "latest"
image_uri <- glue(
  "{gcva_region_get()}-docker.pkg.dev/{gcva_project_get()}/{private_repo}/{image_name}:{image_tag}"
)
image_uri
```
sh("gcloud config set project {gcva_project_get()}")
sh("gsutil mb -l {gcva_region_get()} -p {gcva_project_get()} {bucket_uri}")
Set the newly created bucket as a global argument:
```r
gcva_bucket_set(bucket_uri)
```
Your first step is to create your own Docker repository in Google Artifact Registry.
Run the `gcloud artifacts repositories create` command to create a new Docker repository in your region with the description "docker repository".
Run the `gcloud artifacts repositories list` command to verify that your repository was created.
```r
sh('gcloud artifacts repositories create {private_repo} --repository-format=docker --location={gcva_region_get()} --description="Docker repository"')
```
sh("gcloud artifacts repositories list")
Before you push or pull container images, configure Docker to use the `gcloud` command-line tool to authenticate requests to Artifact Registry for your region.
sh("gcloud auth configure-docker {gcva_project_get()}-docker.pkg.dev --quiet")
dir.create("src", showWarnings=FALSE)
`src/Dockerfile`:
```r
Dockerfile = cat("FROM gcr.io/deeplearning-platform-release/r-cpu.4-1:latest

WORKDIR /root

COPY train.R /root/train.R
COPY serve.R /root/serve.R

# Install Fortran
RUN apt-get update
RUN apt-get install gfortran -yy

# Install R packages
RUN Rscript -e \"install.packages('plumber')\"
RUN Rscript -e \"install.packages('randomForest')\"

EXPOSE 8080
", file = "src/Dockerfile")
```
`src/train.R`:
```r
cat('#!/usr/bin/env Rscript
library(tidyverse)
library(data.table)
library(randomForest)
Sys.getenv()

# The GCP Project ID
project_id <- Sys.getenv("CLOUD_ML_PROJECT_ID")
# The GCP Region
location <- Sys.getenv("CLOUD_ML_REGION")
# The Cloud Storage URI to upload the trained model artifact to
model_dir <- Sys.getenv("AIP_MODEL_DIR")

# Next, you create directories to download our training, validation, and test set into.
dir.create("training")
dir.create("validation")
dir.create("test")

# You download the Vertex AI managed data sets into the container environment locally.
system2("gsutil", c("cp", Sys.getenv("AIP_TRAINING_DATA_URI"), "training/"))
system2("gsutil", c("cp", Sys.getenv("AIP_VALIDATION_DATA_URI"), "validation/"))
system2("gsutil", c("cp", Sys.getenv("AIP_TEST_DATA_URI"), "test/"))

# For each data set, you may receive one or more CSV files that you will read into data frames.
training_df <- list.files("training", full.names = TRUE) %>% map_df(~fread(.))
validation_df <- list.files("validation", full.names = TRUE) %>% map_df(~fread(.))
test_df <- list.files("test", full.names = TRUE) %>% map_df(~fread(.))

print("Starting Model Training")
rf <- randomForest(median_house_value ~ ., data=training_df, ntree=100)
rf

saveRDS(rf, "rf.rds")
system2("gsutil", c("cp", "rf.rds", model_dir))
', file = "src/train.R")
```
`src/serve.R`:
```r
cat('#!/usr/bin/env Rscript
Sys.getenv()

library(plumber)

system2("gsutil", c("cp", "-r", Sys.getenv("AIP_STORAGE_URI"), "."))
system("du -a .")

rf <- readRDS("artifacts/rf.rds")
library(randomForest)

predict_route <- function(req, res) {
  print("Handling prediction request")
  df <- as.data.frame(req$body$instances)
  preds <- predict(rf, df)
  return(list(predictions=preds))
}

print("Starting Serving")

pr() %>%
  pr_get(Sys.getenv("AIP_HEALTH_ROUTE"), function() "OK") %>%
  pr_post(Sys.getenv("AIP_PREDICT_ROUTE"), predict_route) %>%
  pr_run(host = "0.0.0.0", port=as.integer(Sys.getenv("AIP_HTTP_PORT", 8080)))
', file = "src/serve.R")
```
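At this point `src/` contains the Dockerfile, `train.R`, and `serve.R`. If you have Docker installed locally, you can optionally build the image on your machine to validate the Dockerfile before handing it to Cloud Build; this local check assumes a working Docker installation and is not part of the original post:

```r
# optional local validation of the Dockerfile (the base image is large, so this can take a while)
sh("docker build -t {image_name} src")
```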
sh("gcloud services enable artifactregistry.googleapis.com")
sh("cd src && gcloud builds submit --region={gcva_region_get()} --tag={image_uri} --timeout=1h")
data_uri <- "gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv" dataset <- gcva_create_tabluar_dataset( displayName = datasetDisplayName, gcsSource = data_uri) dataset
```r
job <- gcva_custom_container_training_job(
  displayName = image_name,
  containerUri = image_uri,
  command = c("Rscript", "train.R"),
  modelServingContainerCommand = c("Rscript", "serve.R"),
  modelServingContainerImageUri = image_uri
)

job
```
```r
gcva_bucket_set(bucket_uri) # set here during testing only

model <- gcva_run_job(
  job = job,
  dataset = dataset,
  modelDisplayName = "vertex-r-model",
  machineType = "n1-standard-4"
)

model
```
```
2023-01-20 17:37:59> bucket set to 'gs://gc-vertex-ai-r-aip-20230120172747'
2023-01-20 17:38:00> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
2023-01-20 17:38:00> pipeline state: PIPELINE_STATE_RUNNING
2023-01-20 17:43:00> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
2023-01-20 17:43:00> pipeline state: PIPELINE_STATE_RUNNING
2023-01-20 17:48:01> view training: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=gc-vertex-ai-r
2023-01-20 17:48:01> pipeline state: PIPELINE_STATE_SUCCEEDED
==Google Cloud Vertex AI TrainingPipeline Job==
console: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/947230504279605248?project=442003009360
state: PIPELINE_STATE_SUCCEEDED
```
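Once the pipeline reaches `PIPELINE_STATE_SUCCEEDED`, the `rf.rds` artifact written by `train.R` should appear under the staging bucket; listing the bucket is an optional way to confirm this (not part of the original post):

```r
# recursively list the staging bucket to locate the uploaded model artifact (rf.rds)
sh("gsutil ls -r {bucket_uri}")
```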
- Doc: Overview of getting predictions on Vertex AI | Google Cloud
- API: https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints/create
```r
endpoint <- gcva_create_endpoint(
  displayName = "CaliforniaHousingEndpoint"
)

endpoint
```
```r
# fetch the trained model by resource name (supplied via an env var here for testing)
model <- gcva_model(model = Sys.getenv("GCVA_TEST_MODEL_NAME_CUSTOM"))

gcva_deploy(
  model = model,
  endpoint = endpoint,
  machineType = "n1-standard-4"
)
```
Now we will test the deployed model by performing an online prediction.
First, we create a sample set of input data from the original source, selecting the first 5 rows for ease of demonstration:
```r
library(jsonlite)

## create example data for prediction request
data_raw <- read.csv(text = sh("gsutil cat {data_uri}", intern = TRUE))

## convert all to string values to build correct prediction request
data <- as.data.frame(lapply(data_raw, as.character))

## build list of 5 examples/rows only for testing
instances <- list(instances = head(data[, names(data) != "median_house_value"], 5))

## convert to required format of JSON
json_instances <- toJSON(instances, auto_unbox = TRUE)
json_instances
```
```r
gcva_predict(
  endpoint = endpoint,
  instances = json_instances
)
```
```r
# gcva_undeploy_all(endpoint = endpoint)
gcva_delete_endpoint(endpoint = endpoint)
```
```r
gcva_delete_job()
```
```r
gcva_delete_dataset(displayName = datasetDisplayName)
```
Local directory:
unlink("src", recursive = TRUE) # will delete directory called 'mydir'
GCS bucket:
```r
delete_bucket <- TRUE
if (delete_bucket == TRUE) sh("gsutil -m rm -r {bucket_uri}")
```