knitr::opts_chunk$set(echo = TRUE)

Overview

This tutorial emulates the official Vertex AI SDK for Python notebook tutorials, using R and the googleCloudVertexAIR package.

Setup

Installation

Run the following chunk to install googleCloudVertexAIR and the other R packages required to complete this tutorial (it first checks whether each package is already installed and only installs the ones that are missing):

# install helper packages only if they are missing
required_packages <- c("remotes", "googleAuthR")
missing_packages <- required_packages[!(required_packages %in%
                                          installed.packages()[, "Package"])]
if (length(missing_packages)) install.packages(missing_packages)

# install googleCloudVertexAIR itself (run the first time only)
# remotes::install_github("justinjm/googleCloudVertexAIR")

.Renviron

Create a file called .Renviron in your project's working directory and set the following environment variables. Restart your R session (or call readRenviron(".Renviron")) so the values are picked up.

For example, your .Renviron should look like this:

# .Renviron
GAR_SERVICE_JSON="/Users/me/auth/auth.json"
GCVA_DEFAULT_PROJECT_ID="my-project"
GCVA_DEFAULT_REGION="us-central1"

Setup Google Cloud Project

TODO: https://github.com/justinjm/googleCloudVertexAIR/issues/26

Authenticate your Google Cloud Account

library(googleAuthR)
library(googleCloudVertexAIR)

options(googleAuthR.scopes.selected = "https://www.googleapis.com/auth/cloud-platform")

gar_auth_service(json_file = Sys.getenv("GAR_SERVICE_JSON"))

Set global arguments

projectId <- Sys.getenv("GCVA_DEFAULT_PROJECT_ID")
gcva_region_set(region = "us-central1")
gcva_project_set(projectId = projectId)

timestamp <- strftime(Sys.time(), "%Y%m%d%H%M%S")
timestamp

Set Vertex AI managed dataset display name

datasetDisplayName <- sprintf("california-housing-%s", timestamp)
datasetDisplayName

Tutorial

Create Dataset

Source dataset: gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv

dataset <- gcva_create_tabluar_dataset(
  displayName = datasetDisplayName,
  gcsSource = "gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv")
dataset

Train Model

Create training pipeline

job <- gcva_automl_tabluar_training_job(
  displayName = sprintf("california-housing-%s", timestamp),
  optimizationPredictionType = "regression",
  column_transformations = list(
          list(numeric     = list(column_name = "longitude")),
          list(numeric     = list(column_name = "latitude")),
          list(numeric     = list(column_name = "housing_median_age")),
          list(numeric     = list(column_name = "total_rooms")),
          list(numeric     = list(column_name = "total_bedrooms")),
          list(numeric     = list(column_name = "population")),
          list(numeric     = list(column_name = "households")),
          list(numeric     = list(column_name = "median_income"))
          )
  )

Run the training pipeline

model <- gcva_run_job(
  job = job,
  dataset = dataset,
  targetColumn = "median_house_value",
  modelDisplayName = sprintf("model-%s", datasetDisplayName))

model

Make a batch prediction request

Make test items for batch input

  1. Create the BigQuery dataset california_housing
  2. Create a BigQuery external table california_housing.source_data from the GCS source CSV file
  3. Create the BigQuery table california_housing.batch_02 from california_housing.source_data (see the sketch after this list)
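
These steps can be scripted from R. The sketch below is one way to do it, assuming the bigrquery package (which is not a dependency of googleCloudVertexAIR) and plain BigQuery SQL run through bq_project_query(); adjust the dataset location and the final SELECT to your needs.

# bigrquery is assumed here; it is not a dependency of googleCloudVertexAIR
library(bigrquery)

# 1. create the BigQuery dataset (the location is an assumption; it must be
#    readable from the Vertex AI region you use)
bq_project_query(projectId, '
  CREATE SCHEMA IF NOT EXISTS california_housing
    OPTIONS (location = "US")')

# 2. define an external table over the public GCS CSV (schema auto-detected)
bq_project_query(projectId, '
  CREATE OR REPLACE EXTERNAL TABLE california_housing.source_data
    OPTIONS (
      format = "CSV",
      uris = ["gs://cloud-samples-data/ai-platform-unified/datasets/tabular/california-housing-tabular-regression.csv"],
      skip_leading_rows = 1)')

# 3. materialize the batch input table batch_02 (adjust the SELECT as needed,
#    e.g. to sample rows)
bq_project_query(projectId, '
  CREATE OR REPLACE TABLE california_housing.batch_02 AS
  SELECT * FROM california_housing.source_data')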

Make the batch prediction request

Set constants

bq_source_uri <- sprintf("bq://%s.california_housing.batch_02", projectId)
bq_destination_prefix <- sprintf("bq://%s.california_housing", projectId)

Execute the request

# for testing, you can hard-code the name of an existing, completed model
# instead of using the model trained above:
# model <- Sys.getenv("GCVA_TEST_MODEL_NAME_AUTOML")

batch_prediction_job <- gcva_batch_predict(
  jobDisplayName = sprintf("california-housing-%s", timestamp),
  model = model,
  bigquerySource= bq_source_uri,
  instancesFormat = "bigquery",
  predictionsFormat = "bigquery",
  bigqueryDestinationPrefix = bq_destination_prefix
)
batch_prediction_job

Wait for completion of batch prediction job

Once the batch prediction job has completed, you can view and use the predictions.
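
A minimal polling sketch is shown below. The function gcva_batch_prediction_job_get() is hypothetical and used only for illustration; check the package reference for the actual function that retrieves a batch prediction job by its resource name. The state values are the standard Vertex AI JobState enum.

repeat {
  # hypothetical getter -- replace with the package's actual function for
  # retrieving a batch prediction job by resource name
  job_status <- gcva_batch_prediction_job_get(batch_prediction_job$name)
  message("state: ", job_status$state)
  if (job_status$state %in% c("JOB_STATE_SUCCEEDED",
                              "JOB_STATE_FAILED",
                              "JOB_STATE_CANCELLED")) break
  Sys.sleep(60)  # wait one minute between checks
}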

Get predictions

Open the BigQuery console, navigate to the dataset where the predictions were saved, then modify and run the query below, replacing TARGET_COLUMN_NAME, BQ_DATASET_NAME, and BQ_PREDICTIONS_TABLE_NAME with your target column, destination dataset, and predictions table:

SELECT 
  predicted_TARGET_COLUMN_NAME.value,
  predicted_TARGET_COLUMN_NAME.lower_bound,
  predicted_TARGET_COLUMN_NAME.upper_bound
FROM BQ_DATASET_NAME.BQ_PREDICTIONS_TABLE_NAME

See more details here: https://cloud.google.com/vertex-ai/docs/tabular-data/classification-regression/get-batch-predictions#retrieve-batch-results
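
For this tutorial the target column is median_house_value and the destination dataset is california_housing, so a concrete version of the query can also be run from R, as sketched below. It again assumes the bigrquery package, and predictions_TIMESTAMP stands for the predictions table that Vertex AI created in the destination dataset (look up its actual name in the BigQuery console).

library(bigrquery)

sql <- sprintf("
  SELECT
    predicted_median_house_value.value,
    predicted_median_house_value.lower_bound,
    predicted_median_house_value.upper_bound
  FROM `%s.california_housing.predictions_TIMESTAMP`",  # replace predictions_TIMESTAMP with the actual table name
  projectId)

predictions <- bq_table_download(bq_project_query(projectId, sql))
head(predictions)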

Cleaning up

gcva_delete_dataset(dataset = dataset)
