library(sager)
library(reticulate)
library(stringr)
library(readr)
library(ggplot2)
library(dplyr)
library(configr)
# virtualenv_create('sage')
# virtualenv_install(
# 'sage',
# c(
# 'sagemaker',
# 'boto3',
# 'numpy',
# 'pandas'
# ),
# ignore_installed = TRUE
# )
# Read AWS credentials/region from the local INI file, point reticulate at the
# virtualenv that has the SageMaker SDK installed, and open a SageMaker session.
# (Was `=`; `<-` is the conventional R assignment operator.)
config_data <- configr::read.config('config.ini')
# Find in py_config() the environment you created and paste it here.
use_python('/Users/digitalfirstmedia/.virtualenvs/sage/bin/python')
# Export credentials so boto3 (used under the hood by the SageMaker SDK) can
# authenticate. configure_aws() comes from the sager package loaded above.
configure_aws(
  config_data$aws$user,
  config_data$aws$pass,
  config_data$aws$region
)
sagemaker <- import('sagemaker')            # Python SageMaker SDK via reticulate
session <- sagemaker$Session()              # boto3-backed session object
bucket <- session$default_bucket()          # account/region default S3 bucket
# NOTE(review): get_execution_role() only resolves inside a SageMaker notebook
# instance; running locally usually requires passing an explicit role ARN.
role_arn <- sagemaker$get_execution_role()
# Fetch the UCI abalone data set (headerless CSV) and name the columns.
data_file <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
abalone_cols <- c('sex', 'length', 'diameter', 'height', 'whole_weight',
                  'shucked_weight', 'viscera_weight', 'shell_weight', 'rings')
abalone <- read_csv(file = data_file, col_names = FALSE)
names(abalone) <- abalone_cols
abalone$sex <- as.factor(abalone$sex)
# Quick visual sanity check: rings (age proxy) vs. height, coloured by sex.
ggplot(abalone, aes(x = height, y = rings, color = sex)) +
  geom_point() +
  geom_jitter()
# Drop observations with an impossible height of zero, one-hot encode sex
# (F/M/I) as 0/1 integer dummies, and reorder so the target (rings) comes
# first — SageMaker's built-in XGBoost expects the label in column 1.
# `as.integer(logical)` replaces the redundant `as.integer(ifelse(x, 1, 0))`.
abalone <- abalone %>%
  filter(height != 0) %>%
  mutate(
    female = as.integer(sex == 'F'),
    male   = as.integer(sex == 'M'),
    infant = as.integer(sex == 'I')
  ) %>%
  select(-sex) %>%
  # Same column order as the original two-step select: label, dummies, features.
  select(rings, female, male, infant, length:shell_weight)
# 70/15/15 split: sample the training rows, remove them, then split the
# remainder evenly into test and validation.
# NOTE(review): no set.seed() here, so the split differs on every run — add
# one if reproducibility matters.
# NOTE(review): anti_join() with no `by` matches on all columns, so any fully
# duplicated rows are removed together; acceptable for this data set.
abalone_train <- sample_frac(abalone, size = 0.7)
abalone <- anti_join(abalone, abalone_train)
abalone_test <- sample_frac(abalone, size = 0.5)
abalone_valid <- anti_join(abalone, abalone_test)
# Headerless CSVs, label first — the format SageMaker's XGBoost consumes.
write_csv(abalone_train, 'abalone_train.csv', col_names = FALSE)
write_csv(abalone_valid, 'abalone_valid.csv', col_names = FALSE)
# Upload the training and validation CSVs to the session bucket under data/.
s3_train <- session$upload_data(path = 'abalone_train.csv',
                                bucket = bucket,
                                key_prefix = 'data')
s3_valid <- session$upload_data(path = 'abalone_valid.csv',
                                bucket = bucket,
                                key_prefix = 'data')
# (Removed a redundant repeat of `abalone_valid <- anti_join(abalone,
# abalone_test)` — neither input had changed, so it recomputed the same frame.)
# Wrap the S3 URIs as training channels.
# NOTE(review): s3_input() is SageMaker SDK v1; v2 renamed it
# sagemaker$inputs$TrainingInput — confirm which SDK version is installed.
s3_train_input <- sagemaker$s3_input(s3_data = s3_train,
                                     content_type = 'csv')
s3_valid_input <- sagemaker$s3_input(s3_data = s3_valid,
                                     content_type = 'csv')
# Resolve the regional ECR registry for the built-in XGBoost image and
# configure the training job (SageMaker Python SDK v1 argument names).
registry <- sagemaker$amazon$amazon_estimator$registry(session$boto_region_name,
                                                       algorithm = 'xgboost')
container <- paste0(registry, '/xgboost:latest')  # paste0 over paste(sep = '')
s3_output <- paste0('s3://', bucket, '/output')
estimator <- sagemaker$estimator$Estimator(
  image_name = container,
  role = role_arn,
  train_instance_count = 1L,             # pass Python ints as R integers (L)
  train_instance_type = 'ml.m5.4xlarge',
  train_volume_size = 30L,               # EBS volume size in GB
  train_max_run = 3600L,                 # hard stop after one hour
  input_mode = 'File',
  output_path = s3_output,
  output_kms_key = NULL,
  base_job_name = NULL,
  sagemaker_session = NULL
)
estimator$set_hyperparameters(num_round = 40L)  # 40 boosting rounds
# SageMaker requires globally unique training-job names. The original stamp
# used only %H-%M-%S, which collides when the script is re-run at the same
# time on a different day — include the date as well.
job_name <- paste('sagemaker-train-xgboost',
                  format(Sys.time(), '%Y-%m-%d-%H-%M-%S'),
                  sep = '-')
input_data <- list('train' = s3_train_input,
                   'validation' = s3_valid_input)
# Blocks until the training job completes.
estimator$fit(inputs = input_data,
              job_name = job_name)
estimator$model_data  # S3 URI of the trained model artifact
# Stand up a real-time inference endpoint and configure it for CSV payloads.
model_endpoint <- estimator$deploy(initial_instance_count = 1L,
                                   instance_type = 'ml.t2.medium')
model_endpoint$content_type <- 'text/csv'
model_endpoint$serializer <- sagemaker$predictor$csv_serializer
# Score a sample of the held-out test set against the live endpoint, then
# tear the endpoint down.
abalone_test <- abalone_test[-1]  # drop the label column (rings) before predicting
# Guard against the test set holding fewer rows than the requested sample size
# (the original hard-coded 500 and would error on a smaller split).
num_predict_rows <- min(500, nrow(abalone_test))
test_sample <- as.matrix(abalone_test[1:num_predict_rows, ])
dimnames(test_sample)[[2]] <- NULL  # strip names; endpoint wants raw CSV rows
predictions <- model_endpoint$predict(test_sample)
# The endpoint returns a single comma-separated string; split and coerce.
predictions <- str_split(predictions, pattern = ',', simplify = TRUE)
predictions <- as.numeric(predictions)
abalone_test <- cbind(predicted_rings = predictions,
                      abalone_test[1:num_predict_rows, ])
# Delete the endpoint so it stops accruing charges.
session$delete_endpoint(model_endpoint$endpoint)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.