#' In this example, we'll introduce how to use the TensorFlow Estimators API to
#' jointly train a wide linear model and a deep feed-forward neural network.
#' This approach combines the strengths of memorization and generalization. It's
#' useful for generic large-scale regression and classification problems with
#' sparse input features (e.g., categorical features with a large number of
#' possible feature values). If you're interested in learning more about how
#' Wide & Deep Learning works, please check out the [white
#' paper](http://arxiv.org/abs/1606.07792).
#'
#' ![Wide & Deep](https://www.tensorflow.org/images/wide_n_deep.svg)
#'
#' The figure above shows a comparison of a wide model (logistic regression with
#' sparse features and transformations), a deep model (feed-forward neural
#' network with an embedding layer and several hidden layers), and a Wide & Deep
#' model (joint training of both). At a high level, there are only 3 steps to
#' configure a wide, deep, or Wide & Deep model using the TF Estimators API:
#'
#' - Select features for the wide part: Choose the sparse base columns and
#'   crossed columns you want to use.
#' - Select features for the deep part: Choose the continuous columns, the
#'   embedding dimension for each categorical column, and the hidden layer
#'   sizes.
#' - Put them all together in a Wide & Deep model
#'   (`dnn_linear_combined_classifier`).
#'
#' And that's it! Let's go through a simple example.
#'
#'
#' ### Download Data
#'
#' First of all, let's download the census data:
library(tfestimators)
# Load the census (UCI "Adult") data, downloading it on first use.
#
# When both cache files exist they are read back as-is. Otherwise the raw
# files are fetched, given `column_names_to_assign` as column names, written
# to the two cache paths, and returned.
#
# Arguments:
#   train_data_path, test_data_path  Paths of the cached CSV files.
#   column_names_to_assign           Character vector of column names for the
#                                    raw (header-less) files; must have one
#                                    name per column.
#   train_url, test_url              Locations of the raw files; anything that
#                                    read.csv() accepts as `file`.
#
# Returns a list with elements `train_data` and `test_data` (data frames whose
# character columns have surrounding whitespace removed).
maybe_download_census <- function(
    train_data_path,
    test_data_path,
    column_names_to_assign,
    train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test") {

  # The raw files separate fields with ", ", so string values arrive with a
  # leading blank; strip it from every character column.
  trim_character_cols <- function(df) {
    df[] <- lapply(df, function(x) if (is.character(x)) trimws(x) else x)
    df
  }

  read_trimmed <- function(file, ...) {
    trim_character_cols(read.csv(file, stringsAsFactors = FALSE, ...))
  }

  if (file.exists(train_data_path) && file.exists(test_data_path)) {
    train_data <- read_trimmed(train_data_path, header = TRUE)
    test_data <- read_trimmed(test_data_path, header = TRUE)
  } else {
    message("Downloading census data ...")

    # adult.data has no header line: every line is a record, so nothing may
    # be skipped (skipping one line would drop the first training example).
    train_data <- read_trimmed(train_url, header = FALSE)
    # adult.test starts with a single non-data line that has to be skipped.
    test_data <- read_trimmed(test_url, header = FALSE, skip = 1)

    stopifnot(
      length(column_names_to_assign) == ncol(train_data),
      length(column_names_to_assign) == ncol(test_data)
    )
    colnames(train_data) <- column_names_to_assign
    colnames(test_data) <- column_names_to_assign

    write.csv(train_data, train_data_path, row.names = FALSE)
    write.csv(test_data, test_data_path, row.names = FALSE)
  }

  list(train_data = train_data, test_data = test_data)
}
# Column names assigned positionally to the raw census files.
COLNAMES <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income_bracket"
)

# Load both data sets; the CSV cache files live in the working directory.
downloaded_data <- maybe_download_census(
  train_data_path = file.path(getwd(), "train_census.csv"),
  test_data_path = file.path(getwd(), "test_census.csv"),
  column_names_to_assign = COLNAMES
)
train_data <- downloaded_data[["train_data"]]
test_data <- downloaded_data[["test_data"]]
#' ### Define Base Feature Columns
#'
#' Next, let's define the base categorical and continuous feature columns that
#' we'll use. These base columns will be the building blocks used by both the
#' wide part and the deep part of the model.
#'
# Categorical base columns with a known, small vocabulary.

# Keyed on "sex": that is the name this column carries in COLNAMES, so a
# "gender" key would not match any input column.
gender <- column_categorical_with_vocabulary_list(
  "sex", vocabulary_list = c("Female", "Male"))

education <- column_categorical_with_vocabulary_list(
  "education",
  vocabulary_list = c(
    "Bachelors", "HS-grad", "11th", "Masters", "9th",
    "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
    "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
    "Preschool", "12th"))

marital_status <- column_categorical_with_vocabulary_list(
  "marital_status",
  vocabulary_list = c(
    "Married-civ-spouse", "Divorced", "Married-spouse-absent",
    "Never-married", "Separated", "Married-AF-spouse", "Widowed"))

relationship <- column_categorical_with_vocabulary_list(
  "relationship",
  vocabulary_list = c(
    "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
    "Other-relative"))

workclass <- column_categorical_with_vocabulary_list(
  "workclass",
  vocabulary_list = c(
    "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
    "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"))

# To show an example of hashing: no vocabulary is listed for these columns;
# values are hashed into a fixed number of buckets instead.
occupation <- column_categorical_with_hash_bucket(
  "occupation", hash_bucket_size = 1000, dtype = tf$string)
native_country <- column_categorical_with_hash_bucket(
  "native_country", hash_bucket_size = 1000, dtype = tf$string)

# Continuous base columns.
age <- column_numeric("age")
education_num <- column_numeric("education_num")
capital_gain <- column_numeric("capital_gain")
capital_loss <- column_numeric("capital_loss")
hours_per_week <- column_numeric("hours_per_week")

# Transformations: bucketize age at the listed boundaries.
age_buckets <- column_bucketized(
  age, boundaries = c(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))

base_columns <- c(
  gender, native_country, education, occupation, workclass, relationship,
  age_buckets)
#' ### The Wide Model: Linear Model with Crossed Feature Columns
#' The wide model is a linear model with a wide set of sparse and crossed feature columns:
# Inputs for the wide (linear) part: six base columns followed by three
# hashed feature crosses.
crossed_columns <- feature_columns(
  native_country,
  education,
  occupation,
  workclass,
  relationship,
  age_buckets,
  column_crossed(
    c("education", "occupation"),
    hash_bucket_size = 10000
  ),
  column_crossed(
    c("native_country", "occupation"),
    hash_bucket_size = 10000
  ),
  column_crossed(
    c(age_buckets, "education", "occupation"),
    hash_bucket_size = 10000
  )
)
#' Wide models with crossed feature columns can memorize sparse interactions
#' between features effectively. That being said, one limitation of crossed
#' feature columns is that they do not generalize to feature combinations that
#' have not appeared in the training data. Let's add a deep model with
#' embeddings to fix that.
#'
#' ### The Deep Model: Neural Network with Embeddings
#'
#' The deep model is a feed-forward neural network, as shown in the previous
#' figure. Each sparse, high-dimensional categorical feature is first
#' converted into a low-dimensional, dense real-valued vector, often referred
#' to as an embedding vector. These low-dimensional dense embedding vectors are
#' concatenated with the continuous features, and then fed into the hidden
#' layers of a neural network in the forward pass. The embedding values are
#' initialized randomly, and are trained along with all other model parameters
#' to minimize the training loss. If you're interested in learning more about
#' embeddings, check out the TensorFlow tutorial on Vector Representations of
#' Words, or Word Embedding on Wikipedia.
#'
#' We'll configure the embeddings for the categorical columns using
#' `column_embedding()`, and concatenate them with the continuous columns:
# Inputs for the deep (DNN) part: an 8-dimensional embedding of each
# categorical column, followed by the continuous columns.
deep_columns <- feature_columns(
  # Embedded categorical columns.
  column_embedding(workclass, dimension = 8),
  column_embedding(education, dimension = 8),
  column_embedding(relationship, dimension = 8),
  column_embedding(native_country, dimension = 8),
  column_embedding(occupation, dimension = 8),
  # Continuous columns.
  age, education_num, capital_gain, capital_loss, hours_per_week
)
#' The higher the dimension of the embedding is, the more degrees of freedom the
#' model will have to learn the representations of the features. For simplicity,
#' we set the dimension to 8 for all feature columns here. Empirically, a more
#' informed decision for the number of dimensions is to start with a value on
#' the order of $\log_2{n}$ or $k\sqrt[4]{n}$, where n is the number of unique features in a
#' feature column and k is a small constant (usually smaller than 10).
#'
#' Through dense embeddings, deep models can generalize better and make
#' predictions on feature pairs that were previously unseen in the training
#' data. However, it is difficult to learn effective low-dimensional
#' representations for feature columns when the underlying interaction matrix
#' between two feature columns is sparse and high-rank. In such cases, the
#' interaction between most feature pairs should be zero except a few, but dense
#' embeddings will lead to nonzero predictions for all feature pairs, and thus
#' can over-generalize. On the other hand, linear models with crossed features
#' can memorize these “exception rules” effectively with fewer model parameters.
#' Now, let's see how to jointly train wide and deep models and allow them to
#' complement each other’s strengths and weaknesses.
#'
#' ### Combining Wide and Deep Models into One
#'
#' The wide models and deep models are combined by summing up their final output
#' log odds as the prediction, then feeding the prediction to a logistic loss
#' function. All the graph definition and variable allocations have already been
#' handled for you under the hood, so you simply need to create a
#' dnn_linear_combined_classifier:
# Combined estimator: two hidden layers (100 and 50 units) over the deep
# columns, plus a linear part over the wide columns.
model <- dnn_linear_combined_classifier(
  dnn_hidden_units = c(100, 50),
  dnn_feature_columns = deep_columns,
  linear_feature_columns = crossed_columns
)
#' ### Training and Evaluating The Model
# Build labels according to income bracket: 1 for ">50K", 0 otherwise.
# A substring match is used rather than `==` because the raw test file writes
# its labels with a trailing period (">50K."), which an exact comparison
# against ">50K" never matches (every test label would become 0).
train_data$income_bracket <- as.character(train_data$income_bracket)
test_data$income_bracket <- as.character(test_data$income_bracket)
train_data$label <- as.numeric(
  grepl(">50K", train_data$income_bracket, fixed = TRUE))
test_data$label <- as.numeric(
  grepl(">50K", test_data$income_bracket, fixed = TRUE))
# Build an input function over `dataset` with `label` as the response.
# NOTE(review): `features = -label` presumably selects every column except
# `label` -- confirm against the input_fn() documentation.
constructed_input_fn <- function(dataset) {
input_fn(dataset, features = -label, response = label)
}
train_input_fn <- constructed_input_fn(train_data)
eval_input_fn <- constructed_input_fn(test_data)
# Train on the training set, then evaluate on the test set, 2 steps each.
train(model, input_fn = train_input_fn, steps = 2)
evaluate(model, input_fn = eval_input_fn, steps = 2)
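# NOTE: the two lines below are website boilerplate, not part of the example;
# they are kept as comments so that the script parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.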