# Knitr setup: echo all code chunks in the rendered notebook output.
knitr::opts_chunk$set(echo = TRUE)
The purpose of this notebook is to evaluate the Entity Extraction model, which was developed in Python.
# Load (installing first if necessary) the packages this notebook needs.
# requireNamespace() replaces require(): it only checks availability and has
# no attach side effect before we know pacman is actually installed.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# Qualified call, since pacman is no longer attached by the check above.
pacman::p_load(
  caret,
  dplyr,
  readxl,
  reticulate,
  stringr,
  tidyr
)
The following imports functions defined in the sourced R scripts.
In order to evaluate the performance of the Entity Extraction Model, we need to use the same dataset used in training the model.
# Load the held-out evaluation data: target classes (file has no header row)
# and the corresponding feature/test set.
targets_test <- read.csv(
  file = "./../data/entity_extraction_test_target_classes.csv",
  header = FALSE
)
features_test <- read.csv(file = "./../data/entity_extraction_test_set.csv")
In order to use the Entity Extraction model in R, we need to import the Tensorflow Python package via the Reticulate Package. In addition, to ensure the training/test split is equivalent, we will also use the Sci-kit Learn Module.
Note: In order to load Python modules in R, a Virtual Environment or Conda Environment must be created, connected to, and the relevant packages loaded. Those steps occurred before executing this notebook.
# Import the Python modules used below via reticulate.
# General
np <- import("numpy")       # NumPy: array construction and RNG seeding
tf <- import("tensorflow")  # TensorFlow: Keras model loading and inference
With Tensorflow loaded, we can upload the Entity Extraction model.
# Load the Entity Extraction Keras model that was trained and exported in
# Python (SavedModel directory; per its name it presumably bundles its text
# preprocessing — TODO confirm).
path_model <- "./../data/models_python/entity_extraction_w_processing_keras/"
model <- tf$keras$models$load_model(path_model)
# Fix both the NumPy and TensorFlow RNGs so predictions are reproducible.
random_seed <- 5590L
np$random$seed(random_seed)
tf$random$set_seed(random_seed)
With the model loaded we will inspect to ensure it looks as we expect.
# Print the layer-by-layer architecture to confirm the model loaded as expected.
model$summary()
In order to evaluate the model performance, we will use the same test dataset that was used when developing the model in Python.
# Extract Test Text: pull the raw hypothesis strings as a character vector.
X_test <- features_test %>%
  pull(text)
With our test set in the form we need, we can generate predictions.
# Convert to Numpy Array
X_test_np <- np$array(X_test)
# Per-token class predictions, one row per hypothesis.
# NOTE(review): Sequential$predict_classes() was deprecated in TF 2.5 and
# removed in TF 2.6 — if the environment pins a newer TensorFlow, this call
# will fail and needs np$argmax(model$predict(X_test_np), axis = -1L)
# (assuming softmax outputs — confirm against the Python training code).
y_pred <- model$predict_classes(X_test_np)
The Entity Extraction model returns the predictions as a 2D array. For easier downstream use, we will convert this structure into a list of vectors, with each vector being the class predictions for a hypothesis.
# Split the 2D prediction array into a list of per-hypothesis class vectors
# (one list element per row of y_pred).
# lapply() over seq_len() replaces the original nested index loops: the result
# is preallocated, the commented-out debug prints are gone, and the zero-row
# edge case (where 1:dim(y_pred)[1] would yield c(1, 0)) is handled correctly.
y_pred_lst <- lapply(seq_len(nrow(y_pred)), function(row_idx) y_pred[row_idx, ])
We need to select the target test set as well. Currently we have all target values in a dataframe. We need to reduce this to the test set, and convert to the same form as our predictions.
# Convert Dataframe to List of Vectors: transpose so each observation (row)
# becomes a column, then coerce the data frame into a list of those columns.
targets_transposed <- as.data.frame(t(targets_test))
y_test_lst <- as.list(targets_transposed)
For final evaluations we need to convert both the test and prediction sets from a list of vectors into a single flat vector, with each observation appended to the previous ones.
# Flatten a list of vectors into one vector, preserving element order.
#
# @param input_list A list of atomic vectors.
# @return A single vector containing every element of `input_list` in order;
#   NULL for an empty list (matching the original implementation).
list_to_vector <- function(input_list) {
  # Reduce(c, ...) reproduces the original sequential c() loop exactly
  # (element-level names kept, list-level names ignored) without growing a
  # vector one chunk at a time. This also drops the original's dead
  # `output_vector_len` variable, which incorrectly referenced the global
  # `y_pred_lst` instead of `input_list`.
  Reduce(c, input_list)
}

# Convert predictions and targets into flat vectors for evaluation.
y_pred_vec <- list_to_vector(y_pred_lst)
y_test_vec <- list_to_vector(y_test_lst)
With our data in the correct format, we can finally evaluate the performance of the model against the test set, in order to compare the model performance to what was observed in Python.
# Pair predicted and observed classes as factors, then report the confusion
# matrix (accuracy plus per-class statistics) for comparison with the
# metrics observed during Python training.
y_test_pred_df <- data.frame(
  pred = as.factor(y_pred_vec),
  obs = as.factor(y_test_vec)
)
caret::confusionMatrix(y_test_pred_df$pred, y_test_pred_df$obs)
# Manual re-computation of overall accuracy and per-class (node 1 / node 2)
# sensitivity, as a cross-check on caret::confusionMatrix().
# The original index loop grew five vectors with append() and used `=`
# assignment; every quantity is a plain elementwise comparison, so compute
# them vectorized. Printed output is unchanged.
is_error <- y_test_vec != y_pred_vec

# Accuracy - Overall: share of observations classified correctly.
acc_overall <- 1 - mean(is_error)
print(paste0("Accuracy - Overall: ", round(acc_overall * 100, 1), "%"))

# Sensitivity - Node 1 Classification: share of true class-1 observations
# classified correctly. (NaN if class 1 never occurs, as in the original.)
sen_node_1 <- 1 - sum(is_error & y_test_vec == 1) / sum(y_test_vec == 1)
print(paste0("Sensitivity - Node 1: ", round(sen_node_1 * 100, 1), "%"))

# Sensitivity - Node 2: same, for true class-2 observations.
sen_node_2 <- 1 - sum(is_error & y_test_vec == 2) / sum(y_test_vec == 2)
print(paste0("Sensitivity - Node 2: ", round(sen_node_2 * 100, 1), "%"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.