In e-sensing/activelearning: Active Learning Satellite Image Time Series

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(activelearning)
library(caret)
library(dplyr)
library(ensurer)
library(magrittr)
library(sits)
library(terra)

n_samples <- 20
n_iterations <- 5
n_multicores <- parallel::detectCores() / 2

sits_method <- sits::sits_xgboost(verbose = FALSE)

#' Compute accuracy by comparing a reference to a prediction raster.
#'
#' @param ref_rast   A terra's raster of reference.
#' @param pred_rast  A terra's raster with predictions.
#' @param ref_labels A named character. The names of the labels in ref_rast.
#' @return           A named numeric character. The overall, producer and user
#'                   accuracy.
compute_accuracy <- function(ref_rast, pred_rast, ref_labels) {

    reference_vec  <- dplyr::recode(ref_rast[],  !!!ref_labels)
    prediction_vec <- dplyr::recode(pred_rast[], !!!ref_labels)

    conf_mat <- caret::confusionMatrix(data = factor(prediction_vec,
                                                     levels = ref_labels),
                                       reference = factor(reference_vec,
                                                          levels = ref_labels))
    cf_mat <- as.matrix(conf_mat[["table"]])
    overall_accuracy  <- conf_mat$overall["Accuracy"]

    by_class <- conf_mat[["byClass"]]
    f1_score <- by_class[, "F1"]
    prod_acc <- by_class[, "Pos Pred Value"]
    user_acc <- by_class[, "Sensitivity"]
    class_names <- stringr::str_sub(rownames(by_class), 8)
    names(f1_score) <- paste0(class_names, "_f1") 
    names(prod_acc) <- paste0(class_names, "_pa")
    names(user_acc) <- paste0(class_names, "_ua")

    pua <- c(f1_score, prod_acc, user_acc)
    pua <- pua[match(sort(names(pua)), names(pua))]

    return(c(overall_accuracy, pua))
}

#' Count the number of NAs in a tibble.
#'
#' @param x A tibble.
#' @return  An integer.
#'                   accuracy.
count_ts_na <- function(x){
  sum(is.na(x))
}


#' Get the labels for the oracle samples.
#'
#' @param oracle_samples A tibble.
#' @param ref_raster     A classified terra raster.
#' @param ref_labels     The labels used during the classification of the
#'                       raster of reference.
#' @return               A tibble. The oracle samples with updated labels.
get_oracle_labels <- function(oracle_samples, ref_raster, 
                              ref_labels) {

  oracle_sf <-  sf::st_as_sf(oracle_samples,
                             coords = c("longitude", "latitude"),
                             crs = 4326) %>%
    sf::st_transform(crs = terra::crs(ref_raster))

  oracle_labels <- terra::extract(ref_raster, 
                                  terra::vect(oracle_sf)) %>%
    dplyr::pull(lyr1) %>%
    dplyr::recode(!!!ref_labels)

  # Update the labels using data from the oracle.
  oracle_samples <- oracle_samples %>% 
    dplyr::mutate(label = oracle_labels)

  return(oracle_samples)
}

Active Learning improves the results of a classification by feeding the classifier with informative samples.

Active Learning selects unlabeled samples from a data set using query strategies designed to take advantage of the labeled samples in the data set and also the properties of the classifier. Active Learning can be understood as a set made of a classifier (C), a set of samples both labeled (L) and unlabeled (U), a heuristic (Q), and a user (S). The user S assigns labels to the samples in U using Q as criterion, while the classifier C uses the samples in L to produce a classification model [@Crawford2013].

Reference classification {#reference_classification}

To establish a base-line for comparing different Active Learning heuristics, we prepared a classification using the data available in the package sits [@Simoes2021].

We are using sits' local cube MODIS TERRA collection 6 that covers Sinop, a Brazilian town in the state of Mato Grosso for the crop-year 2014(2013/09/14 - 2014/08/29).
The samples used for training a classification model also come from the sits package. The sits tibble containing the samples is called samples_modis_4bands and includes 1218 samples.

This classification uses an ensemble algorithm (extreme gradient boosting XGBoost [@Chen2015]) and the vegetation indexes EVI and NDVI. Its results are shown below.

#---- Do a full classification and use it as ground truth ----

samples_tb <- sits::samples_modis_4bands %>%
    sits::sits_select(bands = c("NDVI", "EVI"))

classification_interval <- samples_tb %>% 
  sits::sits_timeline() %>% 
  range()

modis_cube <- sits::sits_cube(
    source = "LOCAL",
    name = "sinop-2014",
    origin = "BDC",
    collection = "MOD13Q1-6",
    satellite = "TERRA",
    sensor = "MODIS",
    data_dir = system.file("extdata/raster/mod13q1", 
                           package = "sits"),
    delim = "_",
    parse_info = c("X1", "X2", "tile", "band", "date"),
    start_date = dplyr::first(classification_interval),
    end_date   = dplyr::last(classification_interval)
)

sits_model  <- sits::sits_train(data = samples_tb, 
                                ml_method = sits_method)

probs_cube  <- sits::sits_classify(data = modis_cube,
                                   ml_model = sits_model,
                                   output_dir = tempdir(),
                                   memsize = 4,
                                   multicores = 1)

label_cube <- sits::sits_label_classification(probs_cube,
                                              output_dir = tempdir())

# Labels used during the classification.
reference_labels <- sits_model %>% 
  environment() %>% 
  magrittr::extract2("labels") %>% 
  magrittr::set_names(1:length(.))

# Get a raster with the ground truth.
reference_raster <- label_cube %>% 
  magrittr::extract2("file_info") %>% 
  magrittr::extract2(1) %>% 
  magrittr::extract2("path") %>% 
  terra::rast()

plot(label_cube, 
     title = "Ground truth")

EGAL: Exploration Guided Active Learning

EGAL in an Active Learning heuristic that ranks unlabeled samples using on two measures: density and diversity [@Hu2010].

Density measures the amount of neighbors of a sample. The more neighbors a sample has, the more unlikely the sample is an outlier. Outliers are low-density samples with high uncertainty that can trick Active Learning heuristics into selecting non-representative samples.

Diversity promotes the exploration of the sample set. It is a measure of dissimilarity among samples and favors samples which are farther apart among the sample set.

EGAL ranks candidate samples based on both their density and diversity, and selects those examples with the highest density for labelling first. Thus, examples close to each other in the feature space will not be selected successively for labelling [@Hu2010].

Unlike Random Sampling, EGAL doesn't compute the classification probabilities of each point, it just needs density and diversity measures to rank the points to be sent to the oracle.

Note that Active Learning using EGAL is independent of the classifier (unlike random sampling). In consequence, it is possible to build a large set of points for the oracle from several iterations without doing a classification.

Example 1

# Let's suppose we have only 5 samples of each label to start the 
# classification.
train_samples <- samples_tb %>%
    dplyr::group_by(label) %>%
    dplyr::sample_n(5) %>%
    dplyr::ungroup() %>%
    magrittr::set_class(class(sits::samples_modis_4bands))

accuracy_egal <- tibble::tibble()

for (i in 1:n_iterations) {

  # Let's take 400 random points in the area.
  random_points <- 
    dplyr::bind_rows(al_get_random_points(data_cube = modis_cube,
                                          n_samples = 400,
                                          multicores = n_multicores))

  # Now let's rank the random points and choose those with higher EGAL. These 
  # points will be sent to the oracle for labelling.
  oracle_samples <- train_samples %>%
    dplyr::bind_rows(random_points) %>% 
    al_egal() %>% 
    dplyr::filter(!is.na(egal)) %>% 
    dplyr::arrange(dplyr::desc(egal)) %>% 
    dplyr::slice(1:n_samples) %>%
    dplyr::select(-egal)

  # NOTE: Here we should go to the oracle. Instead, we're getting labels from
  # the reference raster.
  oracle_samples <- oracle_samples %>%
    get_oracle_labels(ref_raster = reference_raster,
                      ref_labels = reference_labels)

  # Now we join the oracle samples to the training samples. 
  train_samples <- train_samples %>%
    dplyr::bind_rows(oracle_samples)

  # We're ready to run a classification!

  out_dir <- file.path(tempdir(), paste0("iter_egal_", i))
  dir.create(out_dir)

  egal_model  <- sits::sits_train(train_samples,
                                  ml_method = sits_method)

  egal_probs_cube <- sits::sits_classify(modis_cube,
                                         ml_model = egal_model,
                                         output_dir = out_dir,
                                         memsize = 4,
                                         multicores = n_multicores)

  egal_label_cube <- sits::sits_label_classification(egal_probs_cube,
                                                     output_dir = out_dir)

  prediction_raster <- egal_label_cube %>% 
    magrittr::extract2("file_info") %>% 
    magrittr::extract2(1) %>% 
    magrittr::extract2("path") %>%
    terra::rast()

  # Compute the classification's accuracy.
  accuracy_row <- compute_accuracy(ref_rast = reference_raster,
                                   pred_rast = prediction_raster,
                                   ref_labels = reference_labels) %>% 
    data.frame() %>%
    t() %>%
    as_tibble() %>% 
    mutate(n_samples = nrow(train_samples))

  accuracy_egal <- accuracy_egal %>% 
    dplyr::bind_rows(accuracy_row)
}

knitr::kable(accuracy_egal %>% dplyr::select(n_samples, dplyr::everything()), 
             digits = 2)

plot(egal_label_cube, title = "EGAL")