#' Iterative algorithm using pivot features
#'
#' An iterative semi-supervised learning approach using pivot features. Uses a
#' random forest classifier to train and predict probabilities.
#'
#' @param source A data frame containing observations and features for the
#' source domains. Must contain a column named "Domain" which specifies the
#' domain each observation belongs to and "Class" which gives the response.
#' @param target A data frame containing unlabelled observations and features
#' for the target domain.
#' @param x_pivot A vector of strings. Gives the column names of features with
#' common distribution between source and target domains.
#' @param x_nonpivot A vector of strings. Gives the column names of features
#' that have a shifted distribution between the target and source domains.
#' @param rho Number of observations to be added to labelled target training set
#' (actual number added is 4*rho).
#'
#' @return `iter_pivot` returns an object of class "iter_pivot".
#'
#' The function `plot` produces a plot showing predictions for the most recent
#' iteration in the two feature spaces.
#'
#' An object of class "iter_pivot" is a list containing the following
#' components:
#'
#' * `final_pred` A data frame containing final predictions.
#'
#' * `all_preds` A data frame containing predictions for each iteration and
#' model.
#'
#' @examples
#' data(sim_pivot_data)
#' source <- sim_pivot_data[sim_pivot_data$Domain == "Source",]
#' target <- sim_pivot_data[sim_pivot_data$Domain == "Target",]
#' model <- iter_pivot(source = source, target = target, x_pivot = "Feature1",
#' x_nonpivot = "Feature2", rho = 3)
#' target$Prediction <- predict(model, newdata = target[c("Feature1", "Feature2")])
#' require(ggplot2)
#' plot(model, aes(x = Feature1, y = Feature2))
#' plot(model, aes(x = Feature1, y = Feature2), type = "Learning")
#'
#' @author Cameron Roach
#'
#' @importFrom magrittr "%>%"
#' @export
iter_pivot <- function(source, target, x_pivot, x_nonpivot, rho = 3) {
source_domains <- as.character(unique(source$Domain))
if (!("Domain" %in% names(target))) target$Domain <- "Target"
# Do initial model fit for each source domain using pivot features only
fit_S <- list()
fit_T <- list()
LS_p <- list()
UT <- list()
LT <- list()
LT_final <- list()
hyperplane_params <- dplyr::tibble()
for (iS in source_domains) {
# Initialise
LS_p[[iS]] <- source %>%
dplyr::filter(Domain == iS) %>%
#dplyr::mutate(Index = paste0(iS, row_number())) %>%
dplyr::select(dplyr::one_of(c("Class", x_pivot)))
UT[[iS]] <- target %>%
dplyr::mutate(Index = paste0(Domain, row_number()))
fit_S[[iS]] <- e1071::svm(x = LS_p[[iS]][x_pivot],
y = LS_p[[iS]]$Class,
type="C-classification",
kernel="linear",
cost = 1,
scale = FALSE)
hyperplane_params <- dplyr::tibble(
Domain = iS,
Iteration = 0,
w = list(t(fit_S[[iS]]$coefs) %*% fit_S[[iS]]$SV),
b = -fit_S[[iS]]$rho) %>%
dplyr::bind_rows(hyperplane_params)
tmp <- UT[[iS]]
tmp$Prediction <- predict(fit_S[[iS]], UT[[iS]][x_pivot])
tmp <- svm_margin_filter(fit_S[[iS]], tmp, x_pivot, rho)
tmp$Iteration <- 0
# Update labelled and unlabelled data
LT[[iS]] <- tmp
idx <- LT[[iS]]$Index
UT[[iS]] <- dplyr::filter(UT[[iS]], !(Index %in% idx))
}
# TODO: Keep track of iteration number in labelled data. Add Iteration
# variable and bind rows. Maybe also use a dataframe instead of a list with
# Source domain as another column.
for (iS in source_domains) {
loop_index <- 1
while (dim(UT[[iS]])[1] > 0) {
fit_T[[iS]] <- e1071::svm(x = LT[[iS]][c(x_pivot, x_nonpivot)],
y = LT[[iS]]$Prediction,
type="C-classification",
kernel="linear",
cost = 1,
scale = FALSE)
hyperplane_params <- dplyr::tibble(
Domain = iS,
Iteration = loop_index,
w = list(t(fit_T[[iS]]$coefs) %*% fit_T[[iS]]$SV),
b = -fit_T[[iS]]$rho) %>%
dplyr::bind_rows(hyperplane_params)
tmp <- UT[[iS]]
tmp$Prediction <- predict(fit_T[[iS]],
UT[[iS]][c(x_pivot, x_nonpivot)])
tmp <- svm_margin_filter(fit_T[[iS]], tmp, c(x_pivot, x_nonpivot), rho)
tmp$Iteration <- loop_index
# Update labelled and unlabelled data
LT[[iS]] <- dplyr::bind_rows(LT[[iS]], tmp)
idx <- LT[[iS]]$Index
UT[[iS]] <- dplyr::filter(UT[[iS]], !(Index %in% idx))
loop_index <- loop_index + 1
}
}
iter_pivot_obj <- list(finalModel = fit_T,
data = LT,
source_domains = source_domains,
target_domain = as.character(target$Domain[1]),
rho = rho,
hyperplane_params = hyperplane_params)
class(iter_pivot_obj) <- "iter_pivot"
iter_pivot_obj
}
#' Predict method for iterative pivot fit
#'
#' Predicts classes based on `iter_pivot` model object.
#'
#' __TODO__: Need to decide if predictions should be based off:
#'
#' 1. Final fitted model.
#'
#' 2. Average of all fitted models created during training.
#'
#' 3. The predictions that happen during training.
#'
#' Using option 1 for the moment.
#'
#' @param object An object of class "`iter_pivot`".
#' @param newdata A data frame containing feature variables for prediction.
#'
#' @return Returns a vector of predictions for the target domain.
#'
#' @export
#'
#' @author Cameron Roach
predict.iter_pivot <- function(object, newdata) {
predictions <- NULL
for (iS in object$source_domains) {
predictions <- data.frame(
Domain = iS,
newdata,
Index = 1:dim(newdata)[1],
Prediction = predict(object$finalModel[[iS]], newdata)
) %>%
dplyr::bind_rows(predictions)
}
predictions %>%
dplyr::group_by(Index, Domain, Prediction) %>%
dplyr::summarise(n = n()) %>%
dplyr::ungroup() %>%
dplyr::top_n(1, n) %>%
dplyr::arrange(Index) %>%
.$Prediction
}
#' Plot results for iter_pivot object
#'
#' Plots the predicted and actual classes of an "iter_pivot" object.
#'
#' @param object An object of class "`iter_pivot`".
#' @param mapping List of aesthetic mappings to use for plot. Currently supports
#' __x__ and __y__ aesthetics, both of which are required.
#' @param type A string specifying the plot type to be returned. Can select
#' from,
#'
#' * "Prediction" for final fitted values from each source model on target
#' domain.
#'
#' * "Learning" to show how the model has learned at each iteration.
#'
#' @param show_hyperplane Logical indicating if SVMhyperplane should be shown.
#' @param show_margin Logical indicating if SVM margin should be shown.
#'
#' @return Returns a "ggplot" object showing results of iterative training using
#' pivot features.
#' @export
#'
#' @author Cameron Roach
plot.iter_pivot <- function(object, mapping = ggplot2::aes(),
type = "Prediction", show_hyperplane = TRUE,
show_margin = TRUE) {
if (type == "Prediction") {
p <- ggplot2::ggplot()
mapping_all <- c(mapping,
ggplot2::aes(colour = Iteration,
shape = Prediction))
class(mapping_all) <- "uneval"
for (iS in object$source_domains) {
p_data <- dplyr::mutate(object$data[[iS]], Training_domain = iS)
p <- p +
ggplot2::geom_point(data = p_data, mapping_all) +
ggplot2::facet_wrap(~Training_domain) +
#ggplot2::scale_shape_manual(values = c(21, 22)) +
ggplot2::ggtitle("Predicted classes during training",
paste("Iterative training on multiple domains with pivot features."))
}
} else if (type == "Learning") {
p <- ggplot2::ggplot()
mapping_all <- c(mapping,
ggplot2::aes(colour = Prediction))
class(mapping_all) <- "uneval"
for (iS in object$source_domains) {
n_iters <- object$hyperplane_params %>%
dplyr::filter(Domain == iS) %>%
.$Iteration %>%
max()
for (iI in 0:n_iters) {
p_data <- object$data[[iS]] %>%
dplyr::mutate(Training_domain = iS,
Labelled = dplyr::if_else(Iteration < iI, TRUE, FALSE),
Iteration = iI)
ab_data <- object$hyperplane_params %>%
dplyr::filter(Domain == iS, Iteration == iI) %>%
dplyr::mutate(Iteration = iI)
p <- p +
ggplot2::geom_point(data = dplyr::filter(p_data, Labelled == TRUE),
mapping_all) +
ggplot2::geom_point(data = dplyr::filter(p_data, Labelled == FALSE),
mapping_all, colour = "grey", shape = 21, alpha = 0.7)
# Add hyperplane and margin lines
if (dim(ab_data$w[[1]])[2] == 1) {
p <- p +
ggplot2::geom_vline(data = ab_data,
ggplot2::aes(xintercept=-b/w[[1]][1,1]),
colour="blue") +
ggplot2::geom_vline(data = ab_data,
ggplot2::aes(xintercept=(-b-1)/w[[1]][1,1]),
colour="orange") +
ggplot2::geom_vline(data = ab_data,
ggplot2::aes(xintercept=(-b+1)/w[[1]][1,1]),
colour="orange")
} else if (dim(ab_data$w[[1]])[2] == 2) {
x_valid <- make.names(deparse(mapping$x))
y_valid <- make.names(deparse(mapping$y))
p <- p +
ggplot2::geom_abline(data = ab_data,
ggplot2::aes(intercept=-b/w[[1]][1, y_valid],
slope=-w[[1]][1, x_valid]/w[[1]][1, y_valid]),
colour="blue") +
ggplot2::geom_abline(data = ab_data,
ggplot2::aes(intercept=(-b-1)/w[[1]][1, y_valid],
slope=-w[[1]][1, x_valid]/w[[1]][1,y_valid]),
colour="orange") +
ggplot2::geom_abline(data = ab_data,
ggplot2::aes(intercept=(-b+1)/w[[1]][1, y_valid],
slope=-w[[1]][1, x_valid]/w[[1]][1, y_valid]),
colour="orange")
} else if (dim(ab_data$w[[1]])[2] >2) {
stop("Dimension of w greater than 2")
}
}
}
p <- p +
facet_wrap(~Iteration) +
#facet_grid(iI ~ Training_domain) +
ggplot2::scale_shape_manual(values = c(21, 22)) +
ggplot2::labs(title = "SVM classifications using iterative learning.")
}
p
}
#' Check if iter_pivot object
#'
#' Function to check if an object has class "iter_pivot".
#'
#' @param x Any `R` object.
#'
#' @return `is.iter_pivot` returns `TRUE` if its argument is an object with
#' class "iter_pivot" and `FALSE` otherwise.
#' @export
#'
#' @author Cameron Roach
is.iter_pivot <- function(x) {
inherits(x, "iter_pivot")
}
#' Filter points using SVM hyperplane
#'
#' Filters points using SVM hyperplane and margin. Four regions are considered
#' from which `rho` points are returned.
#'
#' The four regions of the sample space
#' are,
#'
#' 1. Points above the hyperplane and within the margin. Points closest to the
#' margin boundary are returned.
#' 2. Points below the hyperplane and within the margin. Points closest to the
#' margin boundary are returned.
#' 3. Points above the hyperplane. Points furthest from the hyperplane are
#' returned.
#' 4. Points below the hyperplane. Points furthest from the hyperplane are
#' returned.
#'
#' If there are less than `rho` points remaining in any of these spaces only the
#' remaining points will be returned.
#'
#' @param object Object of class "`svm`".
#' @param data Data frame to be filtered.
#' @param x Feature columns.
#' @param rho Number of points to be returned from each
#'
#' @return A data frame of filtered values that can be added to the labelled
#' data set.
#'
#' @author Cameron Roach
svm_margin_filter <- function(object, data, x, rho) {
# get parameters of hyperplane
w <- t(object$coefs) %*% object$SV
b <- -object$rho
# Point distances
hyperplane_distance <- (w %*% t(data[, x]) + b )/norm(w, "2")
data$Distance <- as.vector(hyperplane_distance)
if (dim(w)[2] == 1) {
margin_distance <- ((w %*% (-b+1)/w[1,1] + b )/norm(w, "2"))[1]
} else if (dim(w)[2] == 2) {
margin_distance <- ((w %*% c(0, (-b+1)/w[1,2]) + b )/norm(w, "2"))[1]
} else {
stop("Dimension of w greater than 2")
}
data <- data %>%
dplyr::mutate(InMargin =
dplyr::if_else(abs(data$Distance) <= margin_distance,
TRUE, FALSE),
PointHyperplanePos = dplyr::if_else(Distance > 0, "Above",
as.character(NA)),
PointHyperplanePos = dplyr::if_else(Distance <0, 'Below',
PointHyperplanePos))
# Get rho points furthest from hyperplane but within margin
new_training_points <- data %>%
dplyr::filter(InMargin == TRUE) %>%
dplyr::group_by(PointHyperplanePos) %>%
dplyr::top_n(rho, wt = abs(Distance))
new_training_points <- data %>%
dplyr::group_by(PointHyperplanePos) %>%
dplyr::top_n(rho, wt = abs(Distance)) %>%
dplyr::bind_rows(new_training_points) %>%
dplyr::ungroup()
new_training_points
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.