R/iter_pivot.R

#' Iterative algorithm using pivot features
#'
#' An iterative semi-supervised learning approach using pivot features. Uses a
#' random forest classifier to train and predict probabilities.
#'
#' @param source A data frame containing observations and features for the
#'   source domains. Must contain a column named "Domain" which specifies the
#'   domain each observation belongs to and "Class" which gives the response.
#' @param target A data frame containing unlabelled observations and features
#'   for the target domain.
#' @param x_pivot A vector of strings. Gives the column names of features with
#'   common distribution between source and target domains.
#' @param x_nonpivot A vector of strings. Gives the column names of features
#'   that have a shifted distribution between the target and source domains.
#' @param rho Number of observations to be added to labelled target training set
#'   (actual number added is 4*rho).
#'
#' @return `iter_pivot` returns an object of class "iter_pivot".
#'
#'   The function `plot` produces a plot showing predictions for the most recent
#'   iteration in the two feature spaces.
#'
#'   An object of class "iter_pivot" is a list containing the following
#'   components:
#'
#'   * `final_pred` A data frame containing final predictions.
#'
#'   * `all_preds` A data frame containing predictions for each iteration and
#'   model.
#'
#' @examples
#' data(sim_pivot_data)
#' source <- sim_pivot_data[sim_pivot_data$Domain == "Source",]
#' target <- sim_pivot_data[sim_pivot_data$Domain == "Target",]
#' model <- iter_pivot(source = source, target = target, x_pivot = "Feature1",
#'                     x_nonpivot = "Feature2", rho = 3)
#' target$Prediction <- predict(model, newdata = target[c("Feature1", "Feature2")])
#' require(ggplot2)
#' plot(model,  aes(x = Feature1, y = Feature2))
#' plot(model,  aes(x = Feature1, y = Feature2), type = "Learning")
#'
#' @author Cameron Roach
#'
#' @importFrom magrittr "%>%"
#' @export
iter_pivot <- function(source, target, x_pivot, x_nonpivot, rho = 3) {

  source_domains <- as.character(unique(source$Domain))

  if (!("Domain" %in% names(target))) target$Domain <- "Target"

  # Do initial model fit for each source domain using pivot features only
  fit_S <- list()
  fit_T <- list()
  LS_p <- list()
  UT <- list()
  LT <- list()
  LT_final <- list()
  hyperplane_params <- dplyr::tibble()

  for (iS in source_domains) {
    # Initialise
    LS_p[[iS]] <- source %>%
      dplyr::filter(Domain == iS) %>%
      #dplyr::mutate(Index = paste0(iS, row_number())) %>%
      dplyr::select(dplyr::one_of(c("Class", x_pivot)))
    UT[[iS]] <- target %>%
      dplyr::mutate(Index = paste0(Domain, row_number()))

    fit_S[[iS]] <- e1071::svm(x = LS_p[[iS]][x_pivot],
                              y = LS_p[[iS]]$Class,
                              type="C-classification",
                              kernel="linear",
                              cost = 1,
                              scale = FALSE)
    hyperplane_params <- dplyr::tibble(
      Domain = iS,
      Iteration = 0,
      w = list(t(fit_S[[iS]]$coefs) %*% fit_S[[iS]]$SV),
      b = -fit_S[[iS]]$rho) %>%
      dplyr::bind_rows(hyperplane_params)

    tmp <- UT[[iS]]
    tmp$Prediction <- predict(fit_S[[iS]], UT[[iS]][x_pivot])
    tmp <- svm_margin_filter(fit_S[[iS]], tmp, x_pivot, rho)
    tmp$Iteration <- 0

    # Update labelled and unlabelled data
    LT[[iS]] <- tmp
    idx <- LT[[iS]]$Index
    UT[[iS]] <- dplyr::filter(UT[[iS]], !(Index %in% idx))
  }


  # TODO: Keep track of iteration number in labelled data. Add Iteration
  # variable and bind rows. Maybe also use a dataframe instead of a list with
  # Source domain as another column.
  for (iS in source_domains) {
    loop_index <- 1
    while (dim(UT[[iS]])[1] > 0) {
      fit_T[[iS]] <- e1071::svm(x = LT[[iS]][c(x_pivot, x_nonpivot)],
                                y = LT[[iS]]$Prediction,
                                type="C-classification",
                                kernel="linear",
                                cost = 1,
                                scale = FALSE)
      hyperplane_params <- dplyr::tibble(
        Domain = iS,
        Iteration = loop_index,
        w = list(t(fit_T[[iS]]$coefs) %*% fit_T[[iS]]$SV),
        b = -fit_T[[iS]]$rho) %>%
        dplyr::bind_rows(hyperplane_params)

      tmp <- UT[[iS]]
      tmp$Prediction <- predict(fit_T[[iS]],
                                UT[[iS]][c(x_pivot, x_nonpivot)])
      tmp <- svm_margin_filter(fit_T[[iS]], tmp, c(x_pivot, x_nonpivot), rho)
      tmp$Iteration <- loop_index

      # Update labelled and unlabelled data
      LT[[iS]] <- dplyr::bind_rows(LT[[iS]], tmp)
      idx <- LT[[iS]]$Index
      UT[[iS]] <- dplyr::filter(UT[[iS]], !(Index %in% idx))
      loop_index <- loop_index + 1
    }
  }

  iter_pivot_obj <- list(finalModel = fit_T,
                         data = LT,
                         source_domains = source_domains,
                         target_domain = as.character(target$Domain[1]),
                         rho = rho,
                         hyperplane_params = hyperplane_params)
  class(iter_pivot_obj) <- "iter_pivot"
  iter_pivot_obj
}

#' Predict method for iterative pivot fit
#'
#' Predicts classes based on `iter_pivot` model object.
#'
#' __TODO__: Need to decide if predictions should be based off:
#'
#' 1. Final fitted model.
#'
#' 2. Average of all fitted models created during training.
#'
#' 3. The predictions that happen during training.
#'
#' Using option 1 for the moment.
#'
#' @param object An object of class "`iter_pivot`".
#' @param newdata A data frame containing feature variables for prediction.
#'
#' @return Returns a vector of predictions for the target domain.
#'
#' @export
#'
#' @author Cameron Roach
predict.iter_pivot <- function(object, newdata) {
  predictions <- NULL
  for (iS in object$source_domains) {
    predictions <- data.frame(
      Domain = iS,
      newdata,
      Index = 1:dim(newdata)[1],
      Prediction = predict(object$finalModel[[iS]], newdata)
    ) %>%
      dplyr::bind_rows(predictions)
  }

  predictions %>%
    dplyr::group_by(Index, Domain, Prediction) %>%
    dplyr::summarise(n = n()) %>%
    dplyr::ungroup() %>%
    dplyr::top_n(1, n) %>%
    dplyr::arrange(Index) %>%
    .$Prediction
}

#' Plot results for iter_pivot object
#'
#' Plots the predicted and actual classes of an "iter_pivot" object.
#'
#' @param object An object of class "`iter_pivot`".
#' @param mapping List of aesthetic mappings to use for plot. Currently supports
#'   __x__ and __y__ aesthetics, both of which are required.
#' @param type A string specifying the plot type to be returned. Can select
#'   from,
#'
#'   * "Prediction" for final fitted values from each source model on target
#'   domain.
#'
#'   * "Learning" to show how the model has learned at each iteration.
#'
#' @param show_hyperplane Logical indicating if SVMhyperplane should be shown.
#' @param show_margin Logical indicating if SVM margin should be shown.
#'
#' @return Returns a "ggplot" object showing results of iterative training using
#'   pivot features.
#' @export
#'
#' @author Cameron Roach
plot.iter_pivot <- function(object, mapping = ggplot2::aes(),
                            type = "Prediction", show_hyperplane = TRUE,
                            show_margin = TRUE) {
  if (type == "Prediction") {
    p <- ggplot2::ggplot()
    mapping_all <- c(mapping,
                     ggplot2::aes(colour = Iteration,
                                  shape = Prediction))
    class(mapping_all) <- "uneval"
    for (iS in object$source_domains) {
      p_data <- dplyr::mutate(object$data[[iS]], Training_domain = iS)
      p <- p +
        ggplot2::geom_point(data = p_data, mapping_all) +
        ggplot2::facet_wrap(~Training_domain) +
        #ggplot2::scale_shape_manual(values = c(21, 22)) +
        ggplot2::ggtitle("Predicted classes during training",
                         paste("Iterative training on multiple domains with pivot features."))
    }
  } else if (type == "Learning") {
    p <- ggplot2::ggplot()
    mapping_all <- c(mapping,
                     ggplot2::aes(colour = Prediction))
    class(mapping_all) <- "uneval"

    for (iS in object$source_domains) {
      n_iters <- object$hyperplane_params %>%
        dplyr::filter(Domain == iS) %>%
        .$Iteration %>%
        max()

      for (iI in 0:n_iters) {
        p_data <- object$data[[iS]] %>%
          dplyr::mutate(Training_domain = iS,
                        Labelled = dplyr::if_else(Iteration < iI, TRUE, FALSE),
                        Iteration = iI)
        ab_data <- object$hyperplane_params %>%
          dplyr::filter(Domain == iS, Iteration == iI) %>%
          dplyr::mutate(Iteration = iI)

        p <- p +
          ggplot2::geom_point(data = dplyr::filter(p_data, Labelled == TRUE),
                              mapping_all) +
          ggplot2::geom_point(data = dplyr::filter(p_data, Labelled == FALSE),
                              mapping_all, colour = "grey", shape = 21, alpha = 0.7)

        # Add hyperplane and margin lines
        if (dim(ab_data$w[[1]])[2] == 1) {
          p <- p +
            ggplot2::geom_vline(data = ab_data,
                                ggplot2::aes(xintercept=-b/w[[1]][1,1]),
                                colour="blue") +
            ggplot2::geom_vline(data = ab_data,
                                ggplot2::aes(xintercept=(-b-1)/w[[1]][1,1]),
                                colour="orange") +
            ggplot2::geom_vline(data = ab_data,
                                ggplot2::aes(xintercept=(-b+1)/w[[1]][1,1]),
                                colour="orange")
        } else if (dim(ab_data$w[[1]])[2] == 2) {
          x_valid <- make.names(deparse(mapping$x))
          y_valid <- make.names(deparse(mapping$y))
          p <- p +
            ggplot2::geom_abline(data = ab_data,
                                 ggplot2::aes(intercept=-b/w[[1]][1, y_valid],
                                              slope=-w[[1]][1, x_valid]/w[[1]][1, y_valid]),
                                 colour="blue") +
            ggplot2::geom_abline(data = ab_data,
                                 ggplot2::aes(intercept=(-b-1)/w[[1]][1, y_valid],
                                              slope=-w[[1]][1, x_valid]/w[[1]][1,y_valid]),
                                 colour="orange") +
            ggplot2::geom_abline(data = ab_data,
                                 ggplot2::aes(intercept=(-b+1)/w[[1]][1, y_valid],
                                              slope=-w[[1]][1, x_valid]/w[[1]][1, y_valid]),
                                 colour="orange")
        } else if (dim(ab_data$w[[1]])[2] >2) {
          stop("Dimension of w greater than 2")
        }
      }
    }

    p <- p +
      facet_wrap(~Iteration) +
      #facet_grid(iI ~ Training_domain) +
      ggplot2::scale_shape_manual(values = c(21, 22)) +
      ggplot2::labs(title = "SVM classifications using iterative learning.")
  }
  p
}

#' Check if iter_pivot object
#'
#' Function to check if an object has class "iter_pivot".
#'
#' @param x Any `R` object.
#'
#' @return `is.iter_pivot` returns `TRUE` if its argument is an object with
#'   class "iter_pivot" and `FALSE` otherwise.
#' @export
#'
#' @author Cameron Roach
is.iter_pivot <- function(x) {
  inherits(x, "iter_pivot")
}

#' Filter points using SVM hyperplane
#'
#' Filters points using SVM hyperplane and margin. Four regions are considered
#' from which `rho` points are returned.
#'
#' The four regions of the sample space
#' are,
#'
#' 1. Points above the hyperplane and within the margin. Points closest to the
#' margin boundary are returned.
#' 2. Points below the hyperplane and within the margin. Points closest to the
#' margin boundary are returned.
#' 3. Points above the hyperplane. Points furthest from the hyperplane are
#' returned.
#' 4. Points below the hyperplane. Points furthest from the hyperplane are
#' returned.
#'
#' If there are less than `rho` points remaining in any of these spaces only the
#' remaining points will be returned.
#'
#' @param object Object of class "`svm`".
#' @param data Data frame to be filtered.
#' @param x Feature columns.
#' @param rho Number of points to be returned from each
#'
#' @return A data frame of filtered values that can be added to the labelled
#'   data set.
#'
#' @author Cameron Roach
svm_margin_filter <- function(object, data, x, rho) {
  # get parameters of hyperplane
  w <- t(object$coefs) %*% object$SV
  b <- -object$rho

  # Point distances
  hyperplane_distance <- (w %*% t(data[, x]) + b )/norm(w, "2")
  data$Distance <- as.vector(hyperplane_distance)

  if (dim(w)[2] == 1) {
    margin_distance <- ((w %*% (-b+1)/w[1,1] + b )/norm(w, "2"))[1]
  } else if (dim(w)[2] == 2) {
    margin_distance <- ((w %*% c(0, (-b+1)/w[1,2]) + b )/norm(w, "2"))[1]
  } else {
    stop("Dimension of w greater than 2")
  }

  data <- data %>%
    dplyr::mutate(InMargin =
                    dplyr::if_else(abs(data$Distance) <= margin_distance,
                                   TRUE, FALSE),
                  PointHyperplanePos = dplyr::if_else(Distance > 0, "Above",
                                                      as.character(NA)),
                  PointHyperplanePos = dplyr::if_else(Distance <0, 'Below',
                                                      PointHyperplanePos))

  # Get rho points furthest from hyperplane but within margin
  new_training_points <- data %>%
    dplyr::filter(InMargin == TRUE) %>%
    dplyr::group_by(PointHyperplanePos) %>%
    dplyr::top_n(rho, wt = abs(Distance))

  new_training_points <- data %>%
    dplyr::group_by(PointHyperplanePos) %>%
    dplyr::top_n(rho, wt = abs(Distance)) %>%
    dplyr::bind_rows(new_training_points) %>%
    dplyr::ungroup()
  new_training_points
}
camroach87/semisupervisr documentation built on May 13, 2019, 11:04 a.m.