R/KNN.R

#' Supervised Machine Learning K Nearest Neighbours
#' @description This is function is based on a popular K-Nearest Neighbour Algorith, and fits a k nearest neighbour model using fast search algorithms. KNN falls in the category of supervised machine learning
#'              algorithm which can be used for classification  and regression tasks.
#' @format \code{\link{R6Class}} object.
#' @section Usage:
#' For  details of the Usage refer to the  \bold{Method,Examples & Arguments} sections of the Documentation.
#' \preformatted{
#' best_model = KNN$new(k=1, proba=FALSE, algorithm=NULL, type="class")
#' best_model$fit(X_train, X_test, "target")
#' best_model$predict(type)
#' }
#' @section Methods:
#' \describe{
#'     \item{\code{$new()}}{Initialisization: This will initialize a new instance of the trainer}
#'     \item{\code{$fit()}}{Fitting Model :   By calling this method it trains the knn model and stores the  predictions}
#'     \item{\code{$predict()}}{Predictions : It returns the predictions previously stored when calling the fit Method}
#' }
#' @section Arguments:
#' \describe{
#'     \item{k}{The Number of Nearest Neighbours to be considered while making a prediction}
#'     \item{proba}{It specified if the probability should be computed or not by default the labelled value will be returned, default=FALSE}
#'     \item{algorithm}{It specifies the algorithm which is used to fit the model, Options are  'kd_tree','cover_tree','brute'}
#'     \item{type}{It specifies the type of problem to be solved , for example a  classification or regression problem , values to be specified are 'regr' or 'class'}
#' }
#' @export
#' @examples
#' data("iris")
#'
#' iris$Species <- as.integer(as.factor(iris$Species))
#'
#' x_train <- iris[1:110,]
#' x_test <- iris[111:150,]
#'
#' best_model <- KNN$new(k=3, proba=TRUE, type="class")
#' best_model$fit(x_train, x_test, 'Species')
#' preds <- best_model$predict(type="raw")

KNN <- R6Class("KNN", public = list(

  #' @field k the number of neighbours to consider while predicting ::: 1 by default but must be specified
  k = 1,
  #' @field proba must be set to True if probability needs to  be computed ::: by default the value is set to FALSE
  proba = FALSE,
  #' @field algorithm used to train the model, the  options  are 'kd_tree','cover_tree','brute' ::: by default no algorithm is specified
  algorithm = NULL,
  #' @field type  of problem to to be solved , the  regression or classification type must be specified, options include 'regr' and 'class' ::: by default the value is set to class
  type = "class",
  #' @field model
  model = NA,

  #' @details
  #' Create an instance of the new `KNN` object from the class.
  #'
  #' @param k k specidies the number of neighbours to be considering while making a prediction.
  #' @param proba proba specifies if probability ranging between 0-1 shall be computed,by default value is set to FALSE.
  #' @param algorithm algorithm specifies the algorithm or method used to fit the model, options include 'kd_tree','cover_tree', and 'brute'.
  #' @param type specifies the type of problem under study that needs to be solved for example a  regression or  a classification problem, options are  'regr' or 'class'.
  #' @return A `KNN` object that can be used to call fit and predict methods.
  #'
  #' @examples
  #' data("iris")
  #'
  #' iris$Species <- as.integer(as.factor(iris$Species))
  #'
  #' x_train <- iris[1:110,]
  #' x_test <- iris[111:150,]
  #'
  #' best_model <- KNN$new(k=3, proba=TRUE, type="class")
  #' best_model$fit(x_train, x_test, 'Species')
  #' preds <- best_model$predict(type="raw")

  initialize = function(k, proba, algorithm, type){
    if(!(missing(k))) self$k <- k
    if(!(missing(proba))) self$proba <- proba
    if(!(missing(algorithm))) self$algorithm <- algorithm
    if(!(missing(type))) self$type <- type
    superml::check_package("FNN")
  },

  #' @details
  #' Trains the the K Nearest Neighbour KNN model
  #'
  #' @param train_data must be R data frame  or matrix
  #' @param test_data must be a R data frame or matrix
  #' @param y Can be a character, name or value  of target variable
  #' @return NULL
  #'
  #' @examples
  #' data("iris")
  #'
  #' iris$Species <- as.integer(as.factor(iris$Species))
  #'
  #' x_train <- iris[1:110,]
  #' x_test <- iris[111:150,]
  #'
  #' best_model <- KNN$new(k=3, proba=TRUE, type="class")
  #' best_model$fit(x_train, x_test, 'Species')

  fit = function(train_data, test_data, y){

    data <- private$prepare_data(train_data, test_data, y)

    if(self$type == "class"){
      self$model <- FNN::knn(train = data$train_data
                             ,test = data$test_data
                             ,cl = data$y
                             ,k = self$k
                             ,prob = self$proba
                             ,algorithm = self$algorithm)
    } else if (self$type == "regr"){
      self$model <- FNN::knn.reg(train = data$train_data
                                 ,test = data$test_data
                                 ,y = data$y
                                 ,k = self$k
                                 ,algorithm = self$algorithm)
    }
  },

  #' @details
  #' Makes a Prediction based on the  Nearest Neigbours for  data
  #'
  #' @param type A Character, 'raw' for the exact labels else 'proba'
  #' @return A list of predictions based on the  neighbours
  #'
  #' @examples
  #' data("iris")
  #'
  #' iris$Species <- as.integer(as.factor(iris$Species))
  #'
  #' x_train <- iris[1:110,]
  #' x_test <- iris[111:150,]
  #'
  #' best_model <- KNN$new(k=3, proba=TRUE, type="class")
  #' best_model$fit(x_train, x_test, 'Species')
  #' preds <- best_model$predict(type="raw")

  predict = function(type="raw"){

    if (self$type == "class") {
      if (type == "raw") {
        return(as.numeric(as.character(self$model)))
      } else if (type == "proba") {
        return(attr(self$model, "proba"))
      }
    } else if (self$type == "regr") {
      return(self$model$pred)
    }

  }),

  private = list(

    prepare_data = function(train_data, test_data, y){

      train_data <- as.data.table(train_data)
      test_data <- as.data.table(test_data)

      if (!(y %in% names(train_data)))
        stop(sprintf("%s Not Available in the Given Training Data", y))

      # get dependent variable and store temporarily
      y_temp <- train_data[[y]]

      # select all independent features
      train_data <- train_data[,setdiff(names(train_data), y), with = F]

      # subset from test, just in case if the dependet variable is in test
      test_data <- test_data[, setdiff(names(test_data), y), with = F]

      # set dependent variable to y
      y <- y_temp

      if (ncol(test_data) != ncol(train_data))
        stop(sprintf('Train and Test data must not have
                             unequal number of Independent Variables.'))

      if (any(vapply(train_data, is.factor, logical(1)))
          | any(vapply(train_data, is.character, logical(1))))
        stop("Train Data must not Have Non-Numeric Variables.
                     Please Preprocess the data to convert all Variables  into integer ")

      if (any(vapply(test_data, is.factor, logical(1)))
          | any(vapply(test_data, is.character, logical(1))))
        stop("Test Data must not Have Non-Numeric Variables.
                     Please Preprocess the data to convert all Variables  into integer")

      # checking if target variable contains float values or NA values
      if (any(is.na(y)))
        stop("The Target Variable to be Predicted  Contains NA Values.")

      if (self$type=="class") {
        if (is.numeric(y)){
          if (!(all(y == floor(y))))
            stop("The Target Variable to be Predicted Contains Float values")
        }
      }

      return(list(train_data = train_data, test_data = test_data, y = y))
    }

  )
)
MalikShahidSultan/machinelearning documentation built on May 9, 2022, 8:32 p.m.