R/IterativeQuantileNearestNeighbors.R

Defines functions iqnn iqnn_predict iqnn_cv_predict iqnn_tune

Documented in iqnn iqnn_cv_predict iqnn_predict iqnn_tune

#--------------------------------------
#' Iterative Quantile Binned Nearest Neighbors
#'
#' @description Function for creating iterative quantile nearest neighbors model. Bin the training data, then store the binning definitions and bin statistics to be used to estimate for future testing data.
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins vector of number of bins per step of iterative binning, ordered first to last
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#' 
#' @return list containing binned training data, binning definition, and bin statistics
#' @family iterative quantile nearest-neighbors functions
#' @export
#' 
#' @examples 
#' iqnn_mod <- iqnn(data=iris, y="Species", mod_type="class",
#'                  bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#'                  nbins=c(3,5,2), jit=rep(0.001,3), tol = rep(0.001,3))
#' str(iqnn_mod)

iqnn <- function(data, y, mod_type="reg", bin_cols, nbins, jit = rep(0,length(bin_cols)), stretch=FALSE, tol = rep(0,length(bin_cols)) ){
  data <- as.data.frame(data)
  iq_bin <- iqbin(data=data, bin_cols=bin_cols, nbins=nbins, jit=jit,output="both")
  if(stretch) iq_bin <- iqbin_stretch(iq_bin, tol=tol)
  iq_bin$bin_def$y <- y
  iq_bin$bin_def$mod_type <- mod_type
  total_bins <- nrow(iq_bin$bin_def$bin_bounds)
  if(mod_type=="reg"){
    iq_bin$bin_def$bin_stats <- data.frame(pred = sapply(1:total_bins, function(b) mean(data[iq_bin$bin_data$data$bin_index==b,y], na.rm=TRUE)),
                                           obs = sapply(1:total_bins, function(b) sum(iq_bin$bin_data$data$bin_index==b)) )
  }else if(mod_type=="class"){
    iq_bin$bin_def$bin_stats <- data.frame(pred = sapply(1:total_bins, function(b) majority_vote(data[iq_bin$bin_data$data$bin_index==b,y])),
                                           obs = sapply(1:total_bins, function(b) sum(iq_bin$bin_data$data$bin_index==b)) )
  }else{return(print("mod_type must be either 'reg' or 'class'"))}
  attributes(iq_bin$bin_def)$iq_obj_type <- "iqnn"
  return(iq_bin$bin_def)
}
  
  

#--------------------------------------
#' Predict for test data using iqnn model
#'
#' @description Predict the response value for test data using iqnn model defined using training data from the \code{\link{iqnn}} function
#'
#' @param iqnn_mod iterative quantile nearest neighbors model generated by the \code{\link{iqnn}} function
#' @param test_data Data frame of test data to estimate response values for
#' @param type output "estimate", "binsize", or "both"
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#' 
#' @return predicted responses, number of neighbors or both
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples 
#' # Test Regression
#' test_index <- c(1,2,51,52,101,102)
#' iqnn_mod <- iqnn(data=iris[-test_index,], y="Petal.Length",
#'                  bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"), 
#'                  nbins=c(3,5,2), jit=rep(0.001,3), stretch=TRUE, tol=rep(.001,3))
#' test_data <- iris[test_index,]
#' iqnn_predict(iqnn_mod, test_data,strict=FALSE)
#' iqnn_predict(iqnn_mod, test_data,strict=TRUE)
#' iqnn_predict(iqnn_mod, test_data,type="both")
#'
#' # Test Classifier
#' iqnn_mod <- iqnn(data=iris[-test_index,], y="Species", mod_type="class", 
#'                  bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"), 
#'                  nbins=c(3,5,2), jit=rep(0.001,3))
#' test_data <- iris[test_index,]
#' iqnn_predict(iqnn_mod, test_data,strict=TRUE)
#' iqnn_predict(iqnn_mod, test_data,type="both",strict=FALSE)

iqnn_predict <- function(iqnn_mod,test_data, type="estimate",strict=FALSE){
  test_data <- as.data.frame(test_data)
  bin_index <- iqbin_assign(iqnn_mod, test_data, output="bin_index",strict=strict)
  if(type=="estimate") return(iqnn_mod$bin_stats$pred[bin_index])
  if(type=="binsize") return(iqnn_mod$bin_stats$obs[bin_index])
  if(type=="both") return(iqnn_mod$bin_stats[bin_index,])
}
  
  

#--------------------------------------
#' Cross Validated predictions for iqnn models
#'
#' @description Cross-validate an iqnn specification using k-fold scheme on given data
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins vector of number of bins per step of iterative binning, ordered first to last
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#' @param cv_k integer specifying number of folds
#' 
#' @return cross validated predicted responses for all observations in data
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples 
#' cv_preds <- iqnn_cv_predict(data=iris, y="Species",mod_type="class", 
#'                             bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#'                             nbins=c(3,5,2), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' table(cv_preds, iris$Species)
#' 
#' cv_preds <- iqnn_cv_predict(data=iris, y="Petal.Length",mod_type="reg",
#'                             bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#'                             nbins=c(3,5,2), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' table(cv_preds, iris$Species)

iqnn_cv_predict <- function(data, y, mod_type="reg", bin_cols, nbins, jit=rep(0,length(bin_cols)), stretch=FALSE, tol=rep(0,length(bin_cols)), strict=FALSE, cv_k=10){
  data <- as.data.frame(data)
  cv_cohorts <- make_cv_cohorts(data, cv_k)
  if(mod_type=="class") cv_preds <- factor(rep("NA", nrow(data)),levels(data[, y]))
  if(mod_type=="reg") cv_preds <- rep(NA, nrow(data))
  for(fold in 1:length(unique(cv_cohorts))){
    test_index <- which(cv_cohorts==fold)
    train_data_temp <- data[-test_index,]
    row.names(train_data_temp) <- 1:nrow(train_data_temp)
    iqnn_mod <- iqnn(train_data_temp, y=y, mod_type=mod_type, bin_cols=bin_cols, 
                     nbins=nbins, jit=jit,stretch=stretch, tol=tol)
    cv_preds[test_index] <- iqnn_predict(iqnn_mod, data[test_index,],strict=strict, type="estimate")
  }
  # if(mod_type=="class") cv_preds <- factor(cv_preds, labels=levels(data[,y]))
  cv_preds
}

#--------------------------------------
#' Tuning function for iqnn model
#'
#' @description Identify optimal number of bins per dimension for iterative quantile nearest-neighbor model using k-fold cross validation. 
#' Search through possible numbers of bins per dimension using a iterative rules defined in the \code{\link{make_nbins_list}} function.
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins_range positive integer vector containing lower and upper bounds on number of bins in each dimension
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#' @param cv_k integer specifying number of folds
#' @param oom_search TRUE/FALSE: Only consider nbins that change number of neighbors by order of magnitudes
#' @param oom_base order of magnitude base value (default to powers of 2)
#' 
#' @return data frame with one row per binning specification with desriptive and performance statistics: bin dimensitions, number of bins, equivalent k-nearest neightbor size, performance (mean squared error or class error rate)
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples 
#' # 10-fold CV
#' cv_tune1 <- iqnn_tune(data=iris, y="Petal.Length", mod_type="reg",  
#'                       bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#'                       nbins_range=c(2,5), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' cv_tune1
#' # LOO CV
#' cv_tune2 <- iqnn_tune(data=iris, y="Petal.Length", mod_type="reg",  
#'                       bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#'                       nbins_range=c(2,5), jit=rep(0.001,3), strict=FALSE, cv_k=nrow(iris))
#' cv_tune2

iqnn_tune <- function(data, y, mod_type="reg", bin_cols, nbins_range, jit=rep(0,length(bin_cols)), 
                      stretch=FALSE, tol=rep(0,length(bin_cols)), strict=FALSE, cv_k=10, oom_search=FALSE, oom_base=2){
 
  nbins_list <- make_nbins_list(nbins_range,length(bin_cols))
  cv_results <- data.frame(bin_dims = sapply(nbins_list, function(x) paste(x,collapse="X")),    
                           nn_equiv = sapply(nbins_list, function(x) (cv_k-1)/cv_k*nrow(data)/prod(x)),
                           nbins_total=NA)
  cv_results$nbins <- nbins_list
  fold_n <- floor(nrow(data)*((cv_k-1)/cv_k))

  if(oom_search){
    unique_nn_size <- as.integer(oom_base^(1:floor(log(fold_n, base=oom_base))))
  } else {
    unique_nn_size <- as.integer(unique(round(cv_results$nn_equiv)))
  }
  keeper_idx <- sort(unique(sapply(1:length(unique_nn_size), function(i) which.min(abs(cv_results$nn_equiv-unique_nn_size[i])))))
  
  for(t in keeper_idx){
    cv_preds <- iqnn_cv_predict(data=data, y=y, mod_type=mod_type, bin_cols=bin_cols, nbins=nbins_list[[t]], jit=jit, 
                                stretch=stretch,tol=tol, strict=FALSE, cv_k=cv_k)
    if(mod_type=="reg") cv_results$MSE[t] <- mean((data[,y]-cv_preds)^2)
    if(mod_type=="class") cv_results$error[t] <- sum(cv_preds!=data[,y]) / nrow(data)
    cv_results$nbins_total[t] <- prod(nbins_list[[t]])
  }
  cv_results <- stats::na.omit(cv_results)
  if(mod_type=="reg") cv_results$RMSE <- sqrt(cv_results$MSE)
  return(cv_results)
}
kmaurer/iqbin documentation built on Jan. 1, 2020, 6:48 p.m.