#--------------------------------------
#' Iterative Quantile Binned Nearest Neighbors
#'
#' @description Function for creating iterative quantile nearest neighbors model. Bin the training data, then store the binning definitions and bin statistics to be used to estimate for future testing data.
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins vector of number of bins per step of iterative binning, ordered first to last
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#'
#' @return list containing binned training data, binning definition, and bin statistics
#' @family iterative quantile nearest-neighbors functions
#' @export
#'
#' @examples
#' iqnn_mod <- iqnn(data=iris, y="Species", mod_type="class",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins=c(3,5,2), jit=rep(0.001,3), tol = rep(0.001,3))
#' str(iqnn_mod)
iqnn <- function(data, y, mod_type="reg", bin_cols, nbins, jit = rep(0,length(bin_cols)), stretch=FALSE, tol = rep(0,length(bin_cols)) ){
data <- as.data.frame(data)
iq_bin <- iqbin(data=data, bin_cols=bin_cols, nbins=nbins, jit=jit,output="both")
if(stretch) iq_bin <- iqbin_stretch(iq_bin, tol=tol)
iq_bin$bin_def$y <- y
iq_bin$bin_def$mod_type <- mod_type
total_bins <- nrow(iq_bin$bin_def$bin_bounds)
if(mod_type=="reg"){
iq_bin$bin_def$bin_stats <- data.frame(pred = sapply(1:total_bins, function(b) mean(data[iq_bin$bin_data$data$bin_index==b,y], na.rm=TRUE)),
obs = sapply(1:total_bins, function(b) sum(iq_bin$bin_data$data$bin_index==b)) )
}else if(mod_type=="class"){
iq_bin$bin_def$bin_stats <- data.frame(pred = sapply(1:total_bins, function(b) majority_vote(data[iq_bin$bin_data$data$bin_index==b,y])),
obs = sapply(1:total_bins, function(b) sum(iq_bin$bin_data$data$bin_index==b)) )
}else{return(print("mod_type must be either 'reg' or 'class'"))}
attributes(iq_bin$bin_def)$iq_obj_type <- "iqnn"
return(iq_bin$bin_def)
}
#--------------------------------------
#' Predict for test data using iqnn model
#'
#' @description Predict the response value for test data using iqnn model defined using training data from the \code{\link{iqnn}} function
#'
#' @param iqnn_mod iterative quantile nearest neighbors model generated by the \code{\link{iqnn}} function
#' @param test_data Data frame of test data to estimate response values for
#' @param type output "estimate", "binsize", or "both"
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#'
#' @return predicted responses, number of neighbors or both
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples
#' # Test Regression
#' test_index <- c(1,2,51,52,101,102)
#' iqnn_mod <- iqnn(data=iris[-test_index,], y="Petal.Length",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins=c(3,5,2), jit=rep(0.001,3), stretch=TRUE, tol=rep(.001,3))
#' test_data <- iris[test_index,]
#' iqnn_predict(iqnn_mod, test_data,strict=FALSE)
#' iqnn_predict(iqnn_mod, test_data,strict=TRUE)
#' iqnn_predict(iqnn_mod, test_data,type="both")
#'
#' # Test Classifier
#' iqnn_mod <- iqnn(data=iris[-test_index,], y="Species", mod_type="class",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins=c(3,5,2), jit=rep(0.001,3))
#' test_data <- iris[test_index,]
#' iqnn_predict(iqnn_mod, test_data,strict=TRUE)
#' iqnn_predict(iqnn_mod, test_data,type="both",strict=FALSE)
iqnn_predict <- function(iqnn_mod,test_data, type="estimate",strict=FALSE){
test_data <- as.data.frame(test_data)
bin_index <- iqbin_assign(iqnn_mod, test_data, output="bin_index",strict=strict)
if(type=="estimate") return(iqnn_mod$bin_stats$pred[bin_index])
if(type=="binsize") return(iqnn_mod$bin_stats$obs[bin_index])
if(type=="both") return(iqnn_mod$bin_stats[bin_index,])
}
#--------------------------------------
#' Cross Validated predictions for iqnn models
#'
#' @description Cross-validate an iqnn specification using k-fold scheme on given data
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins vector of number of bins per step of iterative binning, ordered first to last
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#' @param cv_k integer specifying number of folds
#'
#' @return cross validated predicted responses for all observations in data
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples
#' cv_preds <- iqnn_cv_predict(data=iris, y="Species",mod_type="class",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins=c(3,5,2), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' table(cv_preds, iris$Species)
#'
#' cv_preds <- iqnn_cv_predict(data=iris, y="Petal.Length",mod_type="reg",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins=c(3,5,2), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' table(cv_preds, iris$Species)
iqnn_cv_predict <- function(data, y, mod_type="reg", bin_cols, nbins, jit=rep(0,length(bin_cols)), stretch=FALSE, tol=rep(0,length(bin_cols)), strict=FALSE, cv_k=10){
data <- as.data.frame(data)
cv_cohorts <- make_cv_cohorts(data, cv_k)
if(mod_type=="class") cv_preds <- factor(rep("NA", nrow(data)),levels(data[, y]))
if(mod_type=="reg") cv_preds <- rep(NA, nrow(data))
for(fold in 1:length(unique(cv_cohorts))){
test_index <- which(cv_cohorts==fold)
train_data_temp <- data[-test_index,]
row.names(train_data_temp) <- 1:nrow(train_data_temp)
iqnn_mod <- iqnn(train_data_temp, y=y, mod_type=mod_type, bin_cols=bin_cols,
nbins=nbins, jit=jit,stretch=stretch, tol=tol)
cv_preds[test_index] <- iqnn_predict(iqnn_mod, data[test_index,],strict=strict, type="estimate")
}
# if(mod_type=="class") cv_preds <- factor(cv_preds, labels=levels(data[,y]))
cv_preds
}
#--------------------------------------
#' Tuning function for iqnn model
#'
#' @description Identify optimal number of bins per dimension for iterative quantile nearest-neighbor model using k-fold cross validation.
#' Search through possible numbers of bins per dimension using a iterative rules defined in the \code{\link{make_nbins_list}} function.
#'
#' @param data Data frame containing the response variable and numeric input variables from the training data
#' @param y Name of response variable column
#' @param mod_type Depends on response variables type: "reg" creates iqnn-regression for predicting numeric values, "class" creates iqnn-classifier for predicting categorical values
#' @param bin_cols vector of column names of variables to iteratively bin, ordered first to last
#' @param nbins_range positive integer vector containing lower and upper bounds on number of bins in each dimension
#' @param jit vector of margins for uniform jitter to each dimension to create seperability of tied obs due to finite precision
#' @param stretch TRUE/FALSE if will bins be given tolerance buffer
#' @param tol vector of tolerance values to stretch each dimension for future binning
#' @param strict TRUE/FALSE: If TRUE Observations must fall within existing bins to be assigned; if FALSE the outer bins in each dimension are unbounded to allow outlying values to be assigned.
#' @param cv_k integer specifying number of folds
#' @param oom_search TRUE/FALSE: Only consider nbins that change number of neighbors by order of magnitudes
#' @param oom_base order of magnitude base value (default to powers of 2)
#'
#' @return data frame with one row per binning specification with desriptive and performance statistics: bin dimensitions, number of bins, equivalent k-nearest neightbor size, performance (mean squared error or class error rate)
#' @family iterative quantile nearest-neighbors functions
#' @export
#' @examples
#' # 10-fold CV
#' cv_tune1 <- iqnn_tune(data=iris, y="Petal.Length", mod_type="reg",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins_range=c(2,5), jit=rep(0.001,3), strict=FALSE, cv_k=10)
#' cv_tune1
#' # LOO CV
#' cv_tune2 <- iqnn_tune(data=iris, y="Petal.Length", mod_type="reg",
#' bin_cols=c("Sepal.Length","Sepal.Width","Petal.Width"),
#' nbins_range=c(2,5), jit=rep(0.001,3), strict=FALSE, cv_k=nrow(iris))
#' cv_tune2
iqnn_tune <- function(data, y, mod_type="reg", bin_cols, nbins_range, jit=rep(0,length(bin_cols)),
stretch=FALSE, tol=rep(0,length(bin_cols)), strict=FALSE, cv_k=10, oom_search=FALSE, oom_base=2){
nbins_list <- make_nbins_list(nbins_range,length(bin_cols))
cv_results <- data.frame(bin_dims = sapply(nbins_list, function(x) paste(x,collapse="X")),
nn_equiv = sapply(nbins_list, function(x) (cv_k-1)/cv_k*nrow(data)/prod(x)),
nbins_total=NA)
cv_results$nbins <- nbins_list
fold_n <- floor(nrow(data)*((cv_k-1)/cv_k))
if(oom_search){
unique_nn_size <- as.integer(oom_base^(1:floor(log(fold_n, base=oom_base))))
} else {
unique_nn_size <- as.integer(unique(round(cv_results$nn_equiv)))
}
keeper_idx <- sort(unique(sapply(1:length(unique_nn_size), function(i) which.min(abs(cv_results$nn_equiv-unique_nn_size[i])))))
for(t in keeper_idx){
cv_preds <- iqnn_cv_predict(data=data, y=y, mod_type=mod_type, bin_cols=bin_cols, nbins=nbins_list[[t]], jit=jit,
stretch=stretch,tol=tol, strict=FALSE, cv_k=cv_k)
if(mod_type=="reg") cv_results$MSE[t] <- mean((data[,y]-cv_preds)^2)
if(mod_type=="class") cv_results$error[t] <- sum(cv_preds!=data[,y]) / nrow(data)
cv_results$nbins_total[t] <- prod(nbins_list[[t]])
}
cv_results <- stats::na.omit(cv_results)
if(mod_type=="reg") cv_results$RMSE <- sqrt(cv_results$MSE)
return(cv_results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.