Nothing
#' Train and assess accuracy of a k-NN model
#'
#' This function trains a k-NN model from response variables (Y) and predictors
#' (X) at reference observations using the package yaImpute (see
#' \code{\link[yaImpute]{yai}}). By default, the distance between observations
#' is obtained from the proximity matrix of random forest regression or
#' classification trees. Optionally, training and testing sets can be provided
#' to return the accuracy of the trained k-NN model.
#'
#' If performing model validation, the function trains a kNN model from the
#' training set, finds the k NN of the validation set and imputes the response
#' variables from the k NN. If \code{k = 1}, only the closest NN value is
#' imputed. If k > 1, the imputed value can be either the closest NN value, the
#' mean, median or distance weighted mean of the k NN values.This is controlled
#' by the arguments \code{impute.cont} or \code{impute.fac}.
#'
#' If inTest = NULL, all rows that are not in inTrain will be used for model
#' testing. If inTrain = NULL, all rows that are not in inTest will be used for
#' model training. If both inTrain and inTest are NULL, all rows of x and y will
#' be used for training and no testing is performed.
#'
#' The final model returned by \code{findNN} is trained from all observations of
#' \code{x} and \code{y}.
#'
#'@param x A dataframe or SpatialPointsDataFrame of predictors variables X for
#' reference observations. Row names of X are used as identification of
#' reference observations.
#'@param y A dataframe or SpatialPointsDataFrame of response variables Y for the
#' reference observations. Row names of Y are used as identification of
#' reference observations.
#'@param inTrain Optional. A list obtained from
#' \code{\link[foster]{partition}}indicating which rows of x and y go to
#' training.
#'@param inTest Optional list indicating which rows of x and y go to validation.
#' If left NULL, all rows that are not in \code{inTrain} are used for
#' validation.
#'@param k Integer. Number of nearest neighbors
#'@param method Character. Which nearness metrics is used to compute the nearest
#' neighbors. Default is \code{"randomForest"}. Other methods are listed in
#' \code{\link[yaImpute]{yai}}
#'@param impute.cont Character. The method used to compute the imputed
#' continuous variables. Can be \code{"closest"}, \code{"mean"},
#' \code{"median"} or \code{"dstWeighted"}. Default is \code{"closest"} if
#' \code{k = 1} and \code{"dstWeighted"} if \code{k > 1}. See
#' \code{\link[yaImpute]{impute.yai}} for more details.
#'@param impute.fac Character. The method used to compute the imputed values for
#' factors. Default value is the same as \code{impute.cont}. See
#' \code{\link[yaImpute]{impute.yai}} for more details.
#'@param ntree Number of classification or regression trees drawn for each
#' response variable. Default is 500
#'@param mtry Number of X variables picked randomly to split each node. Default
#' is sqrt(number of X variables)
#'@param rfMode By default, \code{rfMode} is set to \code{""} which forces
#' \code{\link[yaImpute]{yai}} to create random forest regression trees instead
#' of classification trees for continuous variables. Can be set to
#' \code{"buildClasses"} if wanting continuous variables to be converted to
#' classes and forcing random forest to build classification trees. (See
#' \code{\link[yaImpute]{yai}})
#'@param ... Other arguments passed to \code{\link[yaImpute]{yai}} (e.g.
#' \code{"rfXsubsets"})
#'
#'@return A list containing the following objects: \describe{
#' \item{\code{model}}{A \code{yai} object, the trained k-NN model}
#' \item{\code{preds}}{A data.frame with observed and predicted values of the
#' testing set for each response variables} }
#'
#'@seealso \code{\link[yaImpute]{yai}}, \code{\link[yaImpute]{newtargets}},
#' \code{\link[yaImpute]{impute.yai}}, \code{\link[foster]{accuracy}}
#'
#' @examples
#' # Load data in memory
#' # X_vars_sample: Predictor variables at sample (from getSample)
#' # Y_vars_sample: Response variables at sample (from getSample)
#' # train_idx: Rows of X_vars_sample and Y_vars_sample that are used for
#' # training (from (partition))
#' load(system.file("extdata/examples/example_trainNN.RData",package="foster"))
#'
#' set.seed(1234) #for example reproducibility
#' kNN <- trainNN(x = X_vars_sample,
#' y=Y_vars_sample,
#' inTrain = train_idx,
#' k = 1,
#' method = "randomForest",
#' ntree = 200)
#' @export
trainNN <- function(x,
y,
inTrain = NULL,
inTest = NULL,
k = 1,
method = "randomForest",
impute.cont = NULL,
impute.fac = NULL,
ntree = 500,
mtry = NULL,
rfMode = "",
...) {
if (length(k) > 1) stop("Support only a single k value")
if (dim(x)[1] != dim(y)[1]) stop("x an y must have the same number of rows")
if (class(x) %in% "SpatialPointsDataFrame") {
x <- spdf2df(x)
}
if (class(y) %in% "SpatialPointsDataFrame") {
y <- spdf2df(y)
}
if (is.null(inTrain) & is.null(inTest)) {
message("No training or validation set provided.")
isTest <- FALSE
}
if(!is.null(inTrain)) {
isTest <- TRUE
if(is.list(inTrain)) {
nfolds_train <- length(inTrain)
}else{
nfolds_train <- 1
inTrain <- list(inTrain)
}
}
if(!is.null(inTest)) {
isTest <- TRUE
if(is.list(inTest)) {
nfolds_test <- length(inTest)
}else{
nfolds_test <- 1
inTest <- list(inTest)
}
}
if (is.null(inTrain)) {
nfolds_train <- 0
}
if (is.null(inTest)) {
nfolds_test <- 0
}
if (nfolds_train > nfolds_test &!is.null(inTest)) {
inTest <- NULL
message("Performing cross-validation: inTest argument ignored. Test samples are determined from the supplied training folds.")
}
if (nfolds_train < nfolds_test &!is.null(inTrain)) {
inTrain <- NULL
message("Performing cross-validation: inTrain argument ignored. Validation samples are determined from the supplied testing folds.")
}
# Number of folds is either 1 or the max number of folds in either test or training
nfolds <- max(c(nfolds_train, nfolds_test))
# Set rules for imputation
if (k == 1) {
impute.cont <- "closest"
impute.fac <- "closest"
} else {
if (is.null(impute.cont)) impute.cont <- "dstWeighted"
if (is.null(impute.fac)) impute.fac <- impute.cont
}
if (isTest) {
# List storing all predictions and validation
preds_out <- list()
for (n in 1:nfolds) {
if (is.null(inTest)) {
train_fold <- inTrain[[n]]
test_fold <- setdiff(seq(1, dim(x)[1], 1), train_fold)
}else if (is.null(inTrain)) {
test_fold <- inTest[[n]]
train_fold <- setdiff(seq(1, dim(x)[1], 1), test_fold)
}else{
test_fold <- inTest[[n]]
train_fold <- inTrain[[n]]
}
# Make sure than row indices of train_fold and test_fold are valid
if(any(!c(test_fold,train_fold) %in% seq_len(dim(x)[1]))) {
stop("Invalid training or validation row indices in inTrain or inTest")
}
if(anyDuplicated(c(test_fold,train_fold)) != 0) {
warning(sprintf("Fold %d: duplicated row indices in training and validation", n))
}
X_tr <- x[train_fold, ]
Y_tr <- y[train_fold, ]
X_val <- x[test_fold, ]
Y_val <- y[test_fold, ]
#Train yai
yai_object <- yaImpute::yai(x = X_tr,
y = Y_tr,
method = method,
k = k,
mtry = mtry,
ntree = ntree * ncol(y),
rfMode = rfMode,
...)
# Find NN and impute at validation
yai_newtrgs <- yaImpute::newtargets(yai_object, X_val)
Y_val_predicted <- yaImpute::impute(yai_newtrgs,
method = impute.cont,
method.factor = impute.fac,
observed = FALSE)
Y_val_predicted <- Y_val_predicted[, colnames(Y_tr)]
Y_val_predicted <- data.frame(ID = rownames(Y_val_predicted), Y_val_predicted)
Y_val <- data.frame(ID = rownames(Y_val), Y_val)
Y_pred <- reshape2::melt(Y_val_predicted, measure_vars = colnames(Y_tr),
value.name = "preds", id.vars = "ID")
Y_val <- reshape2::melt(Y_val, measure.vars = colnames(Y_tr),
value.name = "obs", id.vars = "ID")
preds <- merge(Y_val, Y_pred, by = c("ID", "variable"))
# Keep track of fold
preds$Fold <- n
preds_out[[n]] <- preds
}
preds_out <- do.call(rbind, preds_out)
}else{ #No testing/validation
preds_out <- NULL
# Once validation is done, the final model is trained with all observations
}
yai_object_final <- yaImpute::yai(x = x,
y = y,
method = method,
k = k,
mtry = mtry,
ntree = ntree * ncol(y),
rfMode = rfMode,
...)
# Add imputation mode to be used with predictTrgs
yai_object_final$impute.cont <- impute.cont
yai_object_final$impute.fac <- impute.fac
out <- list(
model = yai_object_final,
preds = preds_out
)
return(out)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.