R/cv.R

Defines functions kfold spatial_k_fold k_fold

Documented in k_fold

#' K-fold
#' 
#' A K-fold setting for crossvalidation
#' 
#' @param data A data.frame with the whole data to be split into training and test folds.
#' @param dependent_variable A string with the name of the target variable
#' @param k_partitions Number of folds to be used in the K-Fold setting
#' @param seed Setting a seed to reproducibility of the cross validation
#' @param as_data_frame A Boolean to return the \code{x_train} and \code{x_test} partitions as \code{data.frame}
#' @export
k_fold <- function(data,
                   dependent_variable="y",
                   k_partitions,
                   seed=NULL,
                   as_data_frame = FALSE){

     # Setting the seed.
     set.seed(seed)

     # Creating the object k_fold
     k_fold_validation <- list()

     # Creating the test_ratio
     partitions_index <- kfold(x = data,k = k_partitions)

     # Without use any package
     # if(nrow(data)%%k_partitions!=0){
     # partitions_index <- sample(rep(1:k_partitions,floor(nrow(data)/k_partitions)+1),
     #                            size = nrow(data),replace = FALSE)
     # }else{
     #   partitions_index <- sample(rep(1:k_partitions,k_partitions,
     #                              size = nrow(data),replace = FALSE))
     # }
     for(k in 1:k_partitions){
          # Training values
          cat("Running repetition number ",k,"\n")

          # Training
          if(as_data_frame){
               x_train = data[partitions_index!=k,colnames(data)!="y",drop=FALSE] #%>% as.matrix
               y_train = data[partitions_index!=k,colnames(data)=="y",drop=FALSE] #%>% as.matrix

               # Test
               x_test = data[partitions_index==k,colnames(data)!="y",drop=FALSE] #%>% as.matrix
               y_test = data[partitions_index==k,colnames(data)=="y",drop=FALSE] #%>% as.matrix
          } else {
               x_train = as.matrix(data[partitions_index!=k,colnames(data)!="y",drop=FALSE]) #%>% as.matrix
               y_train = as.matrix(data[partitions_index!=k,colnames(data)=="y",drop=FALSE]) #%>% as.matrix

               # Test
               x_test = as.matrix(data[partitions_index==k,colnames(data)!="y",drop=FALSE]) #%>% as.matrix
               y_test = as.matrix(data[partitions_index==k,colnames(data)=="y",drop=FALSE]) #%>% as.matrix

          }

          # Saving the data split
          k_fold_validation[[k]]<-list(x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test)
     }

     return(k_fold_validation)

}

# Exporting the kfold function
#' @importFrom stats kmeans
spatial_k_fold <- function(data,dependent_variable="y",k_partitions,seed=NULL,as_data_frame = FALSE){

        # Setting the seed.
        set.seed(seed)

        # Creating the object k_fold
        k_fold_validation <- list()

        # Creating the test_ratio
        partitions_index <- kfold(x = data,k = k_partitions)
        partitions_index <- kmeans(x = data[,colnames(data)!="y",drop=FALSE],centers = k_partitions)$cluster

        # Without use any package
        # if(nrow(data)%%k_partitions!=0){
        # partitions_index <- sample(rep(1:k_partitions,floor(nrow(data)/k_partitions)+1),
        #                            size = nrow(data),replace = FALSE)
        # }else{
        #   partitions_index <- sample(rep(1:k_partitions,k_partitions,
        #                              size = nrow(data),replace = FALSE))
        # }
        for(k in 1:k_partitions){
                # Training values
                cat("Running repetition number ",k,"\n")

                # Training
                if(as_data_frame){
                        x_train = data[partitions_index!=k,colnames(data)!="y",drop=FALSE] #%>% as.matrix
                        y_train = data[partitions_index!=k,colnames(data)=="y",drop=FALSE] #%>% as.matrix

                        # Test
                        x_test = data[partitions_index==k,colnames(data)!="y",drop=FALSE] #%>% as.matrix
                        y_test = data[partitions_index==k,colnames(data)=="y",drop=FALSE] #%>% as.matrix
                } else {
                        x_train = as.matrix(data[partitions_index!=k,colnames(data)!="y",drop=FALSE]) #%>% as.matrix
                        y_train = as.matrix(data[partitions_index!=k,colnames(data)=="y",drop=FALSE]) #%>% as.matrix

                        # Test
                        x_test = as.matrix(data[partitions_index==k,colnames(data)!="y",drop=FALSE]) #%>% as.matrix
                        y_test = as.matrix(data[partitions_index==k,colnames(data)=="y",drop=FALSE]) #%>% as.matrix

                }

                # Saving the data split
                k_fold_validation[[k]]<-list(x_train=x_train,y_train=y_train,x_test=x_test,y_test=y_test)
        }

        return(k_fold_validation)

}

# PROBLEM TO LOAD
# Author: Robert Hijmans
# January 2010
# License GPL3


# FROOMM PACKAGE DISMOO

kfold <- function(x, k=5, by=NULL) {

     singlefold <- function(obs, k) {
          if (k==1) {
               return(rep(1, obs))
          } else {
               i <- obs / k
               if (i < 1) {
                    stop('insufficient records:', obs, ', with k=', k)
               }
               i <- round(c(0, i * 1:(k-1), obs))
               times = i[-1] - i[-length(i)]

               group <- c()
               for (j in 1:(length(times))) {
                    group <- c( group, rep(j, times=times[j]) )
               }

               r <- order(stats::runif(obs))
               return(group[r])
          }
     }

     if (is.vector(x)) {
          if (length(x) == 1) {
               if (x > 1) {
                    x <- 1:x
               }
          }
          obs <- length(x)
     }  else {
          obs <- nrow(x)
     }
     if (is.null(by)) {
          return(singlefold(obs, k))
     }

     by = as.vector(as.matrix(by))
     if (length(by) != obs) {
          stop('by should be a vector with the same number of records as x')
     }
     un <- unique(by)
     group <- vector(length=obs)
     for ( u in un ) {
          i = which(by==u)
          kk = min(length(i), k)
          if (kk < k) warning('lowered k for by group: ', u  ,'  because the number of observations was  ',  length(i))
          group[i] <- singlefold(length(i), kk)
     }
     return(group)
}
MateusMaiaDS/gpbart documentation built on Jan. 26, 2024, 7:07 a.m.