flutils: Feng Li's utility functions

Documented in data.partition

#' Partition data index for cross validation.
#'
#'
#' @param nObs "positive numeric"
#' @param args list N: no. of subsets,  method: how to partition
#' @return "list" The prediction subsets.
#' @references Li Villani Kohn 2010
#' @author Feng Li, Department of Statistics, Stockholm University, Sweden.
#' @note First version: Mon Sep 20 21:08:01 CEST 2010;
#'       Current:       Mon Sep 20 21:08:11 CEST 2010.
#' @export
data.partition <- function(nObs, args)
{
    partiMethod <- args[["partiMethod"]]
    N.subsets <- args[["N.subsets"]]
    testRatio <- args[["testRatio"]]

    if(nObs < N.subsets ||
       (N.subsets < 1 & (tolower(partiMethod) != "time-series")))
    {
                                        # Observation smaller than subsets.
        stop("No. of subsets should be equal or smaller than no. of obs and at least one.")
    }

    ## disable warnings when data length is not a multiple of split variable which is what I
    ## want.  split the knots in a smart way. e.g. split 20 knots into 3 folds
    obs.label <- 1:nObs
    suppressWarnings(out.systematic <- split(obs.label, 1:(N.subsets)))

    if(tolower(partiMethod) == "systematic")
    {
        ## Select testing samples in a sequential order
        out <- list()

        if(N.subsets == 1 && length(testRatio) != 0)
        {
            out[["1"]] <- floor(seq(1,  nObs, length.out = ceiling(nObs*testRatio)))
        }
        else
        {
            out <- out.systematic
        }
    }
    else if(tolower(partiMethod) == "random")
    {## Randomly select testing samples
        out <- list()

        if(N.subsets == 1 && length(testRatio) != 0)
        {
            out[["1"]] <- sample(obs.label, ceiling(nObs*testRatio))
        }
        else
        {
            obs.label.new <- obs.label
            for(i in 1:N.subsets)
            {
                out[[i]] <- sample(obs.label.new, length(out.systematic[[i]]))
                obs.label.new <-  obs.label.new[!obs.label.new%in%out[[i]]]
            }
        }
    }
    else if(tolower(partiMethod) == "ordered")
    {
        out <- list()

        if(N.subsets == 1 && length(testRatio) != 0)
        {
            testLen <- ceiling(nObs*testRatio)
            out[["1"]] <- (nObs - testLen + 1):nObs
        }
        else
        {
            start <- 1
            for(i in 1:N.subsets)
            {
                out[[i]] <- start:(start+length(out.systematic[[i]])-1)
                start <- out[[i]][length(out[[i]])] +1
            }
        }
    }
    else if(tolower(partiMethod) == "time-series")
    {
        ## Use old data as testing data and recent data to preform predictions, the argument
        ## "testRatio" is used.
        if(N.subsets  != 1)
        {
            stop("Currently for time series,  the last ", testRatio*100,
                 "% observations are used as a single test sample. No cross-validation applied.")

        }

        if(length(testRatio) == 0)
        {
            stop("testRatio needed for cross-validation with `partiMethod == 'time-series'` !")
        }


        testLen <- ceiling(nObs*testRatio) # Make sure at least one is available
        start <- (nObs-testLen+1)

        ##obs.label.ts <- start:nObs
        ## suppressWarnings(out.systematic.ts <- split(obs.label.ts, 1:N.subsets))

        out <- list()
        ## for(i in 1:N.subsets)
        ## {
        out[["1"]] <- start:nObs
        ## start <- out[[i]][length(out[[i]])] +1
        ##  }
    }
    else
    {
        stop("The partitioning method is not implemented!")
    }
    names(out) <- NULL # remove the name. I don't like it.
    return(out)
}