R/cross_validation.R

##' {Split the data into test and train sets. I'm reasonably certain that this is completely obsoleted by the createDataPartition function in the caret package}
##' {See description - this does not do any kind of balancing to equalise class probabilites or response distribution across splits} ..
##' @title train_test_sets
##' @param x the variable to split on
##' @param data the data to split
##' @return Some kind of list?
##' @author Richie Morrisroe
train_test_sets <- function (x, data) {
    testlist <- list()
    trainlist <- list()
    for (i in 1:length(x)) {
        split <- x[[i]]
        fold.train <- data[split,]
        fold.test <- data[-eval(split),]
        trainlist[[i]] <- fold.train
        testlist[[i]] <- fold.test
    }
    traintestlist <- c(trainlist, testlist)

    train.names <- paste(names(x), ".Train",sep="")
    test.names <- paste(names(x), ".Test",sep="")
    listnames <- c(train.names, test.names)
    names(traintestlist) <- listnames
    traintestlist
}
##' {Takes the result of TestTrainSplit and actually splits it}
##' {See DESC}
##' @title seperate_test_and_train
##' @param data a list of dataframes with sampled data
##' @param test return test if true, else train
##' @return a dataframe
##' @author Richie Morrisroe
seperate_test_and_train <- function(data, test=TRUE) {
    if(test) {
        indtest <- grep("Test$", names(data))
        res <- data[indtest]
    }
    else {
        indtrain <- grep("Train$", names(data))
        res <- data[indtrain]
    }
    res
}
##' {This was a wrapper around train to test the theory that the RF was actually predicting placebo perfectly}
##' {See desc} 
##' @title train_folds
##' @param data a dataframe 
##' @param Form the formula to fit the model with
##' @param control control parameters for train
##' @param sizes something something
##' @param metric otherthing otherthing
##' @param updown who knows?
##' @return model fits from the cv process
##' @author Richie Morrisroe
train_folds <- function(data, Form, control, sizes, metric, updown) {
    cvresults <- list()
    for (i in 1:length(data)) {
        #again, another bug that could be caused by lexical scoping
        res <- caret::train(form=Form, data=data,
                            na.action="na.omit",
                            size=sizes,
                            metric=metric,
                            maximise=updown,
                            control=rfeControl)
        cvresults[[i]] <- res
    }
    names(cvresults) <- names(data)
    cvresults
}


##' {A function for performing repeated Cross-validation}
##' {See description}
##' @title repeat_cv
##' @param form a formula describing the model
##' @param data the data to fit on
##' @param method not implemented
##' @param n number of repeats
##' @param responsevariable dependent variable
##' @param ... arguments passed to the train function
##' @return a nested list containing the confusionMatrix for the model on test data, and the accuracy parameter
##' @author Richie Morrisroe
repeat_cv <- function(form, data, method=method, n, responsevariable, ...) {
    res <- vector(length=n, mode="list")
    Accuracy <- vector(length=n, mode="numeric")
    data2 <- na.omit(data)
    variable <- grep(responsevariable, x=names(data))
    for (i in 1:n) {
        print(i)
        trainind <- with(data2,
                         caret::createDataPartition(
                             data[,variable], p=0.8, list=FALSE))
        trainset <- data2[trainind,]
        testset <- data2[-trainind,]
        train.res <- caret::train(formula=form, data=trainset, ...)

        train.pred <- predict(train.res, testset)
        res[[i]] <- caret::confusionMatrix(train.pred, testset[,responsevariable])
        Accuracy[i] <- res[[i]]$overall[1]
    }
##this was a wierd one, lintr complains if last action in function is assigment
    res2 <- list(res, Accuracy)
    res2
}
##' {calculate the root mean square error of approximattion for a dataframe containing columns named pred and obs} 
##' {See Desc} 
##' @title rmsea
##' @param data a dataframe containing pred and obs columns
##' @param pred_col prediction column
##' @param obs_col observation column
##' @return a scalar number for the RMSEA
##' @author Richie Morrisroe
rmsea <- function(data, pred_col=pred, obs_col=obs) {
    erro <- with(data, pred_col - obs_col)
    err_sq <- erro ^ 2
    root_err <- sqrt(mean(err_sq))
    return(root_err)
}
richiemorrisroe/thesisR documentation built on May 27, 2019, 8:43 a.m.