gamRF: Random forest fit to residuals from GAM model

Description Usage Arguments Value Note Author(s) References See Also Examples

Description

Fit model using gam() from mgcv, then use random forest regression with residuals. Check perfomance of this hybrid model for predictions to newdata, if supplied.

Usage

1
2
gamRF(formlist, yvar, data, newdata = NULL, rfVars, method = "GCV.Cp", 
    printit = TRUE, seed = NULL)

Arguments

formlist

List of rght hand sides of formulae for GAM models.

yvar

Character string holding y-variable name.

data

Data

newdata

Optionally, supply test data.

rfVars

Names of explanatory variables for the randomForest model.

method

Smoothing parameter estimation method for use of gam(). See gam.

printit

Should a summary of results (error rates) be printed?

seed

Set a seed to make result repeatable.

Value

A vector of test data accuracies for the hybrid models (one for each element of formlist), plus test error mean square and OOB error mean square for the use of randomForest().

Note

The best results are typically obtained when a relatively low degree of freedom GAM model is used. It seems advisable to use those variables for the GAM fit that seem likely to be similar in their effect irrespective of geographic location.

Author(s)

John Maindonald <john.maindonald@anu.edu.au>

References

J. Li, A. D. Heap, A. Potter and J. J. Daniell. 2011. Application of Machine Learning Methods to Spatial Interpolation of Environmental Variables. Environmental Modelling and Software 26: 1647-1656. DOI: 10.1016/j.envsoft.2011.07.004.

See Also

CVgam

Examples

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
if(length(find.package("sp", quiet=TRUE))>0){
data("meuse", package="sp")
meuse <- within(meuse, {levels(soil) <- c("1","2","2")
                        ffreq <- as.numeric(ffreq)
                        loglead <- log(lead)}
)
form <- ~ dist + elev + ffreq + soil
rfVars <- c("dist", "elev", "soil", "ffreq", "x", "y")
## Select 90 out of 155 rows
sub <- sample(1:nrow(meuse), 90)
meuseOut <- meuse[-sub,]
meuseIn <- meuse[sub,]
gamRF(formlist=list("lm"=form), yvar="loglead", rfVars=rfVars,
                    data=meuseIn, newdata=meuseOut)
}

## The function is currently defined as
function (formlist, yvar, data, newdata = NULL, rfVars, method = "GCV.Cp", 
    printit = TRUE, seed = NULL) 
{   if(!is.null(seed))set.seed(seed)
    errRate <- numeric(length(formlist)+2)
    names(errRate) <- c(names(formlist), "rfTest", "rfOOB")
    ytrain <- data[, yvar]
    xtrain <- data[, rfVars]
    xtest <- newdata[, rfVars]
    ytest = newdata[, yvar]
    res.rf <- randomForest(x = xtrain, y = ytrain, 
                           xtest=xtest, 
                           ytest=ytest)
    errRate["rfOOB"] <- mean(res.rf$mse)
    errRate["rfTest"] <- mean(res.rf$test$mse)    
    GAMhat <- numeric(nrow(data))
    for(nam in names(formlist)){
      form <- as.formula(paste(c(yvar, paste(formlist[[nam]])), collapse=" "))
      train.gam <- gam(form, data = data, method = method)
      res <- resid(train.gam)
      cvGAMms <- sum(res^2)/length(res)
      if (!all(rfVars %in% names(newdata))) {
        missNam <- rfVars[!(rfVars %in% names(newdata))]
        stop(paste("The following were not found in 'newdata':", 
                   paste(missNam, collapse = ", ")))
      }
      GAMtesthat <- predict(train.gam, newdata = newdata)
      GAMtestres <- ytest - GAMtesthat
      Gres.rf <- randomForest(x = xtrain, y = res, xtest = xtest, 
                              ytest = GAMtestres)
      errRate[nam] <- mean(Gres.rf$test$mse)
    }
    if (printit) 
        print(round(errRate, 4))
    invisible(errRate)
}

Example output

    lm rfTest  rfOOB 
0.1078 0.1168 0.1404 
function (formlist, yvar, data, newdata = NULL, rfVars, method = "GCV.Cp", 
    printit = TRUE, seed = NULL) 
{
    if (!is.null(seed)) 
        set.seed(seed)
    errRate <- numeric(length(formlist) + 2)
    names(errRate) <- c(names(formlist), "rfTest", "rfOOB")
    ytrain <- data[, yvar]
    xtrain <- data[, rfVars]
    xtest <- newdata[, rfVars]
    ytest = newdata[, yvar]
    res.rf <- randomForest(x = xtrain, y = ytrain, xtest = xtest, 
        ytest = ytest)
    errRate["rfOOB"] <- mean(res.rf$mse)
    errRate["rfTest"] <- mean(res.rf$test$mse)
    GAMhat <- numeric(nrow(data))
    for (nam in names(formlist)) {
        form <- as.formula(paste(c(yvar, paste(formlist[[nam]])), 
            collapse = " "))
        train.gam <- gam(form, data = data, method = method)
        res <- resid(train.gam)
        cvGAMms <- sum(res^2)/length(res)
        if (!all(rfVars %in% names(newdata))) {
            missNam <- rfVars[!(rfVars %in% names(newdata))]
            stop(paste("The following were not found in 'newdata':", 
                paste(missNam, collapse = ", ")))
        }
        GAMtesthat <- predict(train.gam, newdata = newdata)
        GAMtestres <- ytest - GAMtesthat
        Gres.rf <- randomForest(x = xtrain, y = res, xtest = xtest, 
            ytest = GAMtestres)
        errRate[nam] <- mean(Gres.rf$test$mse)
    }
    if (printit) 
        print(round(errRate, 4))
    invisible(errRate)
}

gamclass documentation built on Nov. 14, 2020, 1:07 a.m.