exec/test_RF_trees.R

library(randomForest)
#library(doParallel)
library(ROCR)
library(irr)
library(pscl)
#registerDoParallel(cores=detectCores(all.tests=TRUE))

print(DATA_DIRECTORY)
dir <- DATA_DIRECTORY
load(paste0(dir, '/full_data.rda'))
load(paste0(dir, '/FIA_mortality_TREE_level.rda')); rm(dir)
trees <- FIA_mortality_TREE_level; rm(FIA_mortality_TREE_level)

t0 <- sapply(trees, function(x) nrow(trees) - sum(is.na(x)))
trees <- trees[, as.logical(t0)]; rm(t0)

# magic names for columns
c0 <- 'PREV_STATUS_CD'
c1 <- 'STATUSCD'

df0 <- data.frame(trees[[c1]] == 2, trees[[c0]] == 1)
both <- rowSums(df0); rm(df0)
trees$MORT_EVENT <- both == 2; rm(both, c0, c1)

trees <- dplyr::left_join(trees, full_data, by = 'PLT_CN'); rm(full_data)

mort_df <- FIA_mortality_with_explanatory
colnames(mort_df)[which(colnames(mort_df) == 'LON')] <- 'lon'
colnames(mort_df)[which(colnames(mort_df) == 'LAT')] <- 'lat'
mm <- c('lon', 'lat', 'ELEV', 'forest_type', 'in_fire_perim', 'is_public', 'SLOPE', 'ASPECT')
trees <- dplyr::left_join(trees, mort_df[, mm], by = c('lon', 'lat'))
mm <- mm[-c(1, 2)]

# STOCKING has NA's
cc <- c('INVYR', 'SPCD', 'DIA', 'PREVDIA', 'HT',
        'RGI', 'MIR', 'NDMI', 'NDVI',
        'prcp', 'tmax', 'tmin', 'swe', 'dayl', 'srad', 'vp',
        'BLDFIE', 'CECSOL', 'CLYPPT', 'CRFVOL', 'OCSTHA', 'PHIHOX', 'SNDPPT', 'BDTICM')
cc <- c(cc, mm)
lapply(trees[, cc], function(x) table(is.na(x)))

tt <- is.na(trees[, cc])
tt0 <- which(rowSums(tt) == 0)
trees <- trees[tt0, ]; rm(tt, tt0)

balanceMort <- function(trees) {
  mbt <- table(trees$MORT_EVENT)['TRUE']
  findx <- which(trees$MORT_EVENT == 'FALSE')
  set.seed(1)
  f0 <- sample(findx, size = table(trees$MORT_EVENT)['FALSE'] - mbt)
  trees <- trees[-f0, ]
  return(trees)
}
#trees <- balanceMort(trees); rm(balanceMort)
#wt_ratio <- round(table(trees$MORT_EVENT)['FALSE'] / table(trees$MORT_EVENT)['TRUE'], 3)

# balanceMort = under sampling??
# Using classwt = c('FALSE' = 1, 'TRUE' = 9) ... 8% / 2.3% / 60%
# Setting cutoff = c(0.9, 0.1) ... 19.6% / 18.6% / 28.4%
nmin <- as.integer(table(trees$MORT_EVENT)['TRUE'])
stratified_mort_rf <- randomForest::randomForest(formula = as.factor(MORT_EVENT) ~ SPCD + PREVDIA + HT + prcp + tmax + tmin + swe + dayl + srad + vp + BLDFIE + CECSOL + CLYPPT + CRFVOL + PHIHOX + SNDPPT + BDTICM + OCSTHA + ELEV + in_fire_perim + is_public + SLOPE + ASPECT,
                                      data = trees, ntree = 500, importance = T, do.trace = T,
                                      proximity = F, keep.forest = T, mtry = 14,
                                      strata = as.factor(trees$MORT_EVENT),
                                      sampsize = rep(nmin, 2)
                                      )
# stratified err rate is 14.7/12.9/30.7% respectively

save(stratified_mort_rf, file = paste0(DATA_DIRECTORY, '/stratified_mort_rf.rda')); rm(stratified_mort_rf)
load(paste0(DATA_DIRECTORY, '/stratified_mort_rf.rda'))

# full mort breaks RAM, but the last observed OOB error is 8/2.5/57.8% for overall/alive/dead respectively
full_mort_rf <- randomForest::randomForest(formula = as.factor(MORT_EVENT) ~ SPCD + PREVDIA + HT + prcp + tmax + tmin + BLDFIE + CECSOL + CLYPPT + CRFVOL + PHIHOX + SNDPPT + BDTICM,
                                           data = trees, ntree = 500, importance = T, do.trace = T,
                                           proximity = F, keep.forest = T, mtry = 9
                                           #strata = as.factor(trees$MORT_EVENT),
                                           #sampsize = rep(nmin, 2)
)

save(full_mort_rf, file = paste0(DATA_DIRECTORY, '/full_mort_rf.rda')); rm(full_mort_Rf)


mort_rf <- stratified_mort_rf

randomForest::varImpPlot(mort_rf)
mort_rf$confusion
mort_pred <- predict(mort_rf, type = 'prob')
mort_votes <- mort_pred[, 2]
pred <- ROCR::prediction(mort_votes, mort_rf$y)
perf <- ROCR::performance(pred, measure = 'auc')@y.values[[1]]
perf

irr::agree(data.frame(mort_rf$predicted, mort_rf$y))
irr::kappa2(data.frame(mort_rf$predicted, mort_rf$y))
# Stratified: 85.3% agree, 0.408 kappa

mort_glm <- glm(formula = as.factor(MORT_EVENT) ~ SPCD + PREVDIA + HT + prcp + tmax + tmin + BLDFIE + CECSOL + CLYPPT + CRFVOL + PHIHOX + SNDPPT + BDTICM + in_fire_perim + is_public + SLOPE + ASPECT,
                family = binomial(logit), data = trees)
pscl::pR2(mort_glm)
summary(mort_glm)
anova(mort_glm, test = 'Chisq')
step_glm <- MASS::stepAIC(mort_glm)
# swe, TEXMHT, and OCSTHA come out as not useful for each model - varImpPlot on RF agreed with this
bmcnellis/RSFIA documentation built on June 1, 2019, 7:40 a.m.