#' Plot ML Results and Summary
#'
#' @param data_folder The folder where the results from the rNCV are contained.
#' @param figure_title The name you would like for the generated figures.
#' @param include_pdp Set to TRUE if you would like to generate partial dependence plots. \strong{WARNING:} Setting this option to TRUE will significantly increase run time.
#' @param sum_prefix Label of output file name. Make sure this is the same label as specified in \code{predict_one()}.
#' @param ourDir If you would like to save the output files into somewhere other than the working directory, specify that here. Make sure the folder name ends with '/'.
#' @export
summarize_one <- function(file_name, figure_title, sum_prefix, outDir = '', include_pdp = FALSE){
load(file_name)
# Get a summary of model performance
summ <- rNCV.perf.summ(res.rncv)
#get classical r^2, as 1-SSresid/SStotal, to compare with Henry's/caret's output
predicted_data <- data.frame(cbind(res.rncv$y.pred.comb, data.rncv[, var_to_predict]))
#since glueing on a single column doesn't keep its name
names(predicted_data)[ncol(predicted_data)] <- var_to_predict
true_mean <- mean(predicted_data[, var_to_predict])
ss_total <- sum((predicted_data[, var_to_predict] - true_mean)^2)
r2s <- c()
#ncol - 1 since I added a column for Y
for (i in 1:(ncol(predicted_data) - 1)){
#compute r^2 for each repetition
ss_resid <- sum((predicted_data[, var_to_predict] - predicted_data[, i])^2)
this_r2 <- 1 - ss_resid/ss_total
r2s <- c(r2s, this_r2)
}
mean_r2 <- mean(r2s)
se_r2 <- sd(r2s)
new_row <- data.frame(dataset = 'test', metric = 'Rsquared', method = 'StackClscR2', m = mean_r2, se = se_r2)
summ <- rbind(summ, new_row)
# Save the summary
#write.csv(summ[summ$metric==metrics[3],], paste0(outDir, sum_prefix,'_summary.csv'))
write.csv(summ, paste0(outDir, sum_prefix,'_summary.csv'))
# new plot function
summary_plot(summ, figure_title)
varimp_vals <- varImp_rNCV(res.rncv)
#how are univariate correlations related to varimp values?
corrM <- cor(data.rncv[, !(names(data.rncv) %in% c('id', 'LC_Category'))], use = 'pairwise.complete')
uni_cors <- corrM[rownames(corrM) %in% c(var_to_predict),]
uni_cors <- as.data.frame(uni_cors)
#r^2 values
#uni_cors <- uni_cors * uni_cors
#merge univariate correlations with variable importance
combined_vals <- cbind(varimp_vals, uni_cors[match(varimp_vals$variable, rownames(uni_cors)),])
names(combined_vals)[names(combined_vals) == 'mean'] <- 'ML_Varimp'
names(combined_vals)[names(combined_vals) == 'uni_cors[match(varimp_vals$variable, rownames(uni_cors)), ]'] <- 'r'
combined_vals$r2 <- combined_vals$r * combined_vals$r
cor_val <- cor(combined_vals$r2 , combined_vals$ML_Varimp)
#print("Variables with Highest Importance")
#print(head(combined_vals[seq(dim(combined_vals)[1],1), c('variable', 'ML_Varimp', 'r2')]))
varimp <- combined_vals[seq(dim(combined_vals)[1],1), c('variable', 'ML_Varimp', 'r', 'r2')]
write.csv(varimp, paste0(outDir, sum_prefix, '_VarImp.csv'), row.names = FALSE)
plot(combined_vals$r2, combined_vals$ML_Varimp, xlab = 'Univariate r^2', ylab = 'ML Variable Importance',
main = paste('r =', round(cor_val, digits = 2)))
#varimp_plot(varimp)
# choose some predictors for pd plots
sel.var <- combined_vals$variable[length(combined_vals$variable):(length(combined_vals$variable) - 3)]
par(mfrow=c(2,2))
if (include_pdp){
pdNCV(sel.var, resp.var=var_to_predict, nRep=5, nFolds=5,
dir.path=paste0(outDir, '.'), file.root=paste0('.', sum_prefix,'-pdpFile'), res.rncv, stack.wt=NULL)
}
scatter_data <- cbind(rowMeans(res.rncv$y.pred.comb), data.rncv[, c('LC_Category', var_to_predict)])
names(scatter_data)[1] <- 'Yhat'
names(scatter_data)[names(scatter_data) == var_to_predict] <- 'Y'
ggplot(scatter_data, aes(x = Y, y = Yhat, color = LC_Category)) + geom_point() + ggtitle(var_to_predict) + geom_smooth()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.