if(!require(wesanderson)){
devtools::install_github("jhollist/wesanderson")
}
if(!require(edarf)){
devtools::install_github("zmjones/edarf")
}
if(!require(condprob2)){
devtools::install_github("jhollist/condprob2")
}
if(!require(pander)){
  install.packages("pander")
}
if(!require(tidyr)){
  install.packages("tidyr")
}
if(!require(party)){
  install.packages("party")
}

if(!require(broom)){
  install.packages("broom")
}

if(!require(interpretR)){
  install.packages("interpretR")
}

if(!require(doParallel)){
  install.packages("doParallel")
}
if(!require(viridis)){
  devtools::install_github("CRAN/viridis")
}

if(!require(LakeTrophicModelling)){
  devtools::install_github("USEPA/LakeTrophicModelling",quick=TRUE)
}

library("viridis")
library("wesanderson")
library("LakeTrophicModelling")
library("knitr")
library("ggplot2")
library("sp")
library("rgdal")
library("e1071")
library("dplyr")
library("tidyr")
library("condprob2")
library("party")
library("broom")
library("edarf")
#library("caret")
library("randomForest")
library("doParallel")
opts_chunk$set(dev = 'jpeg', dpi=300, fig.width=7.5, knitr.table.format="html")
data(LakeTrophicModelling)

#
#Checks for existing caches (from another project) and, if found, lazy loads
#them so cached chunk results are available when running outside of knitr
if(file.exists("prior_cache")){
  for(i in gsub(".rdb","",list.files("prior_cache",".rdb",full.names=T))){
    lazyLoad(i)
  }
} else if(file.exists("vignettes/prior_cache")){
  for(i in gsub(".rdb","",list.files("vignettes/prior_cache",".rdb",full.names=T))){
    lazyLoad(i)
  }
} else if(file.exists("../prior_cache")){
  for(i in gsub(".rdb","",list.files("../prior_cache",".rdb",full.names=T))){
    lazyLoad(i)
  }
}
#multiple caches... silly Jeff
#
if(file.exists("manuscript_cache/latex")){
  for(i in gsub(".rdb","",list.files("manuscript_cache/latex",".rdb",full.names=T))){
    lazyLoad(i)
  }
} else if(file.exists("vignettes/manuscript_cache/latex")){
  for(i in gsub(".rdb","",list.files("vignettes/manuscript_cache/latex",".rdb",full.names=T))){
    lazyLoad(i)
  }
} else if(file.exists("../manuscript_cache/latex")){
  for(i in gsub(".rdb","",list.files("../manuscript_cache/latex",".rdb",full.names=T))){
    lazyLoad(i)
  }
}

# Table Captions from @DeanK on 
#http://stackoverflow.com/questions/15258233/using-table-caption-on-r-markdown-file-using-knitr-to-use-in-pandoc-to-convert-t

knit_hooks$set(tab.cap = function(before, options, envir) {
                  if(!before) { 
                    paste('\n\n:', options$tab.cap, sep='') 
                  }
                })
default_output_hook = knit_hooks$get("output")
knit_hooks$set(output = function(x, options) {
  if (is.null(options$tab.cap) == FALSE) {
    x
  } else
    default_output_hook(x,options)
})
#All Variables
#Clean Up Data - Complete Cases
predictors_all <- predictors_all[predictors_all!="DATE_COL"]
all_dat <- data.frame(ltmData[predictors_all],LogCHLA=log10(ltmData$CHLA))
row.names(all_dat)<-ltmData$NLA_ID
all_dat <- all_dat[complete.cases(all_dat),]  

#GIS Variables
#Clean Up Data - Complete Cases
gis_dat <- data.frame(ltmData[predictors_gis],LogCHLA=log10(ltmData$CHLA))
row.names(gis_dat)<-ltmData$NLA_ID
gis_dat <- gis_dat[complete.cases(gis_dat),]
#ecoregion shape
wsa9<-readOGR("../inst/extdata/","wsa9_low48",verbose=FALSE)
################################################################################
#This chunk sets up the 'all' and 'gis' data, does variable selection (takes few
# hours to run) then run the randomForest.
# Results stored in prior_cache and loaded in setup chunk
################################################################################
#Model 1: All Variables

#Variable Selection
all_vs <- varsel_regression_rf(all_dat$LogCHLA,all_dat[,names(all_dat)!="LogCHLA"],
                     ntree=5000,prog=T)  
all_vs_plot <- varsel_plot(all_vs) #Used plot to identify approximate minimum
all_vars_select <- unlist(all_vs$vars[19]) #top 20 variables

#Final Model - all variables
all_rf<-randomForest(y=all_dat$LogCHLA,x=all_dat[,all_vars_select], 
                         ntree=5000,importance=TRUE,proximity=TRUE,
                          keep.forest=TRUE,keep.inbag=TRUE)

################################################################################
#Model 2: GIS Only Variables

#Variable Selection
gis_vs <- varsel_regression_rf(gis_dat$LogCHLA,gis_dat[,names(gis_dat)!="LogCHLA"],
                     ntree=5000,prog=T)  
gis_vs_plot <- varsel_plot(gis_vs) #Used plot to identify approximate minimum
gis_vars_select <- unlist(gis_vs$vars[14]) #top 15 variables

#Final Model - gis variables
gis_rf<-randomForest(y=gis_dat$LogCHLA,x=gis_dat[,gis_vars_select], 
                       ntree=5000,importance=TRUE,proximity=TRUE,
                        keep.forest=TRUE,keep.inbag=TRUE)
data_def <- read.csv("../inst/extdata/data_def.csv", stringsAsFactors = FALSE)
all_imp <- importance(all_rf)
gis_imp <- importance(gis_rf)
var_importance<-rbind(varImportance(all_imp,"All variables"),
                      varImportance(gis_imp,"GIS variables"))
var_importance<-inner_join(var_importance,data_def[c("variable_names","description")],"variable_names")
var_importance<-var_importance[order(var_importance$model_id,-var_importance$mean_decrease_gini, decreasing=FALSE),]                     
################################################################################
#Register parallel backend for partial dependence (disabled; partial
#dependence results are loaded from the cache).  Uncomment these lines,
#including co, to re-run the calculations below.
#numcores<-parallel::detectCores()
#cl <- makeCluster(numcores)
#registerDoParallel(cl)
#co <- 100 #number of points at which partial dependence is evaluated

################################################################################
#All variables partial dependence
all_rf_turb_pd<-partialPlot(all_rf,all_dat,"TURB",n.pt=co,plot=FALSE)
all_rf_ntl_pd<-partialPlot(all_rf,all_dat,"NTL",n.pt=co,plot=FALSE)
all_rf_ptl_pd<-partialPlot(all_rf,all_dat,"PTL",n.pt=co,plot=FALSE)
all_rf_elev_pd<-partialPlot(all_rf,all_dat,"ELEV_PT",n.pt=co,plot=FALSE)
all_rf_toc_pd<-partialPlot(all_rf,all_dat,"TOC",n.pt=co,plot=FALSE)
all_rf_np_pd<-partialPlot(all_rf,all_dat,"NPratio",n.pt=co,plot=FALSE)

################################################################################
#GIS Only partial dependence
gis_rf_ecor_pd<-partialPlot(gis_rf,gis_dat,"WSA_ECO9",plot=FALSE)
gis_rf_crops_pd<-partialPlot(gis_rf,gis_dat,"CropsPer_3000m",n.pt=co,plot=FALSE)
gis_rf_elev_pd<-partialPlot(gis_rf,gis_dat,"ELEV_PT",n.pt=co,plot=FALSE)
gis_rf_lat_pd<-partialPlot(gis_rf,gis_dat,"AlbersY",n.pt=co,plot=FALSE)
gis_rf_conifer_pd<-partialPlot(gis_rf,gis_dat,"EvergreenPer_3000m",n.pt=co,
                               plot=FALSE)
gis_rf_mndepth_pd<-partialPlot(gis_rf,gis_dat,"MeanDepthCorrect",n.pt=co,
                               plot=FALSE)
#stopCluster(cl)
#Predict and Original Chl a
all_pred <- predict(all_rf)
all_orig <- all_rf$y
gis_pred <- predict(gis_rf)
gis_orig <- gis_rf$y

#Trophic State Categories
ts_4_brks <- c(min(log10(ltmData$CHLA),na.rm=T)-1,log10(2),log10(7),log10(30),max(log10(ltmData$CHLA),na.rm=T)+1)
ts_4_cats <- c("oligo","meso","eu","hyper")
ts_2_brks <- c(min(log10(ltmData$CHLA),na.rm=T)-1,log10(7),max(log10(ltmData$CHLA),na.rm=T)+1)
ts_2_cats <- c("oligo/meso","eu/hyper")

#Orig and Predict Trophic State Categories
all_ts4_pred <- cut(all_pred,ts_4_brks,ts_4_cats)
all_ts4_orig <- cut(all_orig,ts_4_brks,ts_4_cats)
all_ts2_pred <- cut(all_pred,ts_2_brks,ts_2_cats)
all_ts2_orig <- cut(all_orig,ts_2_brks,ts_2_cats)
gis_ts4_pred <- cut(gis_pred,ts_4_brks,ts_4_cats)
gis_ts4_orig <- cut(gis_orig,ts_4_brks,ts_4_cats)
gis_ts2_pred <- cut(gis_pred,ts_2_brks,ts_2_cats)
gis_ts2_orig <- cut(gis_orig,ts_2_brks,ts_2_cats)

#Trophic State Confusion Matrices
all_ts4_conmat <- table(all_ts4_pred,all_ts4_orig)
all_ts2_conmat <- table(all_ts2_pred,all_ts2_orig)
gis_ts4_conmat <- table(gis_ts4_pred,gis_ts4_orig)
gis_ts2_conmat <- table(gis_ts2_pred,gis_ts2_orig)

#Trophic State Total Accuracy and Kappa
all_ts4_total_acc <- classAgreement(all_ts4_conmat)$diag
all_ts4_kappa <- classAgreement(all_ts4_conmat)$kappa
all_ts2_total_acc <- classAgreement(all_ts2_conmat)$diag
all_ts2_kappa <- classAgreement(all_ts2_conmat)$kappa
gis_ts4_total_acc <- classAgreement(gis_ts4_conmat)$diag
gis_ts4_kappa <- classAgreement(gis_ts4_conmat)$kappa
gis_ts2_total_acc <- classAgreement(gis_ts2_conmat)$diag
gis_ts2_kappa <- classAgreement(gis_ts2_conmat)$kappa
#Per tree chl a predictions - All Variables
all_rf_ts_prob <- class_prob_rf(all_rf,all_dat,ts_4_brks,ts_4_cats,T)
gis_rf_ts_prob <- class_prob_rf(gis_rf,gis_dat,ts_4_brks,ts_4_cats,T)
all_rf_ts_prob$model <- rep("All variables",nrow(all_rf_ts_prob))
gis_rf_ts_prob$model <- rep("GIS only",nrow(gis_rf_ts_prob))
all_ci <- data.frame(ecdf_ks_ci(all_rf_ts_prob$max))
gis_ci <- data.frame(ecdf_ks_ci(gis_rf_ts_prob$max))
all_rf_ts_prob <- data.frame(all_rf_ts_prob,all_ci[,2:3])
gis_rf_ts_prob <- data.frame(gis_rf_ts_prob,gis_ci[,2:3])
prob_cdf_fig <- prob_cdf(all_rf_ts_prob, gis_rf_ts_prob,x="Prediction Probability",y="Proportion of Samples")
#This also writes out cp1 and cp2 to global env.
cond_acc_fig <- condAccuracy(all_rf_ts_prob,gis_rf_ts_prob,xImpair=0,R=1,xlab="Xc >= Prediction Probability")

\singlespace

\vspace{2mm}\hrule

Abstract
Productivity of lentic ecosystems is well studied and it is widely accepted that as nutrient inputs increase, productivity increases and lakes transition from lower trophic states (e.g., oligotrophic) to higher trophic states (e.g., eutrophic). These broad trophic state classifications are good predictors of ecosystem condition, services (e.g., recreation and aesthetics), and disservices (e.g., harmful algal blooms). While the relationship between nutrients and trophic state provides reliable predictions, it requires in situ water quality data to parameterize the model. This limits the application of these models to lakes with existing and, more importantly, available water quality data. To address this, we take advantage of a large national lakes water quality database (i.e., the National Lakes Assessment), land use/land cover data, lake morphometry data, and other universally available data, and apply data mining approaches to predict trophic state. Using these data and random forests, we first model chlorophyll a, then classify the resultant predictions into trophic states. The full model estimates chlorophyll a with both in situ and universally available data. The mean squared error and adjusted R\textsuperscript{2} of this model were `r round(all_rf$mse[all_rf$ntree],2)` and `r round(all_rf$rsq[all_rf$ntree],2)`, respectively. The second model uses universally available GIS data only. The mean squared error was `r round(gis_rf$mse[gis_rf$ntree],2)` and the adjusted R\textsuperscript{2} was `r round(gis_rf$rsq[gis_rf$ntree],2)`. The accuracy of the trophic state classifications derived from the chlorophyll a predictions was `r round(all_ts4_total_acc,2)*100`% for the full model and `r round(gis_ts4_total_acc,2)*100`% for the "GIS only" model. Random forests extend the usefulness of the class predictions by providing prediction probabilities for each lake. This allows us to make trophic state predictions and also indicate the level of uncertainty around those predictions. For the full model, these predicted class probabilities ranged from `r round(min(all_rf_ts_prob$max),2)` to `r round(max(all_rf_ts_prob$max),2)`. For the GIS only model, they ranged from `r round(min(gis_rf_ts_prob$max),2)` to `r round(max(gis_rf_ts_prob$max),2)`. We conclude that in situ data are required for better predictions, yet GIS and other universally available data provide trophic state predictions, with estimated uncertainty, that still have the potential for a broad array of applications. The source code and data for this manuscript are available from https://github.com/USEPA/LakeTrophicModelling.

\vspace{3mm}\hrule \doublespace

Keywords: Harmful Algal Blooms; Cyanobacteria; Open Science; Nutrients; National Lakes Assessment

Introduction

Productivity in lentic systems is often categorized across a range of trophic states (e.g., the trophic continuum) from early successional (i.e., oligotrophic) to late successional lakes (i.e., hypereutrophic), with lakes naturally occurring across this range [@carlson1977trophic]. Oligotrophic lakes occur in nutrient-poor areas or have a more recent geologic history, are often found at higher elevations, have clear water, and are usually favored for drinking water or direct contact recreation (e.g., swimming). Lakes with higher productivity (e.g., mesotrophic and eutrophic lakes) have greater nutrient loads, tend to be less clear, have greater densities of aquatic plants, and often support more diverse and abundant fish communities. Higher primary productivity is not necessarily a predictor of poor ecological condition, as it is natural for lakes to shift from lower to higher trophic states, but this is a slow process [@rodhe1969crystallization]. However, at the highest productivity levels (hypereutrophic lakes) biological integrity is compromised [@hasler1969cultural; @schindler2008algal; @smith1999eutrophication].

Monitoring trophic state allows for rapid assessment of a lake's biological productivity and identification of lakes with unusually high productivity (e.g., hypereutrophic lakes). These cases are indicative of lakes under greater anthropogenic nutrient loads, also known as cultural eutrophication, and are more likely to be at risk of fish kills, beach fouling, and harmful algal blooms [@smith1998cultural;@smith1999eutrophication;@smith2006eutrophication]. Given the association between trophic state and many ecosystem services and disservices, being able to accurately model trophic state could provide a first cut at identifying lakes with the potential for harmful algal blooms (i.e., from cyanobacteria) or other problems associated with cultural eutrophication. This type of information could be used for setting management priorities and allow for more efficient use of limited resources.

As trophic state and related indices can be best defined by a number of in situ water quality parameters (modeled or measured), most models have used this information as predictors [@carvalho_cyanobacterial_2011;@milstead2013estimating;@imboden1978dynamic;@salas1991simplified]. This leads to accurate models, but these data are often sparse and not always available, thus limiting the population of lakes for which we can make predictions. A possible solution for this issue is to build models that use widely available data that are correlated to many of the in situ variables. For instance, landscape metrics of forests, agriculture, wetlands, and urban land in contributing watersheds have all been shown to explain a significant proportion of the variation (ranging from 50-86%, depending on study) in nutrients in receiving waters [@jones2001predicting; @jones2004importance; @seilheimer2013landscape]. Building on these previously identified associations might allow us to use only landscape and other universally available data to build models. Identifying predictors using this type of ubiquitous data would allow for estimating trophic state in both monitored and unmonitored lakes. Furthermore, being able to classify a large number of lakes would have implications for the management of lakes. A broader discussion of ecological classification and resource management is beyond the scope of this paper, but see [@carpenter1999class] for more information on this topic.

Many published models of nutrients and trophic state in freshwater systems are based on linear modelling methods such as standard least squares regression or linear mixed models [@jones2004importance; @jones2001predicting]. While these methods have proven to be reliable, they have limitations (e.g., independence, distribution assumptions, and outlier sensitivity). Using data mining approaches, such as random forests, avoids many of these limitations, may reduce bias, and often provides better predictions [@cutler2007random; @peters2007random; @breiman2001random; @delgado2014]. First, random forests are non-parametric, thus the data do not need to come from a specific distribution (e.g., Gaussian) and can contain collinear variables [@cutler2007random]. Second, random forests work well with very large numbers of predictors [@cutler2007random]. Lastly, random forests deal with model selection uncertainty, as predictions are based upon a consensus of many models and not just a single model selected with some measure of goodness of fit.

The research presented here builds on past work in three areas. First, we built, assessed, and compared two random forest models of chlorophyll a with 1) in situ and universally available GIS data and then 2) universally available GIS data only. Second, we converted the chlorophyll a estimates, for both models, to trophic state and assessed prediction accuracy and uncertainty. Third, we examined the important predictors for both models. Lastly, to promote transparency in our work, the analysis code and data are available as an R package from https://github.com/USEPA/LakeTrophicModelling.

Methods

Data and Study Area

We utilized three primary sources of data for this study: the National Lakes Assessment (NLA), the National Land Cover Dataset (NLCD), and lake morphometry modeled from the NHDPlus and National Elevation Dataset [@usepa2009national;@homer2004development;@xian2009updating;@hollister2010volume;@hollister_predicting_2011;@lakemorpho2014]. All datasets are national in extent and provide a unique snapshot view of the condition of lakes in the conterminous United States during the summer of 2007.

The NLA dataset was collected during the summer of 2007 and the final datasets were released in 2009 [@usepa2009national]. With consistent methods and metrics collected at over 1000 locations across the conterminous United States (Figure \ref{fig:nlaMap}), the NLA provides a unique opportunity to examine broad scale patterns in lake productivity. The NLA collected data on biophysical measures of lake water quality and habitat as well as an assessment of the phytoplankton community. For this analysis, we only use the various water quality measurements from the National Lakes Assessment [@usepa2009national]. Additionally, the NLA included ecological regions as defined in the Wadeable Streams Assessment (Figure \ref{fig:ecoregion_map}) [@usepa2006wadeable; @omernik1987ecoregions].

Adding to the monitoring data collected via the NLA, we used the 2006 NLCD data to examine landscape-level drivers of trophic status in lakes. The NLCD is a national land use/land cover dataset that also provides estimates of impervious surface. We calculated the total proportion of each NLCD land use/land cover class and total percent impervious surface within a 3 kilometer buffer surrounding each lake [@homer2004development;@xian2009updating]. We chose this buffer distance for several reasons. First, in some preliminary efforts we tried a variety of scales (300 m, 1.5 km, and 3 km), and they had little impact on prediction accuracy. Second, since we also include local, lake-specific variables (see below) as well as the broader scale ecoregions, the 3 km buffer made intuitive sense as representative of land use impacts that would not be accounted for by these other variables. While many regional classifications and scales have been shown to be effective [e.g., @cheruvelil2013multi], the 3 km buffer represents an intermediate scale that is greater than immediate parcels but smaller than regional and whole-basin measures.
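
As an illustration of how such buffer-based landscape metrics can be derived, the following is a minimal sketch using the raster and rgeos packages (neither is loaded in the setup above); the NLCD file name, the lake point layer, and all object names are hypothetical, and the buffer summaries used in this analysis are already included in `ltmData`.

```r
# A minimal sketch, not the workflow used to build ltmData.
# Assumes the NLCD grid and lake points share an equal-area projection in meters.
library(raster)
library(rgeos)

nlcd <- raster("nlcd_2006_albers.tif")                             # hypothetical NLCD raster
lake_pts <- readOGR("extdata", "nla_2007_sites", verbose = FALSE)  # hypothetical point layer

# 3000 m buffer around each lake sample point
lake_buf <- gBuffer(lake_pts, byid = TRUE, width = 3000)

# Proportion of each land cover class within each buffer
lc_vals <- extract(nlcd, lake_buf)
lc_prop <- lapply(lc_vals, function(v) prop.table(table(v)))
```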

Local, lake-specific characteristics have been shown to be important [@read2015importance]. To account for this, we used measures of lake morphometry (i.e., depth, volume, fetch, etc.). As these data are difficult to obtain for large numbers of lakes over broad regions, we used modeled estimates of lake morphometry [@hollister2010volume;@hollister_predicting_2011;@lakemorpho2014]. These included: surface area, shoreline length, shoreline development, maximum depth, mean depth, lake volume, maximum lake length, mean lake width, maximum lake width, and fetch.

Predicting Trophic State with Random Forests

Random forest is a machine learning algorithm that aggregates numerous decision trees in order to obtain a consensus prediction of the response [@breiman2001random]. Bootstrapped sample data are recursively partitioned according to a given random subset of predictor variables, and a predetermined number of decision trees are developed. With each new tree, the sample data subset is randomly selected, and with each new split, the subset of candidate predictor variables is randomly selected. For a more detailed description of random forests see Breiman [-@breiman2001random] and Cutler et al. [-@cutler2007random].
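
As a concrete illustration of those two sources of randomness, the toy example below fits a small regression random forest with the randomForest package used throughout this analysis; `ntree` sets the number of bootstrapped trees and `mtry` the number of candidate predictors considered at each split. The data are simulated and purely illustrative.

```r
library(randomForest)

# Simulated data: y depends on x1 and x2, x3 is noise
set.seed(42)
toy <- data.frame(x1 = rnorm(200), x2 = rnorm(200), x3 = rnorm(200))
toy$y <- 2 * toy$x1 - toy$x2 + rnorm(200, sd = 0.5)

# 500 trees, each grown on a bootstrap sample of the rows;
# mtry = 2 predictors are tried at every split
toy_rf <- randomForest(y ~ ., data = toy, ntree = 500, mtry = 2,
                       importance = TRUE)
toy_rf               # out-of-bag MSE and % variance explained
importance(toy_rf)   # %IncMSE and IncNodePurity for each predictor
```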

Random forests are able to handle numerous correlated variables without a decrease in prediction accuracy; however, one possible shortcoming of this approach is that the resulting model may be difficult to interpret, thus selecting the most important variables is a key first step. Several methods have been proposed to do this with random forests. For instance, this problem is often faced in gene selection, and in that field a variable selection method based on random forests has been successfully applied and implemented in R as the varSelRF package [@diaz2006gene]; however, it is limited to classification problems. Additionally, others have suggested alternative variable importance measures, but these are only needed with a large number of categorical variables, which are selected against by the traditional random forest approach [@strobl2007bias].

In our case, we predict a continuous variable, chlorophyll a, directly, so varSelRF does not apply, and nearly all of our variables are continuous, so the approach suggested by Strobl [-@strobl2007bias] is not necessary. We therefore developed an approach similar to varSelRF but applied to random forests built with regression trees. With this approach, we fit a full random forest model that includes all variables and a large number of trees. We then rank the variables using the increase in mean square error, which has been shown to be a less biased metric of importance than the mean decrease in the Gini coefficient [@strobl2007bias]. Using this ranking, we create a random forest with the top two variables and record the mean square error and adjusted R\textsuperscript{2} of the resultant forest. We then repeat this process, adding one variable at a time in order of importance. With this information we identify both the top variables and the point at which adding variables no longer improves the fit of the overall model. The variables selected at that point are used as the "reduced model." This method provides a minimal set of variables that maximizes model accuracy, allowing us to start with a full suite of predictor variables and end with a smaller, easier to interpret set.
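
A simplified sketch of this procedure is shown below; it mirrors the logic of the `varsel_regression_rf()` call in the setup code but is an illustrative outline, not the exact implementation in the LakeTrophicModelling package.

```r
varsel_regression_sketch <- function(y, x, ntree = 1000) {
  # 1. Full model: rank predictors by percent increase in MSE
  full_rf <- randomForest(y = y, x = x, ntree = ntree, importance = TRUE)
  imp <- importance(full_rf, type = 1)  # %IncMSE
  ranked <- rownames(imp)[order(imp[, 1], decreasing = TRUE)]

  # 2. Refit with the top 2, 3, ... predictors, recording MSE and R-squared
  fits <- data.frame(n_vars = 2:length(ranked), mse = NA, rsq = NA)
  for (i in fits$n_vars) {
    rf_i <- randomForest(y = y, x = x[, ranked[1:i], drop = FALSE], ntree = ntree)
    fits[fits$n_vars == i, c("mse", "rsq")] <- c(rf_i$mse[ntree], rf_i$rsq[ntree])
  }

  # 3. Plot fits$mse against fits$n_vars to find where adding more
  #    variables no longer improves the model (the "reduced model")
  list(ranked_vars = ranked, fits = fits)
}
```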

Model Details

We used the randomForest package in R to build predictive models of chlorophyll a with two sets of predictors [@liaw2002randomForest]. The first included in situ and universally available GIS predictors. We refer to this as the "All variables" model. For the second model we used just the universally available data (i.e., no in situ information). This is referred to as the "GIS only" model. A list of all considered variables is in Appendix 1. Our separation of predictors was chosen so that we could highlight the additional predictive performance provided by adding the in situ water quality variables on top of the GIS only variables. Lastly, we used only complete cases (i.e., missing data were removed) so the total number of observations varied among models.

Our modelling workflow was as follows:

  1. Identify a minimal set of variables from the full suite of variables (Appendix 1) that maximizes the accuracy of the random forest algorithm. This minimal set of variables, the reduced model, is calculated separately for each of the two models.
  2. Using R's randomForest package, develop two random forest models with 5000 trees each ("All variables" and "GIS only").
  3. Assess model performance for both the predicted chlorophyll a and the categorical trophic state classifications. Trophic state was defined using the NLA chlorophyll a trophic state cutoffs (Table \ref{tab:trophicStateTable}).
  4. Examine importance and partial dependence of the most important variables.

Measures of Model Performance and Variable Importance

We assessed the performance of the random forest models in two ways. First, we compared the mean squared error and the adjusted R\textsuperscript{2} of the models. Second, we examined the accuracy of the model predictions when converted to trophic state classes via a confusion matrix (Table \ref{tab:trophicStateTable}). A confusion matrix shows agreement and disagreement in tabular form, with predicted values forming the columns of the matrix and observed values the rows. From this tabulated information we calculated the total accuracy (i.e., percent correctly predicted) and the kappa coefficient, which adjusts for the agreement expected by chance alone (calculated from the row and column totals of the matrix) [@cohen1960coefficient; @hubert1985comparing]. The kappa coefficient can range from -1 to 1, with 0 equaling the agreement expected by chance alone. Values greater than 0 represent agreement greater than would be expected by chance. A kappa coefficient greater than approximately 0.6 is considered "substantial" agreement [@landis1977measurement]. Negative values are rare and would indicate less agreement between the predicted and observed values than expected by chance. We use kappa as a means of comparison across models as well as within subsets of a given model. Additionally, random forest builds each tree on bootstrapped, random subsets of the original data; thus, a separate independent validation dataset is not required and random forest error estimates are expected to be unbiased [@breiman2001random].
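
For reference, if $p_o$ is the observed proportion of agreement (the total accuracy) and $p_e$ is the proportion of agreement expected by chance, computed from the row and column totals of the confusion matrix, then the kappa coefficient is

$$\kappa = \frac{p_o - p_e}{1 - p_e}$$

Both quantities are returned by `classAgreement()` in the e1071 package, which is what the setup code above uses.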

Random forests explicitly measure variable importance with two metrics: mean decrease in the Gini coefficient and percent increase in mean squared error. These measure the impact on the overall model when a particular variable is included and thus can be used to assess importance [@breiman2001random]. The Gini index has been shown to be biased [@strobl2007bias]; thus, we used percent increase in mean squared error to assess variable importance. Lastly, partial dependence plots provide a mechanism to examine the partial relationship between individual variables and the response variable [@jones2015exploratory]. We examined these plots for the top variables, as ranked by percent increase in mean squared error, for each of the reduced models.

Trophic State Probabilities

One of the powerful features of random forests is the ability to aggregate a very large number of competing models, or trees. Each tree provides an independent prediction, or vote, for a possible outcome. In the context of our chlorophyll a models, we have 5,000 estimates of chlorophyll a for each lake. We convert these values to trophic states (Table \ref{tab:trophicStateTable}), then count the total votes for each class and divide by the total possible votes to get an estimate of the probability that a lake is in a given trophic state. For instance, for a single lake (National Lakes Assessment ID = NLA06608-0005), the vote probabilities for the "All variables" model were `r 100*round(all_rf_ts_prob[all_rf_ts_prob$nla_id=="NLA06608-0005",][1],2)`% for oligotrophic, `r 100*round(all_rf_ts_prob[all_rf_ts_prob$nla_id=="NLA06608-0005",][2],2)`% for mesotrophic, `r 100*round(all_rf_ts_prob[all_rf_ts_prob$nla_id=="NLA06608-0005",][3],2)`% for eutrophic, and `r 100*round(all_rf_ts_prob[all_rf_ts_prob$nla_id=="NLA06608-0005",][4],2)`% for hypereutrophic. The maximum probability provides the predicted class, in this case oligotrophic, and suggests little uncertainty in this prediction. We refer to this value as the "prediction probability."
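
In outline, these probabilities can be derived from the individual tree predictions as in the sketch below; this is a simplified stand-in for the `class_prob_rf()` helper used in the setup code and reuses the `all_rf`, `all_dat`, `ts_4_brks`, and `ts_4_cats` objects defined there.

```r
# Per-tree chlorophyll a predictions: one row per lake, one column per tree
tree_preds <- predict(all_rf, all_dat, predict.all = TRUE)$individual

# Convert each tree's prediction to a trophic state and tally the votes
vote_prob <- t(apply(tree_preds, 1, function(p) {
  votes <- table(cut(p, breaks = ts_4_brks, labels = ts_4_cats))
  votes / length(p)
}))

# Predicted class is the category with the most votes; its proportion of
# the votes is the "prediction probability"
pred_class <- ts_4_cats[max.col(vote_prob)]
pred_prob <- apply(vote_prob, 1, max)
head(data.frame(pred_class, pred_prob))
```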

Further, we might expect higher total accuracy for lakes that have more certain predictions. This should be evident in the Kappa coefficient calculated for the subset of lakes whose prediction probability is at or above a given threshold. To test this we use an approach similar to one outlined by Paul and MacDonald [-@paul2005development] and implemented by Hollister et al. [-@hollister2008cprob], and examine the change in the Kappa coefficient as a function of the prediction probability for both models.
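
A minimal sketch of that calculation is given below; it assumes a data frame like `all_rf_ts_prob` with the prediction probability in a `max` column and with predicted and observed trophic state columns named, hypothetically, `pred_ts` and `obs_ts` (the manuscript figure itself is produced by the `condAccuracy()` helper, which also writes `cp1` and `cp2`).

```r
kappa_by_threshold <- function(prob_df, thresholds = seq(0.3, 0.9, by = 0.05)) {
  data.frame(
    threshold = thresholds,
    n_lakes = sapply(thresholds, function(th) sum(prob_df$max >= th)),
    kappa = sapply(thresholds, function(th) {
      keep <- prob_df$max >= th
      if (sum(keep) < 10) return(NA)  # too few lakes to estimate agreement
      cm <- table(prob_df$pred_ts[keep], prob_df$obs_ts[keep])
      classAgreement(cm)$kappa
    })
  )
}

# e.g., kappa_by_threshold(all_rf_ts_prob)
```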

Results

Our complete dataset included `r nrow(ltmData)` lakes; however, `r sum(is.na(ltmData$CHLA))` lakes did not have chlorophyll a data. Thus, our modelling was based on data for `r nrow(ltmData[!is.na(ltmData$TS_CHLA_4),])` lakes. The lakes were well distributed across the four trophic state categories (Table \ref{tab:trophicStateTable}) and spatially throughout the United States (Figure \ref{fig:nlaMap}).

Models: All Variables

The model built with all predictors used `r nrow(all_dat)` total observations, had a mean squared error of `r round(all_rf$mse[5000],2)`, and had an R\textsuperscript{2} of `r round(all_rf$rsq[5000],2)`. The total accuracy for the four trophic states was `r round(all_ts4_total_acc,3)*100`% and the kappa coefficient was `r round(all_ts4_kappa,2)` (Table \ref{tab:Confusion_All_4}). The variable selection process identified a reduced model with 20 variables (Figure \ref{fig:all_varsel_figure}). The six most important variables were turbidity, total phosphorus, total nitrogen, elevation, total organic carbon, and N:P ratio (Figure \ref{fig:All_Importance}). The role that each played in predicting chlorophyll a varied (Figure \ref{fig:all_partial_dependence}).

Models: GIS Only Variables

The "GIS only" model was built using `r nrow(gis_dat)` total observations, had a mean squared error of `r round(gis_rf$mse[5000],2)`, and had an R\textsuperscript{2} of `r round(gis_rf$rsq[5000],2)`. The four trophic states were predicted with a total accuracy of `r round(gis_ts4_total_acc,3)*100`% and a kappa coefficient of `r round(gis_ts4_kappa,2)` (Table \ref{tab:Confusion_GIS_4}). The variable selection process for this model produced a reduced model with 15 variables (Figure \ref{fig:gis_varsel_figure}). The six most important variables were ecoregion, percent cropland, elevation, latitude, percent evergreen forest, and mean lake depth (Figures \ref{fig:GIS_Importance} & \ref{fig:gis_partial_dependence}).

Trophic State Probabilities

The "All variables" model provides more certain predictions, with a median prediction probability of `r round(median(apply(all_rf_ts_prob[1:4],1,max)),2)` versus `r round(median(apply(gis_rf_ts_prob[1:4],1,max)),2)` for the "GIS only" model (Figure \ref{fig:prob_cdf}). Additionally, the Kappa coefficient of the predictions is a function of this uncertainty. Lakes with more certain predictions were more accurately classified and had higher Kappa coefficients (Figure \ref{fig:cond_prob_fig}). For both models, when prediction probabilities are approximately 0.8 or higher, the models had a Kappa coefficient of ~1. This represents `r round(sum(all_rf_ts_prob$max>0.8)/nrow(all_dat),2)*100`% of the lakes for the "All variables" model and `r round(sum(gis_rf_ts_prob$max>0.8)/nrow(gis_dat),2)*100`% of the lakes for the "GIS only" model. A Kappa coefficient of 0.6 or higher is considered "substantial" agreement [@landis1977measurement]. For the "GIS only" model this is seen with `r round(sum(gis_rf_ts_prob$max>cp2[cp2$kappa >= 0.6,][1,1])/nrow(gis_dat),2)*100`% of the lakes. Lastly, as prediction probabilities increased, the difference in Kappa coefficient between the two models decreased (Figure \ref{fig:cond_prob_fig} & Tables \ref{tab:cond_prob_tab_all} & \ref{tab:cond_prob_tab_gis}).

Discussion

Trophic State Probabilities

Not surprisingly, lakes with more certain predictions (i.e., higher prediction probabilities) were more accurately predicted (Figure \ref{fig:cond_prob_fig}). The fact that the difference in accuracy (as measured by the Kappa coefficient) between the two models decreased as certainty in the prediction increased suggests that models with lower overall accuracy, such as the "GIS only" model, may have acceptable accuracy for many individual cases (Tables \ref{tab:cond_prob_tab_all} & \ref{tab:cond_prob_tab_gis}). Additionally, the prediction probabilities may be mapped for each of the four classes (Figure \ref{fig:gis_probability_map}). The spatial patterns show little variability between the "All variables" and "GIS only" models, thus we only show the results from the more broadly applicable "GIS only" model (Figure \ref{fig:gis_probability_map}).

This map provides several insights. First, since low uncertainty is associated with high accuracy, this map shows the broad spatial patterns of lake trophic state across the United States (i.e., darker colors are more likely to be correctly predicted). Hypereutrophic lakes are much more commonly predicted in the Midwest and southeastern United States. Clear, oligotrophic lakes occur in the northwestern United States, through the western mountains, and in the northeastern United States. The middle trophic states are more evenly distributed across the country. Lastly, this particular map is very similar to simply mapping the raw data. However, it highlights what could be done if the "GIS only" model were used to map lakes without measured chlorophyll a values, which would provide probabilities of given trophic states for all lakes in the United States.

Partial dependencies of explanatory variables

In line with past predictive modelling of chlorophyll a concentrations, the "All variables" model selected the water quality variables (turbidity, total organic carbon, total nitrogen, total phosphorus, and N:P ratio) as important [@downing2001predicting]. While there is variation in the response of chlorophyll a to changes in nutrient concentrations, the general pattern suggests that limiting nutrients have predictable impacts. If we examine the partial dependencies of these variables, we see a general linear increase in log chlorophyll a with nitrogen, phosphorus, and organic carbon concentrations (Figure \ref{fig:all_partial_dependence}). This relationship holds until nutrient concentrations become saturated. The partial dependency plot (Figure \ref{fig:all_partial_dependence}) for the nitrogen:phosphorus ratio is more complicated, indicating that for ratios less than ~14 chlorophyll a increases, but after ~14 there is a marked decrease. The effect of the nitrogen:phosphorus ratio on chlorophyll has been the subject of considerable research, and our results are consistent with the majority of the findings, suggesting that at low ratio values nitrogen is limiting [@downing1992nitrogen; @smith2009eutrophication]. Conversely, at higher ratios phosphorus may be limiting. Such non-linearity would be a cause for concern with linear models; however, linearity is not an assumption of tree-based modelling approaches such as random forests.

Turbidity was selected as the most important variable in the "All variables" model. The partial dependency analysis shows that, similar to the nutrients discussed above, log chlorophyll a increases with increased turbidity. At first this may seem counterintuitive, since we might expect productivity to decrease as turbidity increases and, therefore, light availability decreases [@tilzer1988secchi; @bilotta2008understanding]. However, algal biomass can contribute heavily to measures of turbidity, and we expect greater productivity to lead to increased turbidity [@hansson1992factors]. We interpret this pattern as indicating that as chlorophyll a concentrations increase, we see a concomitant increase in turbidity due to increased algal cell densities.

Elevation was selected as an important predictive variable in both the "All variables" and "GIS only" models; the partial dependencies (Figures \ref{fig:all_partial_dependence} & \ref{fig:gis_partial_dependence}) indicate a negative relationship between elevation and chlorophyll a concentration that is probably due to the fact that the location of mountains in the United States is the spatial inverse of the distribution of agricultural and urban lands. As elevation increases, we expect decreased loads due to smaller watershed contributing areas. In contrast, lower elevation sites will have larger drainage areas and greater potential for increased nutrient loads from urban and agricultural sources.

The variables in the "GIS only" model captured the large scale spatial pattern of the trophic status gradient of lakes across the United States. In addition to elevation, mentioned above, the model was most sensitive to latitude and ecoregion. In general, chlorophyll a concentrations are highest in the southern portions of the study area, where temperatures can be higher (a known driver of productivity), elevations lower, and agricultural impacts more pronounced. Likewise, ecoregion (see Figures \ref{fig:ecoregion_map} & \ref{fig:gis_partial_dependence}) has a pronounced effect, indicating continental-scale effects of land use and geography. Agriculturally dominated landscapes such as the Temperate Plains, Southern Plains, and Coastal Plains show the highest levels of chlorophyll a, whereas high elevation zones (Western Mountains), arid lands (Xeric), and northern habitats (Upper Midwest) have lower concentrations.

Further evidence for the role of land use/land cover variables, similar to results from Read et al. [-@read2015importance], is shown by the selection of the percent cropland and percent evergreen forest variables. As indicated by the partial dependency plots (Figure \ref{fig:gis_partial_dependence}), chlorophyll a increases with cropland and decreases with evergreen cover. It is not surprising that croplands were selected, given the overwhelming impact of agriculture on the eutrophication process. Evergreens and chlorophyll a concentrations show a negative association (Figure \ref{fig:gis_partial_dependence}); as the percent of evergreens increases, we are likely to see increased elevation and soil differences that limit agriculture.

Lastly, morphometry (e.g., depth) also proved to be important in the prediction of lake trophic state [@genkai2005eutrophication]. As morphometry shows little to no broad scale spatial pattern and is unique to a given lake, these data are likely illuminating the local, lake scale drivers such as in-lake nutrient processing and residence time.

Conclusions

Our research goals were to explore the utility of a widely used data mining algorithm, random forests, in the modelling of chlorophyll a and lake trophic state. Further, we hoped to examine the utility of these models when built with only ubiquitous GIS data, which allows estimation of trophic state for all lakes in the United States. The "All variables" model had a mean squared error of `r round(all_rf$mse[all_rf$ntree],2)` and an adjusted R\textsuperscript{2} of `r round(all_rf$rsq[all_rf$ntree],2)`, whereas the "GIS only" model had a mean squared error of `r round(gis_rf$mse[gis_rf$ntree],2)` and an adjusted R\textsuperscript{2} of `r round(gis_rf$rsq[gis_rf$ntree],2)`. Our total accuracy in predicting chlorophyll a-based trophic states was `r round(all_ts4_total_acc,2)*100`% for the "All variables" model and `r round(gis_ts4_total_acc,2)*100`% for the "GIS only" model.

While the "GIS only" model showed lower prediction accuracy than the "All variables" model, the association between prediction uncertainty and total accuracy (Figure \ref{fig:cond_prob_fig} and Tables \ref{tab:cond_prob_tab_all} & \ref{tab:cond_prob_tab_gis}) suggests that the "GIS only" model will provide reasonable estimates of trophic state for many lakes across the United States. Furthermore, we can map the uncertainty of the predictions; thus, we know the spatial patterns and locations of the lakes for which we are certain, or not, of their predicted trophic state. Given this, and that these models may be applied to any lake in the United States, we can recommend using this model.

Future iterations of this modelling effort may be able to utilize modeled predictions of nutrients to improve accuracy and also maintain broad applicability [@milstead2013estimating]. Changes such as these have several advantages. First, this would allow for estimating changes to chlorophyll a and trophic state as a function of changing nutrient loads, which are expected due to climate change [@jones2014lake; @adrian2009lakes; @jeppesen2011climate; @moss2011allied]. Second, with the ability to make predictions for most lakes in the United States, the "GIS only" models could be used as a source of information on national scale phenomena. For example, predictions of chlorophyll a, with measures of uncertainty, could be used in efforts to scale up the contributions from lakes to broad scale estimates of gross primary production.

For the "All variables" model, the in situ water quality variables drove the predictions. This is not surprising. For the "GIS only" model, the results were more nuanced. Three broad categories of predictors were routinely selected as important: broad scale spatial patterns in trophic state, land use/land cover controls of trophic state, and local, lake-scale controls driven by lake morphometry.

Our results raise three important considerations related to managing eutrophication. First, the broad scale patterning, indicated by ecoregion as an important variable, suggests regional trends. This is noteworthy because it suggests that efforts to monitor, model and manage eutrophication and cyanobacteria should be undertaken at both national and regional levels. This corroborates past findings that regional drivers are important for water quality [@cheruvelil2013multi]. Second, while direct control of water quality in lakes would have a large impact, the land use/land cover drivers (i.e., non-point sources) of water quality are also important, and better management of the spatial distribution of important classes such as forest and agriculture can provide some level of control on trophic state and amount of cyanobacteria present. Third, in-lake processes (i.e., residence time, nutrient cycling, etc.) are, as expected, important and need to be part of any management strategy. Building on these efforts through updated models, direct prediction of cyanobacteria, and additional information on the regional differences will help us get a better handle on the broad scale dynamics of productivity in lakes and the potential risk to human health from cyanobacteria blooms.

Acknowledgements

We would like to thank Farnaz Nojavan, Nathan Schmucker, John Kiddon, Joe LiVolsi, Tim Gleason, and Wayne Munns for constructive reviews of this paper. This paper has not been subjected to Agency review. Therefore, it does not necessarily reflect the views of the Agency. Mention of trade names or commercial products does not constitute endorsement or recommendation for use. This contribution is identified by the tracking number ORD-011075 of the Atlantic Ecology Division, Office of Research and Development, National Health and Environmental Effects Research Laboratory, US Environmental Protection Agency.

\newpage

Figures

###REDO
#create list of all selected variables and assign a color
vars<-ls(pattern="_vars")
all_vars<-vector()
for(i in vars){
  all_vars<-c(all_vars,get(i))
}
all_vars<-unique(all_vars)

#all_movies_palette<-vector()
#for(i in 1:dim(namelist)[1]){
#  all_movies_palette<-c(all_movies_palette,wes.palette(namelist[i,2],
#                                                       namelist[i,1]))
#}
#all_movies_palette<-unique(all_movies_palette)
#zissou<-c(wes.palette(5,"Zissou"),wes.palette(5,"Rushmore"))
#col_lu<-data.frame(variables=all_vars,hexcode=sample(all_movies_palette,length(all_vars)))

```{r, cache=FALSE}
state<-map_data('state')
lakes_alb<-data.frame(ltmData[["AlbersX"]],ltmData[["AlbersY"]],data=ltmData$TS_CHLA_4)
p4s<-"+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"
ll<-"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
lakes_alb_sp<-SpatialPointsDataFrame(coordinates(lakes_alb[,1:2]),proj4string=CRS(p4s),data = lakes_alb["data"])
lakes_dd<-spTransform(lakes_alb_sp,CRS=CRS(ll))
lakes_dd<-data.frame(coordinates(lakes_dd),lakes_dd$data)

mycolor<-wes.palette(5,"Rushmore")
mycolor<-c(mycolor[1],mycolor[2],mycolor[4])
mycolor<-c("grey25","white","black")

cbPalette <- c("#0072B2","#999999","#F0E442","#D55E00")
cbPalette <- viridis(4)

names(lakes_dd)<-c("long","lat","data")
nlaMap(state,lakes_dd,mycolor,cats=TRUE,catColor=cbPalette)
```

\newpage

```{r, cache=FALSE, message=FALSE}
wsa9@data$WSA_9<-factor(wsa9@data$WSA_9,levels=c("NAP","CPL","SAP","TPL","UMW","SPL",
                                                   "NPL","WMT","XER"),
                           labels=c("Northern\nAppalachians (NAP)",
                                    "Coastal\nPlains (CPL)",
                                    "Southern\nAppalachians (SAP)",
                                    "Temperate\nPlains (TPL)",
                                    "Upper\nMidwest (UMW)","Southern\nPlains (SPL)",
                                    "Northern\nPlains (NPL)","Western\nMountains (WMT)",
                                    "Xeric (XER)"),
                           ordered=TRUE)
#my_col<-data.frame(WSA_9=c(NA,levels(wsa9@data$WSA_9)),color=viridis(10))
#wsa9@data<-merge(wsa9@data,my_col)
ecor_map(wsa9)
```

\newpage

```{r, cache=FALSE}
varsel_plot(all_vs)
```

\newpage

```{r, cache=FALSE}
importancePlot(all_rf, data_def=data_def,type='acc',size=3)#,aes(colour=all_ts4_color))
```

\newpage

```{r, cache=FALSE, warning=FALSE, fig.height=7}
multiplot(partial_plot(all_rf_turb_pd,
                       x="Turbidity (NTU)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(all_rf_ntl_pd,
                       x=expression(paste('Total Nitrogen',' (',mu,'g/L)')),
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(all_rf_ptl_pd,
                       x=expression(paste('Total Phosphorus',' (',mu,'g/L)')),
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(all_rf_elev_pd,
                       x="Elevation (m)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(all_rf_toc_pd,
                       x="Total Organic Carbon (mg/L)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(all_rf_np_pd,
                       x="N:P Ratio",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          layout=matrix(1:6,ncol=2,byrow=TRUE))
```

\newpage

```{r, cache=FALSE}
varsel_plot(gis_vs)
```

\newpage

```{r, cache=FALSE}
importancePlot(gis_rf, data_def=data_def,type='acc',size=3)#,aes(colour=all_ts4_color))
```

\newpage

```{r, cache=FALSE, warning=FALSE, message=FALSE, fig.height=7}

#Order Ecoregions E-W
gis_rf_ecor_pd$x <- factor(gis_rf_ecor_pd$x,levels=c("NAP","CPL","SAP","TPL","UMW","SPL",
                                                   "NPL","WMT","XER"),
                           ordered=TRUE)

multiplot(partial_plot(gis_rf_ecor_pd,
                       x="Ecoregion",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(gis_rf_crops_pd,
                       x="Cropland (%)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(gis_rf_elev_pd,
                       x="Elevation (m)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(gis_rf_lat_pd,
                       x="Latitude (m)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(gis_rf_conifer_pd,
                       x="Conifer (%)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          partial_plot(gis_rf_mndepth_pd,
                       x="Mean Lake Depth (m)",
                       y=expression(paste('Log10 Chl ', italic("a"),' (',mu,'g/L)'))),
          layout=matrix(1:6,ncol=2,byrow=TRUE))
```

\newpage

```{r, cache=FALSE}
#Generated in hack chunk
prob_cdf_fig
```

\newpage

```{r, cache=FALSE, warning=FALSE}
#Generated in hack chunk
cond_acc_fig
```

\newpage

```{r, fig.cap="... trophic states \\label{fig:gis_probability_map}", cache=FALSE, fig.height=7}
state<-map_data('state')
lakes_alb<-data.frame(ltmData[["AlbersX"]],ltmData[["AlbersY"]], ltmData[["NLA_ID"]])
p4s<-"+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"
ll<-"+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
lakes_alb_sp<-SpatialPointsDataFrame(coordinates(lakes_alb[,1:2]),proj4string=CRS(p4s),
                                     data=data.frame(nla_id = as.character(lakes_alb[,3])))
lakes_dd<-spTransform(lakes_alb_sp,CRS=CRS(ll))
lakes_dd<-data.frame(coordinates(lakes_dd),lakes_dd$nla_id)
names(lakes_dd)<-c("long","lat","nla_id")
ts_prob_map(state,lakes_dd,gis_rf_ts_prob)
```

\newpage

```{r, fig.cap="... trophic state. Shows spatial patterns of prediction uncertainty. \\label{fig:predicted_prob_map}", cache=FALSE}
pred_prob_map(state,lakes_dd,gis_rf_ts_prob)
```

\newpage

```{r, cache=FALSE, fig.height=7}
density_plot(gis_rf_ts_prob,xvar="max",x="Predicted Probability")
```

\newpage

#Tables

```{r, cache=FALSE}
ts_4<-c("oligotrophic","mesotrophic","eutrophic","hypereutrophic")
ts_3<-c("oligotrophic","mesotrophic/eutrophic","mesotrophic/eutrophic","hypereutrophic")
ts_2<-c("oligotrophic/mesotrophic","oligotrophic/mesotrophic","eutrophic/hypereutrophic","eutrophic/hypereutrophic")
co<-c("<= 2",">2-7",">7-30",">30")
xdf<-data.frame(ts_4,ts_2,co)
names(xdf)<-c("Trophic State (4 class)","Trophic State (2 class)",
              "$\\mu$g/L Cut-off")
kable(xdf)
```

\newpage

```{r, cache=FALSE}
#NEED TO FIX ALL OF THESE - Tables not DFs and need to get class accuracy added
cmt<-table_to_df(round(all_ts4_conmat,2))
cmt$Class_Accuracy <- apply(cmt,1,sum)
for(i in 1:nrow(cmt)){
  cmt$Class_Accuracy[i]<-round((cmt[i,i]/cmt$Class_Accuracy[i])*100,2)
}
dimnames(cmt)[[2]][5]<-'Class Accuracy (%)'
kable(cmt,row.names=T)
```

\newpage

```{r, cache=FALSE}
cmt<-table_to_df(round(gis_ts4_conmat,2)) 
cmt$Class_Accuracy <- apply(cmt,1,sum)
for(i in 1:nrow(cmt)){
  cmt$Class_Accuracy[i]<-round((cmt[i,i]/cmt$Class_Accuracy[i])*100,2)
}
dimnames(cmt)[[2]][5]<-'Class Accuracy (%)'
kable(cmt,row.names=T)
```

\newpage

```{r, cache=FALSE}
all_min_acc<-cp1[1,4]
all_50_acc<-cp1[cp1[,1]>=0.5,][1,4]
all_60_acc<-cp1[cp1[,1]>=0.6,][1,4]
all_70_acc<-cp1[cp1[,1]>=0.7,][1,4]
all_80_acc<-cp1[cp1[,1]>=0.8,][1,4]
all_90_acc<-cp1[cp1[,1]>=0.9,][1,4]
all_min_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[1,1],])/
                 nrow(all_rf_ts_prob))
all_50_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.5,][1,1],])/
                 nrow(all_rf_ts_prob))
all_60_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.6,][1,1],])/
                 nrow(all_rf_ts_prob))
all_70_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.7,][1,1],])/
                 nrow(all_rf_ts_prob))
all_80_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.8,][1,1],])/
                 nrow(all_rf_ts_prob))
all_90_perc<-(nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.9,][1,1],])/
                 nrow(all_rf_ts_prob))
all_min_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[1,1],])
all_50_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.5,][1,1],])
all_60_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.6,][1,1],])
all_70_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.7,][1,1],])
all_80_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.8,][1,1],])
all_90_samp<-nrow(all_rf_ts_prob[all_rf_ts_prob$max >= cp1[cp1[,1]>=0.9,][1,1],])

condprob_t<-data.frame(pred_prob=c("All","0.50","0.60","0.70","0.80","0.90"),
                       all_acc = c(all_min_acc,all_50_acc,all_60_acc,
                                   all_70_acc,all_80_acc,all_90_acc),
                       all_perc = c(all_min_perc,all_50_perc,all_60_perc,
                                   all_70_perc,all_80_perc,all_90_perc),
                       all_samp = c(all_min_samp,all_50_samp,all_60_samp,
                                   all_70_samp,all_80_samp,all_90_samp))
condprob_t[,2:3]<-round(condprob_t[,2:3],2)
condprob_t[,2:3]<-100*condprob_t[,2:3]
names(condprob_t)<-c("Prediction Prob.",
                     "Kappa Coefficient",
                     "Percent of Sample",
                     "Number of Samples")
kable(condprob_t,row.names=F)
```

\newpage

```{r, cache=FALSE}
gis_min_acc<-cp2[1,4]
gis_50_acc<-cp2[cp2[,1]>=0.5,][1,4]
gis_60_acc<-cp2[cp2[,1]>=0.6,][1,4]
gis_70_acc<-cp2[cp2[,1]>=0.7,][1,4]
gis_80_acc<-cp2[cp2[,1]>=0.8,][1,4]
gis_90_acc<-cp2[cp2[,1]>=0.9,][1,4]
gis_min_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[1,1],])/
                 nrow(gis_rf_ts_prob))
gis_50_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.5,][1,1],])/
                 nrow(gis_rf_ts_prob))
gis_60_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.6,][1,1],])/
                 nrow(gis_rf_ts_prob))
gis_70_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.7,][1,1],])/
                 nrow(gis_rf_ts_prob))
gis_80_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.8,][1,1],])/
                 nrow(gis_rf_ts_prob))
gis_90_perc<-(nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.9,][1,1],])/
                 nrow(gis_rf_ts_prob))
gis_min_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[1,1],])
gis_50_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.5,][1,1],])
gis_60_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.6,][1,1],])
gis_70_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.7,][1,1],])
gis_80_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.8,][1,1],])
gis_90_samp<-nrow(gis_rf_ts_prob[gis_rf_ts_prob$max >= cp2[cp2[,1]>=0.9,][1,1],])

condprob_t<-data.frame(pred_prob=c("All","0.50","0.60","0.70","0.80","0.90"),
                       gis_acc = c(gis_min_acc,gis_50_acc,gis_60_acc,
                                   gis_70_acc,gis_80_acc,gis_90_acc),
                       gis_perc = c(gis_min_perc,gis_50_perc,gis_60_perc,
                                   gis_70_perc,gis_80_perc,gis_90_perc),
                       gis_samp = c(gis_min_samp,gis_50_samp,gis_60_samp,
                                   gis_70_samp,gis_80_samp,gis_90_samp))
condprob_t[,2:3]<-round(condprob_t[,2:3],2)
condprob_t[,2:3]<-100*condprob_t[,2:3]
names(condprob_t)<-c("Prediction Prob.",
                     "Kappa Coefficient",
                     "Percent of Sample",
                     "Number of Samples")
kable(condprob_t,row.names=F)
#pander::pandoc.table(condprob_t,keep.line.breaks=T,split.tables=Inf)
```

\newpage

Appendix 1. Variable Definitions

```{r}
data_def <- read.csv("../inst/extdata/data_def.csv", stringsAsFactors = FALSE)
data_def<-arrange(data_def,type, variable_names)
mn<-vector("numeric",length(data_def$variable_names))
se<-vector("numeric",length(data_def$variable_names))
for(i in 1:length(data_def$variable_names)){
  if(is.numeric(ltmData[,data_def$variable_names[i]])){
    mn[i]<-mean(ltmData[,data_def$variable_names[i]],na.rm=TRUE)
    se[i]<-sd(ltmData[,data_def$variable_names[i]],na.rm=TRUE)/sqrt(length(ltmData[,data_def$variable_names[i]]))
  } else {
    mn[i]<-NA
    se[i]<-NA
  }
}
data_def<-data.frame(data_def,round(mn,1),round(se,1))
##Get Summary Stats and Try to Fix ug/L in table
names(data_def)<-c("Variable Names","Description", "Source","Mean","Std. Error")
kable(data_def,row.names=FALSE,align = 'l',padding=0)
```

\newpage

```{r}
#Send session info to a file:
capture.output(sessionInfo(),file="sessioninfo.txt")
```

References


