#' @title R6 Class for storing inputs and data for analysis and simulations
#'
#' @description R6 Class for storing user specified inputs and processing data
#' for the analysis/simulation and Rx building steps of the OFPE data cycle.
#' This object includes user selections such as the field and year of data
#' to export from an OFPE database and the type of data (grid or observed) for analysis
#' and simulation/prescription generation.
#'
#' Inputs can be supplied directly to this class during instantiation, however
#' this is NOT recommended except for advanced users. It is recommended that the
#' user supplies the database connection and uses the interactive selection
#' methods to select user inputs.
#'
#' This class stores inputs from the user and has the methods for for exporting
#' data from the database and processing the data for analysis, simulation, and
#' prescription building.
#' @seealso \code{\link{DBCon}} for database connection class,
#' \code{\link{ModClass}} for model fitting class that relies on data in DatClass,
#' \code{\link{SimClass}} for simulation class that rely on data in DatClass.
#' @export
DatClass <- R6::R6Class(
"DatClass",
public = list(
#' @field dbCon Database connection object connected to an OFPE formatted
#' database, see DBCon class.
dbCon = NULL,
#' @field farmername Name of the farmer that owns the selected field.
farmername = NULL,
#' @field fieldname Name of the field for analysis. Selected from
#' the 'all_farms.fields' table of an OFPE formatted database.
fieldname = NULL,
#' @field respvar Response variable(s) to optimize experimental inputs based
#' off of. The user can select 'Yield' and/or 'Protein' based on data
#' availability. User must select at least 'Yield'.
respvar = NULL,
#' @field expvar Experimental variable to optimize, select/input
#' 'As-Applied Nitrogen' or 'As-Applied Seed Rate'. This is the type of
#' input that was experimentally varied across the field as part of the
#' on-farm experimentation.
expvar = NULL,
#' @field sys_type Provide the type of system used in the experiment.
#' This determines the price used for calculating net-return and for
#' the net-return of the opposite type. Select from "Conventional" and
#' "Organic". The net-returns will be calculated with the corresponding
#' economic data based on this choice, and the 'NRopp' management
#' scenario (see SimClass$executeSim) will be based on the opposite
#' (e.g. if you are growing conventional wheat, the management outcome
#' 'NRopp' shows the net-return calculated from organically grown wheat).
#' In the example, organic prices are calculated from 0 N fertilizer rates,
#' however with seeding rates it is purely the difference in the price received
#' used to calculate net-return.
sys_type = NULL,
#' @field yldyears The year(s) of interest for the yield response
#' variables in the selected field. This must be a named list with the
#' specified field names.
yldyears = NULL,
#' @field proyears The year(s) of interest for the protein response
#' variables in the selected field. This must be a named list with the
#' specified field names.
proyears = NULL,
#' @field mod_grid Select whether to use gridded or observed data
#' locations for the analysis step. See the 'AggInputs' class for more
#' information on the 'GRID' option. The user must have aggregated data
#' with the specified GRID option prior to this step. (i.e. you will not
#' have access to data aggregated with the 'Grid' option if you have not
#' executed the process of aggregation with the 'Grid' option selected. The
#' same principle applies for the 'Observed' option. It is recommended that
#' the analysis is performed with 'Observed' data, and for the simulation to
#' be performed with 'Grid' data.
mod_grid = NULL,
#' @field sim_grid Select whether to use gridded or observed data
#' locations for the simulation and subsequent prescription building step.
#' See the 'AggInputs' class for more information on the 'GRID' option.
#' The user must have aggregated data with the specified GRID option prior
#' to this step. (i.e. you will not have access to data aggregated with the
#' 'Grid' option if you have not executed the process of aggregation with
#' the 'Grid' option selected. The same principle applies for the 'Observed'
#' option. It is recommended that the analysis is performed with 'Observed'
#' data, and for the simulation to be performed with 'Grid' data.
sim_grid = NULL,
#' @field dat_used Option for the length of year to use data in the analysis,
#' simulation, and prescription building steps. See the 'AggInputs' class
#' documentation for more information on the 'dat_used' selection.
dat_used = NULL,
#' @field center TRUE/FALSE. Option for whether to center explanatory data around
#' each explanatory variables mean or to use the raw observed explanatory varaible
#' data. Centering is recommended as it puts variables on similar scales and makes
#' the model fitting process less error prone.
center = NULL,
#' @field split_pct Select the percentage of data to use for the training dataset
#' in the analysis step. The training dataset is used to fit the model to each
#' of the crop responses. The difference will be split into a validation dataset
#' that is used to evaluate the model performance on data it has not 'seen' before.
split_pct = NULL,
#' @field clean_rate Select the maximum rate that could be realistically be applied
#' by the application equipment (sprayer or seeder). This is used for a rudimentary
#' cleaning of the data that removes observations with as-applied rates above this
#' user supplied threshold. Rates above this threshold should be able to be classified
#' as machine measurement errors. For example, based on knowledge of the prescription/
#' experiment applied and taking into account double applications on turns, a rate
#' for as-applied nitrogen might be something like 300 - 400 lbs N/acre.
clean_rate = NULL,
#' @field mod_dat Based off of the user selections such as 'mod_grid', this is a named
#' list for each response variable ('yld' and/or 'pro'). The data in each of
#' these lists are processed and then split into training and validation datasets. This
#' data is used for the model fitting and evaluations steps.
mod_dat = NULL,
#' @field sim_dat Based off of the user selections such as 'sim_grid', this is a named
#' list for each year specified in the SimClass 'sim_years' field. The data in each of
#' these lists are processed and used in the Monte Carlo simulation.
sim_dat = NULL,
#' @field mod_num_means Named vector of the means for each numerical covariate, including
#' the experimental variable. This is used for converting centered data back to the
#' original form. The centering process does not center four numerical variables; the
#' x and y coordinates, the response variable (yld/pro), and the experimental variable.
#' This is for the data specified from the analysis data inputs (grid specific).
mod_num_means = NULL,
#' @field sim_num_means Named vector of the means for each numerical covariate, including
#' the experimental variable. This is used for converting centered data back to the
#' original form. The centering process does not center three numerical variables; the
#' x and y coordinates, the response variable (yld/pro) and the experimental variable.
#' This is for the data specified from the analysis data inputs (grid specific).
sim_num_means = NULL,
#' @field opp_sys_type Opposite of the user selected system type ('sys_type'). This is
#' used to select the correct price received to calculate 'NRopp' in the Monte Carlo
#' simulation.
opp_sys_type = NULL,
#' @field fieldname_codes Data.frame with a column for the names of the fields selected
#' by the user and a corresponding code. This is used in the simulation data when being
#' passed to C++ functions as purely numeric matrices.
fieldname_codes = NULL,
#' @field SI Logical, whether to use SI units. If TRUE, yield and experimental data are
#' converted to kg/ha. If FALSE, the default values from the database are used. These are
#' bu/ac for yield and lbs/ac for experimental data (nitrogen or seed).
SI = NULL,
#' @param dbCon Database connection object connected to an OFPE formatted
#' database, see DBCon class.
#' @param farmername Name of the farmer that owns the selected field.
#' @param fieldname Name of the field to for analysis. Selected from
#' the 'all_farms.fields' table of an OFPE formatted database.
#' @param respvar Response variable(s) to optimize experimental inputs based
#' off of. The user can select 'Yield' and/or 'Protein' based on data
#' availability. User must select at least 'Yield'.
#' @param expvar Experimental variable to optimize, select/input
#' 'As-Applied Nitrogen' or 'As-Applied Seed Rate'. This is the type of
#' input that was experimentally varied across the field as part of the
#' on-farm experimentation.
#' @param sys_type Provide the type of system used in the experiment.
#' This determines the price used for calculating net-return and for
#' the net-return of the opposite type. Select from "Conventional" and
#' "Organic". The net-returns will be calculated with the corresponding
#' economic data based on this choice, and the 'NRopp' management
#' scenario (see SimClass$executeSim) will be based on the opposite
#' (e.g. if you are growing conventional wheat, the management outcome
#' 'NRopp' shows the net-return calculated from organically grown wheat).
#' In the example, organic prices are calculated from 0 N fertilizer rates,
#' however with seeding rates it is purely the difference in the price received
#' used to calculate net-return.
#' @param yldyears The year(s) of interest for the yield response
#' variables in the selected field. This must be a named list with the
#' specified field names.
#' @param proyears The year(s) of interest for the protein response
#' variables in the selected field. This must be a named list with the
#' specified field names.
#' @param mod_grid Select whether to use gridded or observed data
#' locations for the analysis step. See the 'AggInputs' class for more
#' information on the 'GRID' option. The user must have aggregated data
#' with the specified GRID option prior to this step. (i.e. you will not
#' have access to data aggregated with the 'Grid' option if you have not
#' executed the process of aggregation with the 'Grid' option selected. The
#' same principle applies for the 'Observed' option. It is recommended that
#' the analysis is performed with 'Observed' data, and for the simulation to
#' be performed with 'Grid' data.
#' @param dat_used Option for the length of year to use data in the analysis,
#' simulation, and prescription building steps. See the 'AggInputs' class
#' documentation for more information on the 'dat_used' selection.
#' @param center TRUE/FALSE. Option for whether to center explanatory data around
#' each explanatory variables mean or to use the raw observed explanatory varaible
#' data. Centering is recommended as it puts variables on similar scales and makes
#' the model fitting process less error prone.
#' @param split_pct Select the percentage of data to use for the training dataset
#' in the analysis step. The training dataset is used to fit the model to each
#' of the crop responses. The difference will be split into a validation dataset
#' that is used to evaluate the model performance on data it has not 'seen' before.
#' @param SI Logical, whether to use SI units. If TRUE, yield and experimental data are
#' converted to kg/ha. If FALSE, the default values from the database are used. These are
#' bu/ac for yield and lbs/ac for experimental data (nitrogen or seed).
#' @param clean_rate Select the maximum rate that could be realistically be applied
#' by the application equipment (sprayer or seeder). This is used for a rudimentary
#' cleaning of the data that removes observations with as-applied rates above this
#' user supplied threshold. Rates above this threshold should be able to be classified
#' as machine measurement errors. For example, based on knowledge of the prescription/
#' experiment applied and taking into account double applications on turns, a rate
#' for as-applied nitrogen might be something like 300 - 400 lbs N/acre. NOTE: make sure to
#' specify in the correct units, for example if SI = FALSE specify in lbs/ac, otherwise
#' in kg/ha.
#' @return A new 'AggInputs' object.
initialize = function(dbCon,
farmername = NULL,
fieldname = NULL,
respvar = NULL,
expvar = NULL,
sys_type = NULL,
yldyears = NULL,
proyears = NULL,
mod_grid = NULL,
dat_used = NULL,
center = NULL,
split_pct = NULL,
SI = FALSE,
clean_rate = NULL) {
stopifnot(!is.null(dbCon))
self$dbCon <- dbCon
if (!is.null(farmername)) {
stopifnot(is.character(farmername))
self$farmername <- farmername
}
if (!is.null(fieldname)) {
stopifnot(is.character(fieldname))
self$fieldname <- fieldname
}
if (!is.null(respvar)) {
stopifnot(is.character(respvar),
any(grepl("Yield|Protein", respvar)))
self$respvar <- ifelse(respvar == "Yield", "yld", "pro")
stopifnot(any(grepl("yld", self$respvar)))
}
if (!is.null(expvar)) {
stopifnot(is.character(expvar),
any(grepl("As-Applied Nitrogen|As-Applied Seed Rate", expvar)))
self$expvar <- ifelse(expvar == "As-Applied Nitrogen", "aa_n", "aa_sr")
}
if (!is.null(sys_type)) {
stopifnot(is.character(sys_type),
any(grepl("Conventional|Organic", sys_type)))
self$sys_type <- ifelse(sys_type == "Conventional", "conv", "org")
self$opp_sys_type <- ifelse(self$sys_type == "conv", "org", "conv")
}
if (!is.null(yldyears)) {
stopifnot(is.list(yldyears),
any(grepl(paste(self$fieldname, collapse = "|"),
names(yldyears))),
length(yldyears) == length(fieldname))
self$yldyears <- yldyears
}
if (!is.null(proyears)) {
stopifnot(is.list(proyears),
any(grepl(paste(self$fieldname, collapse = "|"),
names(proyears))),
length(proyears) == length(fieldname))
self$proyears <- proyears
}
if (!is.null(mod_grid)) {
stopifnot(is.character(mod_grid),
grepl("Grid|Observed", mod_grid))
self$mod_grid <- ifelse(mod_grid == "Grid", "grid", "obs")
}
if (!is.null(dat_used)) {
stopifnot(is.character(dat_used))
self$dat_used <- ifelse(dat_used == "Decision Point",
"decision_point",
"full_year")
}
if (!is.null(center)) {
stopifnot(is.logical(center))
self$center <- center
}
if (!is.null(self$yldyears) & !is.null(self$proyears)) {
private$years <- list(yldyears=self$yldyears,
proyears=self$proyears)
} else {
if (!is.null(self$yldyears)) {
private$years <- list(yldyears=self$yldyears)
}
if (!is.null(self$proyears)) {
private$years <- list(proyears=self$proyears)
}
}
if (!is.null(split_pct)) {
stopifnot(is.numeric(split_pct))
self$split_pct <- split_pct
}
if (!is.null(clean_rate)) {
stopifnot(is.numeric(clean_rate))
self$clean_rate <- clean_rate
}
stopifnot(is.logical(SI))
self$SI <- SI
},
#' @description
#' Interactive method for selecting inputs related to the data used in the
#' analysis, simulation, and subsequent prescription generation steps. The
#' description below describes the process of interactively selecting the
#' necessary parameters needed for the automated analysis, simulation, and
#' prescription building.
#'
#' The user first selects a farmer for which they want to analyze a field
#' from, which is used to compile a list of available fields ready for
#' analysis, indicated by its presence in the farmername_a schema of the
#' OFPE database.
#'
#' The user then selects the response variables to optimize on and the
#' experimental variable to optimize. The user must know what data is
#' available for the specific field (i.e. if the user select 'Protein' they
#' must have aggregated protein data for the specified field, or if the
#' user selects 'As-Applied Seed Rate' seed rates must have been the
#' experimental variable of interest when aggregating data).
#'
#' The user then selects the location of aggregated data to use for both
#' the analysis and simulation/prescription building steps. The user also
#' needs to select the length of the year for which 'current' year data
#' was aggregated for (March 30th decision point or the full year).
#'
#' The user also has the choice of which vegetation index data to use as
#' covariates, as well as the preferred source for precipitation and
#' growing degree day data. Finally, the user has the option of whether
#' to center covariate data or to use the raw observed data for analysis
#' and simulation and the percent of data to use in the training data for
#' model fitting. The rest of the data is withheld for validation.
#' @param None No arguments needed because passed in during class
#' instantiation.
#' @return A 'DatClass' object with complete user selections.
selectInputs = function() {
private$.selectFarmer(self$dbCon$db)
private$.selectRespvar()
private$.selectField(self$dbCon$db)
private$.selectYears(self$dbCon$db)
private$.selectExpvar()
private$.selectSystemType()
private$.selectAggLocs()
private$.selectAggLOY()
private$.selectCenter()
private$.selectDatSplitPct()
private$.selectSI()
private$.selectCleanRate()
},
#' @description
#' This function calls the private methods for data gathering and
#' processing. The data gather step takes the user selected inputs
#' for the field, the response variables, and the data types ('mod_grid')
#' and exports the appropriate data into a a list, called 'mod_dat' with
#' lists, named for each response variable ('yld' and/or
#' 'pro') with each data type data from all fields selected.
#'
#' The processing step goes through each data frame contained in the
#' nested 'mod_dat' list and trims the data based on
#' the user selections for the vegetation index and precipitation and
#' growing degree day sources. If the user selected to center the
#' covariate data, the values of each variable will be subtracted from
#' the mean of that variable. In this case, a named
#' vector of each variable and the mean will be created for reverting
#' back to observed values.
#'
#' After this step, the data in 'mod_dat' is split into training and
#' validation sets based on the percentage of data the user selected
#' to include in the training dataset.
#' @param None No arguments needed because passed in during class
#' @return A named list with training and validation data, called
#' 'mod_dat', for each response variable ('yld' and/or 'pro').
setupDat = function() {
self$mod_dat <- private$.fetchDat(self$mod_grid, self$respvar) %>%
lapply(private$.processDat) %>%
invisible()
self$mod_num_means <- as.list(self$respvar) %>%
`names<-`(self$respvar)
self$mod_num_means <- lapply(self$mod_dat, private$.findMeans)
self$mod_dat <- mapply(private$.centerDat,
self$mod_dat,
self$mod_num_means,
SIMPLIFY = FALSE)
private$.splitDat()
},
#' @description
#' This function calls the private methods for data gathering and
#' processing. The gathering process takes the vector of simulation
#' years and gathers the appropriate 'sat' data from the OFPE database
#' and then processes the data using the same parameters as for the
#' data used in the model fitting process.
#' @param sim_years Vector of years available in the database
#' to gather to simulate management outcomes in.
#' @return A data.table with the user specified data for the simulation.
getSimDat = function(sim_years) {
self$sim_dat <- as.list(sim_years) %>%
`names<-`(sim_years)
self$sim_dat <- lapply(self$sim_dat,
private$.gatherSatDat,
'sat',
self$fieldname,
'grid')
self$sim_dat <- lapply(self$sim_dat,
private$.processSatDat) %>%
lapply(data.table::as.data.table) %>%
invisible()
self$sim_num_means <- as.list(self$respvar) %>%
`names<-`(self$respvar)
self$sim_num_means <- lapply(self$sim_dat, private$.findMeans)
self$sim_dat <- mapply(private$.centerDat,
self$sim_dat,
self$sim_num_means,
SIMPLIFY = FALSE) %>%
lapply(private$.makeAllSimColsNumeric)
}
# TODO
# more fields
# like year dat (if selected)
),
private = list(
years = NULL,
.selectFarmer = function(db) {
self$farmername <- as.character(select.list(
unique(DBI::dbGetQuery(db, "SELECT farmer FROM all_farms.farmers")$farmer),
multiple = FALSE,
title = "Select farm to analyze a field in."))
},
.selectRespvar = function() {
respVar <- as.character(select.list(
c("Yield", "Protein"),
multiple = TRUE,
title = "Select response variable(s) to optimize on. In some cases protein data is not available. However, either yield or protein must be selected."
))
self$respvar <- ifelse(respVar == "Yield", "yld", "pro")
stopifnot(any(grepl("yld", self$respvar)))
},
.selectExpvar = function() {
expVar <- as.character(select.list(
c("As-Applied Nitrogen", "As-Applied Seed Rate"),
title = "Select experimental variable."))
self$expvar <- ifelse(expVar == "As-Applied Nitrogen", "aa_n", "aa_sr")
},
.selectSystemType = function() {
sys_type <- as.character(select.list(
c("Conventional", "Organic"),
title = "Select your system type."))
self$sys_type <- ifelse(sys_type == "Conventional", "conv", "org")
self$opp_sys_type <- ifelse(self$sys_type == "conv", "org", "conv")
},
.selectField = function(db) {
flds <- rep(list(NA), length(self$respvar))
for (i in 1:length(flds)) {
tabExist <- DBI::dbGetQuery(
db,
paste0("SELECT EXISTS (
SELECT 1
FROM information_schema.tables
WHERE table_schema = '", self$farmername, "_a'
AND table_name = '", self$respvar[i], "')"
)) %>%
as.numeric() %>%
as.logical()
if (tabExist) {
flds[[i]] <- DBI::dbGetQuery(
db,
paste0("SELECT DISTINCT field FROM ",
self$farmername, "_a.", self$respvar[i], ";")
)$field
} else {
self$respvar[i] <- NA
}
}
flds <- unlist(flds) %>%
na.omit()
self$respvar <- self$respvar[!is.na(self$respvar)]
self$fieldname <-
as.character(select.list(
unique(flds),
multiple = TRUE,
title = "Select field(s) to analyze data for. Multiple can be selected if desired (i.e. sec1east and sec1west)."
)
)
},
.selectYears = function(db) {
for (i in 1:length(self$respvar)) {
years <- rep(list(NA), length(self$fieldname)) %>%
`names<-`(self$fieldname)
for (j in 1:length(years)) {
years[[j]] <- as.character(select.list(unique(DBI::dbGetQuery(
db,
paste0("SELECT DISTINCT year
FROM ", self$farmername, "_a.", self$respvar[i], " ", self$respvar[i], "
WHERE ", self$respvar[i], ".field = '", self$fieldname[j], "'"))$year),
multiple = TRUE,
title = paste0("Select years from ",
self$fieldname[j],
" to get ", ifelse(self$respvar[i] == "yld", "Yield", "Protein"),
" data for to include in analysis.")))
}
if (self$respvar[i] == "yld") {
self$yldyears <- years
} else {
self$proyears <- years
}
}
if (!is.null(self$yldyears) & !is.null(self$proyears)) {
private$years <- list(yldyears=self$yldyears,
proyears=self$proyears)
} else {
if (!is.null(self$yldyears)) {
private$years <- list(yldyears=self$yldyears)
}
if (!is.null(self$proyears)) {
private$years <- list(proyears=self$proyears)
}
}
},
.selectAggLocs = function() {
gridOrObs <- as.character(select.list(
c("Grid", "Observed"),
title = paste0("Select whether to use gridded ('Grid') or observed ('Observed') data locations for the analysis step.")
))
self$mod_grid <- ifelse(gridOrObs == "Grid", "grid", "obs")
},
.selectAggLOY = function() {
data_used <- as.character(select.list(
c("Decision Point", "Full Year"),
title = paste0("Select the data constraint for determining the time span for which to gather data.")
))
self$dat_used <- ifelse(data_used == "Decision Point",
"decision_point",
"full_year")
},
.selectCenter = function() {
self$center <- as.logical(select.list(
c(TRUE, FALSE),
title = paste0("Select whether to center explanatory data (TRUE) or to use the measured explanatory data (FALSE).")
))
},
.selectDatSplitPct = function() {
self$split_pct <- as.numeric(readline(
"Provide the percentage of data to use as a training dataset for model fitting. The rest of the data will be witheld for model validation: "
))
},
.selectSI = function() {
self$SI <- as.logical(select.list(
c(TRUE, FALSE),
title = paste0("Select whether to use SI units. If so, yield and experimental data will be converted to kg/ha. Otherwise the default is bu/ac and lbs/ac.")
))
},
.selectCleanRate = function() {
self$clean_rate <- as.numeric(readline(
"Provide the threshold for as-applied rates above which are obvious machine measurment errors (e.g. 350 lbs/N/acre): "
))
},
.fetchDat = function(GRID, respvar) {
dat <- as.list(respvar) %>%
`names<-`(respvar)
dat <- mapply(
private$.importDBdat,
dat,
private$years,
MoreArgs = list(GRID = GRID),
SIMPLIFY = FALSE
)
return(dat)
},
.importDBdat = function(respvar, years, GRID) {
dat <- rep(list(NA),
length(years)) %>%
`names<-`(names(years))
#fieldname <- names(dat)
dat <- mapply(private$.gatherDBdat,
years,
respvar,
self$fieldname,
MoreArgs = list(GRID = GRID)) %>%
data.table::rbindlist()
return(dat)
},
.gatherDBdat = function(years, respvar, fieldname, GRID) {
dat <- as.list(years) %>%
`names<-`(years)
dat <- lapply(dat,
private$.getDBdat,
respvar,
fieldname,
GRID)
return(dat)
},
.getDBdat = function(year, respvar, fieldname, GRID) {
OFPE::removeTempFarmerTables(self$dbCon$db, self$farmername)
tt <- invisible(
DBI::dbSendQuery(
self$dbCon$db,
paste0(
"CREATE TABLE ", self$farmername,"_a.temp AS (SELECT *
FROM ", self$farmername, "_a.", respvar, " ", respvar,"
WHERE field = '", fieldname, "'
AND year = '", year, "'
AND grid = '", GRID, "'
AND datused = '", self$dat_used,"');"
)
)
)
DBI::dbClearResult(tt)
tt <- invisible(
DBI::dbSendQuery(
self$dbCon$db,
paste0(
"ALTER TABLE ",
self$farmername, "_a.temp
DROP COLUMN geometry;"
)
)
)
DBI::dbClearResult(tt)
db_dat <- invisible(
DBI::dbGetQuery(
self$dbCon$db,
paste0("SELECT * FROM ", self$farmername, "_a.temp;")
)
)
tt <- invisible(
DBI::dbSendQuery(
self$dbCon$db,
paste0(
"DROP TABLE ", self$farmername, "_a.temp;"
)
)
)
DBI::dbClearResult(tt)
## TEMP - REMOVE!
# set.seed(342134)
# if (respvar == "yld") {
# db_dat <- db_dat[runif(nrow(db_dat) * 0.05, 1, nrow(db_dat)), ]
# } else {
# db_dat <- db_dat[runif(nrow(db_dat) * 0.1, 1, nrow(db_dat)), ]
# }
## TEMP - REMOVE!
return(db_dat)
},
.processDat = function(dat) {
dat <- private$.trimCols(
dat, c("grid", "size", "datused", "farmer", "prev_year")) %>%
private$.makeFactors() %>%
private$.makeOLMmeans() %>%
private$.convertRespAndExpDat() %>%
private$.cleanDat()
return(dat)
},
.processSatDat = function(dat) {
dat <- private$.trimCols(
dat, c("grid", "size", "datused", "farmer", "prev_year")) %>%
private$.makeFactors() %>%
private$.makeOLMmeans() %>%
private$.cleanDat()
return(dat)
},
.trimCols = function(dat, trim_cols) {
return(dat[, !(names(dat) %in% trim_cols), with = FALSE])
},
.makeFactors = function(dat) {
dat$field <- factor(dat$field)
dat$year <- factor(dat$year)
dat$musym <- factor(dat$musym)
dat$grtgroup <- factor(dat$grtgroup)
dat$texture0cm <- factor(dat$texture0cm)
dat$texture10cm <- factor(dat$texture10cm)
dat$texture30cm <- factor(dat$texture30cm)
dat$texture60cm <- factor(dat$texture60cm)
dat$texture100cm <- factor(dat$texture100cm)
dat$texture200cm <- factor(dat$texture200cm)
return(dat)
},
## take the mean of each OLM data by depth
.makeOLMmeans = function(dat) {
cols <- grep("bulkdensity", names(dat))
dat$bulkdensity <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("claycontent", names(dat))
dat$claycontent <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("sandcontent", names(dat))
dat$sandcontent <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("phw", names(dat))
dat$phw <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("watercontent", names(dat))
dat$watercontent <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("carboncontent", names(dat))
dat$carboncontent <- rowMeans(dat[, cols, with = FALSE], na.rm = TRUE)
cols <- grep("texture", names(dat))
dat$texture <- apply(dat[, cols, with = FALSE], 1, private$.Mode)
return(dat)
},
.Mode = function(x, na.rm = FALSE) {
if(na.rm){
x = x[!is.na(x)]
}
ux <- unique(x)
return(ux[which.max(tabulate(match(x, ux)))])
},
.convertRespAndExpDat = function(dat) {
if (self$SI) {
cols <- grep("aa_n|aa_sr", names(dat))
for (i in 1:length(cols)) {
# 1 lbs/ac = 1.12085 kg/ha
nm <- as.symbol(cols[i])
dat <- dat[,(cols[i]) := dat[, cols[i], with = FALSE][[1]] * 1.12085]
}
if (any(grepl("yld", names(dat)))) {
cols <- grep("yld", names(dat))
for (i in 1:length(cols)) {
# 1 bu/ac (assuming 60lbs/bu) = 67.251 kg/ha
nm <- as.symbol(cols[i])
dat <- dat[,(cols[i]) := dat[, cols[i], with = FALSE][[1]] * 67.251]
}
}
}
return(dat)
},
.cleanDat = function(dat) {
if (any(grepl("aa_n|aa_sr|yld|pro", names(dat)))) {
col <- names(dat)[grep("^aa_n$|^aa_sr$", names(dat))]
for(i in 1:length(col)){
rows <- which(dat[, names(dat) %in% col[i], with = FALSE] < self$clean_rate)
dat <- dat[rows, ]
rows <- which(dat[, names(dat) %in% col[i], with = FALSE] >= 0)
dat <- dat[rows, ]
dat <- na.omit(dat, col)
}
col <- names(dat)[grep("^yld$|^pro$", names(dat))]
for(i in 1:length(col)){
rows <- which(dat[, names(dat) %in% col[i], with = FALSE] > 0)
dat <- dat[rows, ]
dat <- na.omit(dat, col)
}
}
return(dat)
},
.centerDat = function(dat, num_means) {
dat_list <- split(dat, dat$year)
dat_list <- mapply(private$.centerFun,
dat_list,
num_means,
SIMPLIFY = FALSE)
dat <- data.table::rbindlist(dat_list)
return(dat)
},
.centerFun = function(subdat, num_means) {
num_names <- names(num_means)
if (self$center) {
no_cent_cols <- c("^x$", "^y$", "cell_id", "field",
"size", "grid", "datused", "farmer",
"year", "prev_year", "geometry",
"grtgroup", "texture0cm", "texture10cm",
"texture30cm", "texture60cm",
"texture100cm", "texture200cm", "musym",
"texture")
no_cent_col_ids <- grep(paste(no_cent_cols, collapse = "|"), names(subdat))
dfc <- subdat %>% dplyr::select(-no_cent_col_ids) %>%
as.data.frame()
for (i in 1:ncol(dfc)) {
if (is.numeric(dfc[, i])) {
dfc[, i] <- dfc[, i] - mean(dfc[, i], na.rm = T)
}
}
names(dfc) <- paste0(names(dfc), "_cent")
subdat <- cbind(subdat, dfc)
rm(dfc) # save space in mem
}
return(subdat)
},
.findMeans = function(dat) {
num_names <- names(dat)[sapply(dat, is.numeric)]
num_names <- num_names[!grepl(paste0("^x$|^y$|^yld$|^pro$"), num_names)]
num_means <- rep(as.list(NA), length(unique(dat$year))) %>%
`names<-`(unique(dat$year))
means <- by(dat[, num_names, with = FALSE], dat$year, sapply, mean, na.rm = TRUE)
for (i in 1:length(unique(dat$year))) {
means[[i]][grep(paste0("^", self$expvar, "$"), names(means[[i]]))] <- 0 # don't center exp var
if (self$center) {
num_means[[i]] <- means[[i]]
} else {
num_means[[i]] <- rep(0, length(num_names)) %>%
`names<-`(num_names)
}
}
return(num_means)
},
.splitDat = function() {
set.seed(6201994)
self$mod_dat <- lapply(self$mod_dat, private$.splitDatTrnVal) %>%
`names<-`(names(self$mod_dat))
},
.splitDatTrnVal = function(dat) {
if (all(is.na(dat$musym))) {
sub_list <- split(dat, list(dat$field, dat$year))
} else {
sub_list <- split(dat, list(dat$field, dat$year, dat$musym))
}
split_dat <- lapply(sub_list, private$.dualSplit, self$split_pct)
out_trn <- sapply(split_dat,'[',"trn") %>%
data.table::rbindlist()
out_val <- sapply(split_dat,'[',"val") %>%
data.table::rbindlist()
out_dat <- list(trn = out_trn, val = out_val)
return(out_dat)
},
.subDat = function(STRING, dat) {
YEAR <- substr(STRING,
stringr::str_locate(STRING, "20")[1],
stringr::str_locate(STRING, "20")[1] + 3)
FIELD <- gsub(paste0(".", YEAR), "", STRING)
dat <- subset(dat, dat$field == FIELD & dat$year == YEAR)
return(dat)
},
.dualSplit = function(dat, trnprop) {
trnprop <- trnprop * 0.01
valprop <- 1 - trnprop
sum_sam <- round(nrow(dat) * trnprop) +
round(nrow(dat) * valprop)
dif <- nrow(dat) - sum_sam
if (dif == 0) {
subL <- split(
dat,
sample(c(rep("trn", round(nrow(dat) * trnprop)),
rep("val", round(nrow(dat) * valprop))))
)
} else {
subL <- split(
dat,
sample(c(rep("trn", round(nrow(dat) * trnprop) + dif),
rep("val", round(nrow(dat) * valprop))))
)
}
return(subL)
},
.gatherSatDat = function(year, respvar, fieldname, GRID) {
dat_list <- as.list(fieldname) %>%
`names<-`(fieldname)
for (i in 1:length(dat_list)) {
dat_list[[i]] <- private$.getDBdat(year, respvar, fieldname[i], GRID)
}
dat <- data.table::rbindlist(dat_list)
return(dat)
},
.makeAllSimColsNumeric = function(dat) {
stopifnot(!is.null(dat$field),
!is.null(dat$cell_id))
cell_id_split <- stringr::str_split(dat$cell_id, "_", simplify = FALSE) %>%
lapply(as.numeric)
cell_id_split <- do.call(rbind, cell_id_split) %>%
data.table::as.data.table() %>%
`names<-`(c("row", "col"))
dat <- cbind(cell_id_split, dat)
dat$cell_id <- NULL
self$fieldname_codes <-
data.frame(field = unique(dat$field),
field_code = seq(1, length(unique(dat$field))))
dat$field <- match(dat$field, self$fieldname_codes$field)
return(dat)
}
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.