# plotting functions ------------------------------------------------------
# the goal of these functions is to compare a function to any available datasets
# with the same predictor and response variables. These functions are not from
# anywhere that I know of, so this is the closest I can think of a way to
# validate them
plot_equation_with_all_data <- function(df, xvar, yvar, est_f){
requireNamespace("assertthat")
assert_that(has_name(detritus_wider_correct_frenchguiana, xvar))
assert_that(has_name(detritus_wider_correct_frenchguiana, yvar))
dat <- df %>%
select_(xs = xvar, ys = yvar, "dataset_name") %>%
filter(complete.cases(.))
# browser()
assert_that(nrow(dat) > 0)
# extract the function: TODO make sure it is length 1 before doing this
f <- est_f[[1]]
curve_dat <- data_frame(xs = seq_range(dat$xs, n = 40, expand = FALSE),
ys = f(xs))
dat %>%
ggplot(aes(x = xs, y = ys, colour = dataset_name)) +
geom_point() +
geom_line(aes(colour = NULL), data = curve_dat) +
viridis::scale_color_viridis(discrete = TRUE) +
coord_trans("log", "log") +
xlab(xvar) +
ylab(yvar)
}
## make the setup data
create_equation_table <- function() {
frame_data(
~xvar, ~yvar, ~est_f, ~used_on_dataset,
"detritus150_20000", "detritus0_150", function(coarse) {exp(0.68961 * log(coarse) - 0.11363)}, c(6),
"detritus1500_20000", "detritus10_1500", function(med) {exp(0.79031 * log(med) - 0.070033)}, c(111),
"detritus150_850", "detritus0_150", function(med) { ((0.9857 *med) + 1.496)}, c(166,171,181)
)
}
plot_data_with_equation_table <- function(.equation_table, .detritus_data) {
.equation_table %>%
select(-used_on_dataset) %>%
by_row(plot_equation_with_all_data %>%
# use the most recent dataset for these calculations
partial(df = .detritus_data) %>%
# lift from using named arguements to using a list
lift_dl %>%
possibly(otherwise = NA_real_))
}
#' filter the dataset to just the data that this particular model uses.
#'
#' Before using an equation to predict a certain detritus volume, the data
#' should be filtered to show just the data that will be used to do the
#' prediction. This simplifies things like validation, plotting, and combining
#' with the rest of the data later.
#'
#' @param used_on_dataset Which dataset will this be used on?
#' @param xvar the predictor variable used. only used for checking to make sure
#' it exists at this site
#' @param df the dataset that this is used on (the most recent one)
#' @param ... cheating! allowing unused columns to pass through and come out the
#' other end of by_row
filter_by_dataset_id <- function(used_on_dataset, xvar, df, ...){
ds_id <- unlist(used_on_dataset)
df %>%
filter(dataset_id %in% ds_id) %>%
# check to make sure predictors actually exist
assert_(not_na, xvar) %>%
# and that dataset is not 0 from the filter above
verify(nrow(.) > 0)
}
mutate_new_col <- function(xvar, yvar, est_f, filtered_data){
# predicted flag as column or different header?
# test for all(is.na()) in target
newname <- paste0(yvar, "_function")
assert_that(length(est_f) == 1)
# unlist function(it is in a list column)
ff <- est_f[[1]]
mexp <- lazyeval::interp(~ ff(x), x = as.name(xvar))
ll <- list(mexp) %>% set_names(newname)
filtered_data[[1]] %>%
mutate_(.dots = ll)
}
do_filter_dataset_id <- function(.equation_table, .detritus_data) {
.equation_table %>%
by_row(filter_by_dataset_id %>%
lift %>%
possibly(NA_real_),
df = .detritus_data,
.to = "filtered_data")
}
do_mutate_new_col <- function(.filtered_df_table){
.filtered_df_table %>%
# this only works because of possibly() above, and in reality should do
# nothing -- why are functions being applied to absent data; there must be an
# error
filter(!is.na(filtered_data)) %>%
select(-used_on_dataset) %>%
by_row(mutate_new_col %>%
lift)
}
# functions for creating, preparaing and modifying data relating to models -------------------
add_new_columns_for_prediction <- function(.detritus_data) {
.detritus_data %>%
mutate(detritus10_1500_2000_NA = detritus10_1500 + detritus1500_20000 + detritus20000_NA) %>%
mutate(detritus_over_150 = detritus0_150 + detritus150_20000 + detritus20000_NA) %>%
mutate(detritus850_20000_sum = if_else(is.na(detritus850_20000),
true = detritus1500_20000 + detritus850_1500,
false = detritus850_20000)) %>%
mutate(detritus0_NA_sum = detritus0_150 + detritus150_850 + detritus850_20000_sum + detritus20000_NA) %>%
mutate(detritus150_NA_sum = detritus150_850 + detritus850_20000_sum + detritus20000_NA) %>%
mutate(detritus0_20000_sum = detritus0_150 + detritus150_850 + detritus850_20000_sum,
detritus20000_NA_na0 = if_else(detritus20000_NA < 0.001, true = NA_real_, false = detritus20000_NA)) %>%
mutate(detritus150_1500 = detritus150_850 + detritus850_1500,
detritus150_1500_plus = if_else(is.na(detritus150_1500),
true = detritus150_NA,
false = detritus150_1500)) %>%
mutate(detritus150_NA_sum_Japi = detritus150_850 + detritus850_1500 + detritus1500_20000 + detritus20000_NA,
detritus150_NA_sum_Japi = if_else(is.na(detritus150_NA_sum_Japi),
true = detritus125_NA,
false = detritus150_NA_sum_Japi)
)
}
create_model_table <- function(){
frame_data(
~m_id, ~target_dat, ~src_dat, ~xvar, ~yvar, ~.f, ~family,
"m01", "116", c("131", "126", "121", "221"), "~log(diameter)", "~log(detritus10_1500_2000_NA)", glm, "gaussian",
"m02", c("186", "216"), c("211"), "~log(detritus0_150)", "~log(detritus150_20000)", glm, "gaussian",
"m03", c("186", "216"), c("211"), "~log(detritus0_150)", "~log(detritus20000_NA)", glm, "gaussian",
"m04", c("201"), c("211"), "~log(detritus0_150)", "~log(detritus_over_150)", glm, "gaussian",
"m05", c("71", "51", "61"),c("56"), "~log(detritus850_20000_sum)", "~log(detritus0_150)", glm, "gaussian",
"m06", c("71", "51"), c("61"), "~log(detritus850_20000_sum)", "~log(detritus20000_NA)", glm, "gaussian",
"m07", c("66"), c("56"), "~diameter", "~detritus0_NA_sum", glm, "gaussian",
"m08", c("76", "81", "91"),c("56"), "~detritus150_NA_sum", "~detritus0_150", glm, "gaussian",
"m09", c("86"), c("91"), "~num_leaf", "~detritus150_NA", glm, "gaussian", # too weak to use??
"m10", c("101", "106"), c("56"), "~log(detritus0_20000_sum)", "~log(detritus20000_NA_na0)", glm, "gaussian", # can't get coefficients or r2 to match Diane's notes
"m11", c("146"), c("56"), "~log(detritus150_1500_plus)", "~log(detritus0_150)", glm, "gaussian",
"m12", c("161"), c("56"), "~log(detritus150_NA_sum_Japi)","~log(detritus0_150)", glm, "gaussian"
) %>%
mutate(xvar = xvar %>% map(as.formula),
yvar = yvar %>% map(as.formula))
}
derive_modelling_information <- function(.model_table, .detritus_data){
# create a dataframe that holds everything we need to run the models:
model_info <- .model_table %>%
# select the required input rows
mutate(src_df = map(src_dat,
~ .detritus_data %>%
# TODO: ? add select() to contain only some variables??
filter(dataset_id %in% .x)),
# create modelling function
fml = map2(.x = xvar, .y = yvar, ~ formulae(.y, .x)),
fml = flatten(fml)) %>%
mutate(x_symb = xvar %>% map(find_symbols),
y_symb = yvar %>% map(find_symbols),
x_funs = x_symb %>% map("functions") %>% map_if(is_empty, ~ "") %>% flatten_chr(),
x_vars = x_symb %>% map_chr("variables"),
y_funs = y_symb %>% map("functions") %>% map_if(is_empty, ~ "") %>% flatten_chr(),
y_vars = y_symb %>% map_chr("variables"))
# make sure that there are actually some data to use to fit the model!
nrows_src_df <- map_dbl(model_info$src_df, nrow)
if(any(nrows_src_df == 0)) stop("there are 0 rows in some input data for the imputation models")
return(model_info)
}
# write a function which uses all these arguements to create a model
fit_predictive_model <- function(m_id, src_df, fml, .f, family, target_dat) {
# mod_list = fit_with(src_df)
# browser()
ff <- .f[[1]]
fit_dat <- partial(fit_with, .f = ff, .formulas = fml, family = family)
mod <- map(src_df, fit_dat)
return(mod)
}
do_fit_predictive_model <- function(.modelling_information){
.modelling_information %>%
# selet the functions's arguements
select(m_id, src_df, fml, .f, family, target_dat) %>%
by_row(fit_predictive_model %>% lift,
.to = "predicting_model") %>%
mutate(predicting_model = predicting_model %>% flatten)
}
#' make the predictor range (and name it appropriately)
#'
#' given a data.frame and a variable name, find that variable name and use
#' seq_range to make 30 points in its range
#'
#' @param m_id model.id, for housekeeping
#' @param src_df source data.frame
#' @param x_vars x variable name
#'
#' @return a data.frame with one column only (the new sequence)
make_prediction_xs <- function(m_id, src_df, x_vars) {
dd <- src_df[[1]]
dd %>%
.[[x_vars]] %>%
seq_range(n = 30) %>%
list(.) %>%
set_names(x_vars) %>%
as.data.frame
}
make_prediction_df <- function(m_id, incoming_data, predicting_model, y_vars, y_funs){
dd <- incoming_data[[1]]
# type response means that we will get back the answer on the scale of the
# response (which might have been transformed!!)
# NOTE PLEASE: that `prediction()` gives these **BY DEFAULT**. see ?prediction
modlist <- predicting_model[[1]] # just because it is a list-column, get the first element (which is a list lol)
# note that this list will normall be length 1, but I'm doing it this way just
# in case multiple models are ever fit to the data
out <- map_df(modlist, ~ prediction(.x, data = dd))
names(out) <- paste0(y_vars, "_", names(out))
# back transformation function for y axis (x not necessary because it is in the function?) >> that's correct
back_trans <- switch(y_funs, log = exp, I)
# REASONABLY CERTAIN this is correct since iirc SEs are in the same units as
# the values they are associate with
out <- back_trans(out)
obs_pred <- bind_cols(dd, out)
return(obs_pred)
}
construct_plotting_information <- function(.observed_model_fit, .modelling_information) {
.observed_model_fit %>%
select(-.f) %>%
left_join(.modelling_information %>%
select(m_id, target_dat, x_funs, y_funs, x_vars, y_vars),
by = "m_id")
}
plot_fn <- function(src_df, x_funs, y_funs, x_vars, y_vars, curve_data,...){
dd <- src_df[[1]]
# give "identity" if x_funs or y_funs is ""
switch_trans <- function(x) switch(x, "log" = "log", "identity")
# get correct axis transformations
xt <- switch_trans(x_funs)
yt <- switch_trans(y_funs)
# browser()
# rename variables for plotting
plotdat <- dd %>%
select_(xs = x_vars,
ys = y_vars)
# tweak the name - predicted values have "_fitted" on the side
y_vars <- paste0(y_vars, "_fitted")
ld <- curve_data[[1]] %>%
select_(xs = x_vars,
ys = y_vars)
plotdat %>%
ggplot(aes(x = xs, y = ys)) +
geom_point() +
geom_line(data = ld) +
labs(x = x_vars, y = y_vars) +
coord_trans(x = xt, y = yt)
}
# TODO perhaps a color map to show the site(s)??
# plotting models ---------------------------------------------------------
plot_model_and_supporting_data <- function(.plotting_information, .modelling_information) {
# browser()
# create the x range over which all the models should be predicted.
data_for_drawing_line <- .plotting_information %>%
select(m_id, src_df, x_vars) %>%
by_row(make_prediction_xs %>% lift, .to = "xs_range") %>%
select(-src_df, -x_vars)
# join back to original and run another function -- this time, to add predictions
plotting_info_pred <- .plotting_information %>%
left_join(data_for_drawing_line, by = "m_id") %>%
select(m_id, incoming_data = xs_range, predicting_model, y_vars, y_funs) %>%
# mutate(predicting_model = flatten(predicting_model)) %>%
by_row(make_prediction_df %>% lift, .to = "curve_data")
# TODO another pipeline here, that takes a list of models (derived from
# bootstraps) which will be part of plotting_information. Then applies this in
# a process similar to the above, then generates a list of predictions. Then
# applies quantiles to this list.
plots <- plotting_info_pred %>%
left_join(.modelling_information %>% select(m_id, src_df, x_funs, x_vars)) %>%
by_row(plot_fn %>% lift, .to = "model_fit_plot")
}
# Coup de Grace: use the model fit to add the missing values to observed data from another site
estimate_missing_detritus_new_site <- function(.observed_model_fit, .modelling_information,
.detritus_data){
# extract the fit data that we need
fit_data_needed <- .observed_model_fit %>%
select(m_id, predicting_model, target_dat)
# there are some "model-level" info that we also need -- specifically info
# about the y variable and transformations if any
model_info <- .modelling_information %>%
select(m_id, y_vars, y_funs)
# browser()
# join these, add filtered dataset to model
prediction_raw_material <- fit_data_needed %>%
left_join(model_info) %>%
mutate(target_df = target_dat %>%
map(~ .detritus_data %>%
# TODO: ? add select() to contain only some variables??
filter(dataset_id %in% .x))
)
output <- prediction_raw_material %>%
# get the precise variables we need to add predictions
select(m_id, incoming_data = target_df, predicting_model, y_vars, y_funs) %>%
by_row(make_prediction_df %>% lift, .to = "pred_data")
# polish the output a bit, and flatten the predicting_model
output %>%
mutate(predicting_model = flatten(predicting_model))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.