knitr::opts_chunk$set(echo = TRUE)

----------CLEAN THIS Requirements : - script is runned right after prepare_and_run_GDM.Rmd. Requires its output gdminput. - note that gdminput is also saved as .csv and .Rds from the above mentioned script. - nicenames for nice correlation plot - model_names_selection : a row of the dataframe model_names

The script calculates euclidean distances abs(simple substraction) of the covariates and LUI as well as the geographic distance based on coordinates and forms a table with all distances. A correlation plot is produced and a linear model fitted and both are saved. Variance inflation of the linear model is calculated and saved.

DEPRECIATED notice : linear model not used any more in analyis: assumption of independent observations (i.i.d. assumption) not met and therefore p-values not trustable. Also : nonlinearities are exptected. GAM used to check, but it also violates the i.i.d. assumption. Individual associations (correlation plot) does not correct for other predictors. Need to check associations among variables and GDM assumption of monotonic increase. --> all methods violate important assumptions --> compare the 3 methods and make the best of it.

Output : - DEPRECIATED linear model output vignettes/out/<modelname>_linear_model_before_gdm.Rds - DEPRECIATED variance inflation out/<modelname>_vif_before_gdm.txt - DEPRECIATED linear model plot vignettes/out/<modelname>_linear_model_directions.pdf


Goal : - find negative and positive associations --> help interpretation - variance inflation

Requirement : - checkgdminput : calculated in previous script "check_GDM_input.Rmd"

require(mgcv)

Linear model

# ## variance inflation in linear model
# # note : p-values are not trustable, because the assumption of independent observations is  not met! (matrix correlation)
# #    DEPRECIATED --> new method needed
# # generate model
# mod <- paste("distance ~ " ,paste(colnames(checkgdminput)[!colnames(checkgdminput) %in% c("distance")], collapse = " + "))
# m1 <- lm(formula = formula(mod), data = checkgdminput)
# # save linear model
# saveRDS(m1, file = paste("vignettes/out/", modelname, "_linear_model_before_gdm.Rds", sep = ""))
# # save linear model output
# capture.output(summary(m1),
#                file = paste("vignettes/out/", modelname, "_lm_summary.txt"))
# 
# vif_output <- car::vif(m1)
# vif_output[which(vif_output > 5)]
# # vif_output[which(vif_output[, "GVIF"] > 5),]
# capture.output(c(mod,
#                vif_output[which(vif_output > 5)]),
#                file = paste("vignettes/out/", modelname, "_vif_before_gdm.txt", sep = ""))
# 
# # plot linear model
# # m1$coefficients
# test <- summary(m1)
# coefs <- test$coefficients
# coefs <- data.table(coefs)
# coefs[, names := rownames(test$coefficients)]
# coefs[`Pr(>|t|)` < 0.05, pval_symbol := "*"]
# coefs[`Pr(>|t|)` > 0.05, pval_symbol := ""]
# 
# linplot <- ggplot(coefs, aes(x = names, y = Estimate)) +
#   geom_bar(stat = "identity") +
#   coord_flip() +
#   geom_text(aes(label=pval_symbol), y=0.5, color="black", size=3.5) +
#   geom_errorbar(aes(ymin=Estimate-`Std. Error`, ymax=Estimate+`Std. Error`), width=.2, color = "red") +
#   background_grid() +
#   ggtitle(modelname) +
#   ylab("linear model estimates") +
#   xlab("") +
#   ylim(-0.4, 0.5) +
#   labs(caption = "error bars correspond to standard errors")
# 
# ggsave(linplot, filename = paste("vignettes/out/", modelname, "_linear_model_directions.pdf", sep = ""), dpi = 400,
#        units = "cm", width = 29.7, height = 21)

GAM

# test model
testmod <- mgcv::gam(distance ~ autotroph.beta.sim + s(bacteria.RNA.beta.sim) + s(omnivore.protist.beta.sim) + s(plantparasite.protist.beta.sim) + s(geosphere_distgeo) + s(LUI), data = checkgdminput)
plot(testmod, all.terms = T) # also plot linear terms

# full model
mod <- paste(paste("distance ~ s(", paste(colnames(checkgdminput)[!colnames(checkgdminput) %in% c("distance")], collapse = ") + s("), sep = ""), ")", sep = "")
# check if model is running.
#  mod1 : full model with all smooth terms
mod1 <- mgcv::gam(distance ~ s(autotroph.beta.sim) + s(bacteria.RNA.beta.sim) + s(herbivore.arthropodsoillarvae.beta.sim) + s(herbivore.arthropod.beta.sim) + s(plantpathogen.fungi.beta.sim) + s(bacterivore.protist.beta.sim)+ s(eukaryvore.protist.beta.sim) + s(omnivore.protist.beta.sim) + s(plantparasite.protist.beta.sim) + s(secondary.consumer.beta.sim) + s(secondary.consumer.arthropodsoillarvae.beta.sim) + s(tertiary.consumer.beta.sim) + s(decomposer.soilfungi.beta.sim) + s(pathotroph.soilfungi.beta.sim) + s(symbiont.soilfungi.beta.sim) + s(autotroph.beta.sne) + s(bacteria.RNA.beta.sne) + s(herbivore.arthropodsoillarvae.beta.sne) + s(herbivore.arthropod.beta.sne) + s(plantpathogen.fungi.beta.sne) + s(bacterivore.protist.beta.sne) + s(eukaryvore.protist.beta.sne) + s(omnivore.protist.beta.sne) + s(plantparasite.protist.beta.sne) + s(secondary.consumer.beta.sne) + s(secondary.consumer.arthropodsoillarvae.beta.sne) + s(tertiary.consumer.beta.sne) + s(decomposer.soilfungi.beta.sne) + s(pathotroph.soilfungi.beta.sne) + s(symbiont.soilfungi.beta.sne) + s(geosphere_distgeo) + s(LUI) + s(deltaLUI) + s(edis_soil) + s(plot_isolation), data = checkgdminput)

summary(mod1) # Rsq = 0.38, Devexpl = 39.2%, GCV = 0.01552
# if predictor edf (effective degrees of freedom) = 1 : gam works with linear estimator --> can leave out the s() part for the
#    given variable
# no automatic model selection with "REML". Better : adjust manually
#    some smooth terms can probably be removed.
mod1B <- update(mod1, .~.-s(plantpathogen.fungi.beta.sim) + plantpathogen.fungi.beta.sim
                -s(autotroph.beta.sne) + autotroph.beta.sne
                -s(omnivore.protist.beta.sne) + omnivore.protist.beta.sne 
                -s(secondary.consumer.beta.sne) + secondary.consumer.beta.sne
                -s(symbiont.soilfungi.beta.sne) + symbiont.soilfungi.beta.sne
                -s(pathotroph.soilfungi.beta.sne) + pathotroph.soilfungi.beta.sne)
summary(mod1B) # Rsq = 0.38, Devexpl = 29.2%, GCV = 0.015521
#TODO : discuss : omnivore protist still has negative effect
#    also omnivore protist beta sim! (which is nonlinearly modelled)
#    also plantparasite protist sim still has negative
#        ! plantparasite protist beta sne could also be positive in GAM nonlinear
#    ! decomposer soilfungi beta sim changed : had negative in linear model, now positive (hump-shaped)
#        ! decomposer soilfungi sne definitely changed to positive in nonlinear case
#
#    distgeo : 3 regions visible? 
plot(mod1B, pages = 2)
# gratia::draw(mod1B, residuals = T, pages = 2)  # gratia is useful to plot mgcv results

mgcv::gam.check(mod1B) # some significant terms! Problematic
#TODO : why diagnostics not so good? because wrong assumption?

# partial effects plot
vis.gam(mod1B, view=c("LUI", "omnivore.protist.beta.sim"), 
        plot.type='contour', color='topo', main='test')
plot.gam(mod1B, select = 7, residuals = T)
vis.gam(mod1B, view=c("LUI", "omnivore.protist.beta.sne"), 
        plot.type='contour', color='topo', main='test')
plot.gam(mod1B, all.terms = T, select = 32, residuals = T) # plot linear effect
plot.gam(testmod, residuals = T, select = 1)

Exkurs

What happens if I use mean LUI and delta LUI = sd(LUI1, LUI2) as predictors for EFdist?

# NOTE : to construct the dataset checkgdminput : delete "lui" instead of "delcols" in the script "check_GDM_input.Rmd"
#    in order to have mean and sd LUI instead of euclidean distance

# linear model
# run code from check_GDM_input.Rmd

mod <- paste(paste("distance ~ s(", paste(colnames(checkgdminput)[!colnames(checkgdminput) %in% c("distance")], collapse = ") + s("), sep = ""), ")", sep = "")
# check if model is running.
#  mod1 : full model with all smooth terms
mod1 <- mgcv::gam(distance ~ s(autotroph.beta.sim) + s(bacteria.RNA.beta.sim) + s(herbivore.arthropodsoillarvae.beta.sim) + s(herbivore.arthropod.beta.sim) + s(plantpathogen.fungi.beta.sim) + s(bacterivore.protist.beta.sim) + s(eukaryvore.protist.beta.sim) + s(omnivore.protist.beta.sim) + s(plantparasite.protist.beta.sim) + s(secondary.consumer.beta.sim) + s(secondary.consumer.arthropodsoillarvae.beta.sim) + s(tertiary.consumer.beta.sim) + s(decomposer.soilfungi.beta.sim) + s(pathotroph.soilfungi.beta.sim) + s(symbiont.soilfungi.beta.sim) + s(autotroph.beta.sne) + s(bacteria.RNA.beta.sne) + s(herbivore.arthropodsoillarvae.beta.sne) + s(herbivore.arthropod.beta.sne) + s(plantpathogen.fungi.beta.sne) + s(bacterivore.protist.beta.sne) + s(eukaryvore.protist.beta.sne) + s(omnivore.protist.beta.sne) + s(plantparasite.protist.beta.sne) + s(secondary.consumer.beta.sne) + s(secondary.consumer.arthropodsoillarvae.beta.sne) + s(tertiary.consumer.beta.sne) + s(decomposer.soilfungi.beta.sne) + s(pathotroph.soilfungi.beta.sne) + s(symbiont.soilfungi.beta.sne) + s(meanLUI) + s(sdLUI) + s(geosphere_distgeo) + s(edis_soil) + s(plot_isolation), data = checkgdminput)


allanecology/multiFunBetadivLuiGDM documentation built on Nov. 12, 2023, 6:16 a.m.