library(odem.data)
# setwd('~/Documents/DSI/odem.data/')
library(parallel)
library(MASS)
library(factoextra)
library(cluster)
library(tidyverse)
library(LakeMetabolizer)
library(dtw)
library(zoo)
library(patchwork)
library(rnaturalearth)
library(sf)
library(ggExtra)
library(ggmosaic)
# library(mapview)
library(data.table)
library(mlr)
library(nnet)
library(glmulti)
library(performance) # checks and compares quality of models
library(effects)
library(flextable)
library(vegan)
library(pROC)
library(multiROC)
library(MuMIn) # model.avg() for multi-model averaging (used below)
data <- read_csv('analysis/figures/data_nov18.csv', col_names = T)
world <- ne_countries(scale = "medium", returnclass = "sf")
us <- map_data("state") # state outlines; requires the 'maps' package to be installed
hydLakes <- read_sf(dsn = "inst/extdata/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10.shp")
# data$RT <- hydLakes$Res_time[match(data$Hylak_id, hydLakes$Hylak_id)]
# Match lake polygons to the study lakes and attach cluster (ct) and land-use (lndu) labels
idy <- match(data$Hylak_id, hydLakes$Hylak_id)
lakes_df <- hydLakes[idy, ]
lakes_df$ct <- data$ct
lakes_df$lndu <- data$lndu
ggplot(lakes_df, aes(fill = ct)) +
theme_minimal() +
geom_sf() +
scale_fill_brewer(type = "qual")
gmap <- ggplot(lakes_df, aes(fill = ct)) +
geom_sf() +
geom_polygon(data = us, aes(x=long, y=lat,
group = group), color = "black", fill = 'white',
size =.5, alpha = 0) +
scale_fill_manual(values= c('red4', 'lightblue1')) +
# geom_point(data = data, aes(longitude, latitude, col = lndu, shape = ct, size = depth)) +
coord_sf(xlim = c(-97.3, -86), ylim = c(42.6, 48.7), expand = FALSE) +
xlab('Longitude') + ylab('Latitude') +
theme_minimal(); gmap
## get model performance
all.dne <- list.files('analysis/')
all.dne_all <- all.dne[grepl('nhdhr', all.dne)]
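# Collect per-lake calibration metrics (lakeinfo.txt) from each nhdhr results folder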
info.df <- c()
for (idx in all.dne_all){
if (file.exists(paste0('analysis/',idx,'/lakeinfo.txt'))){
info.df <- rbind(info.df,read.csv(paste0('analysis/',idx,'/lakeinfo.txt')))
}
}
# Align fitted RMSE values to the lake order in `data`; lakes without a match get NA
# (na.omit() on the match index would shift values and misalign rows)
data$fit <- info.df$fit_tall[match(data$lake, info.df$lake_id)]
g.rmse.depth <- ggplot(data, aes(fit, depth, col = trophic)) +
geom_point() +
geom_smooth(method = "lm") +
scale_color_manual(values= c('darkgreen', 'gold')) +
xlab("RMSE (g/m3)") + ylab('Depth (m)') +
theme_minimal(); g.rmse.depth
# g.rmse.depth <- ggMarginal(g.rmse.depth)
g.wsh.area <- ggplot(data, aes(log10(WshA), log10(area), col = trophic)) +
geom_point() +
geom_smooth(method = "lm") +
scale_color_manual(values= c('darkgreen', 'gold')) +
xlab("Watershed area (log10 m2)") + ylab('Lake area (log10 m2)') +
theme_minimal(); g.wsh.area
# g.wsh.area <- ggMarginal(g.wsh.area)
g.mos <- ggplot(data) +
  geom_mosaic( aes( x = product(ct, trophic), fill = ct)) +
  xlab("Trophic state") + ylab('Proportion (-)') +
  scale_fill_manual(values= c('red4', 'lightblue1')) +
  theme_minimal()+
  theme(legend.position = 'none'); g.mos
g.lndu.wsh <- ggplot(data, aes(lndu, log10(WshA))) +
geom_boxplot() +
xlab("Land use") + ylab('Watershed area (log10 m2)') +
geom_jitter(color = 'black', size = 0.4, alpha = 0.9) +
theme_minimal(); g.lndu.wsh
g.density <- ggplot(data, aes(depth, fill = ct)) +
geom_density(alpha = 0.5) +
xlab("Depth (m)") + ylab('Density (-)') +
scale_fill_manual(values= c('red4', 'lightblue1')) +
theme_minimal() +
theme(legend.position = 'none'); g.density
g <- ggplot(data, aes(depth, fill = trophic)) +
geom_density(alpha = 0.25) +
xlab("Depth (m)") + ylab('Density (-)') +
theme_minimal(); g
fig1 <- gmap / (g.rmse.depth + g.density + g.mos + g.wsh.area) + plot_layout(guides = 'collect') +
plot_annotation(tag_levels = 'A')
ggsave(file = 'analysis/figures/Figure2.png', fig1, dpi = 600, width =13, height = 15)
df_data = data[,c("ct", "developed", "forest", "cultivated", "wetlands", "water", "barren",
"shrubland", "herbaceous",
"depth", "area", "elev", "eutro", "dys", "oligo", "RT", "WshA",
"dep_avg", "trophic")]
df_data_num = df_data[, c("developed", "forest", "cultivated", "wetlands", "water", "barren",
"shrubland", "herbaceous",
"depth", "area", "elev", "eutro", "dys", "oligo", "RT",
"dep_avg")]
data$ct <- as.numeric(as.factor(data$ct)) - 1
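# Collapse related land-use fractions into two composite predictors and
# log10-transform depth ahead of model selection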
data_new <- data %>%
mutate(human_impact = developed + cultivated,
vegetated = forest + wetlands + shrubland,
depth = log10(depth))
models_exhaust <- glmulti(ct ~ human_impact + log10(area) + depth + # depth is already log10-transformed in data_new
                            eutro + log10(RT),
                          data = data_new,
                          # crit = aicc, # AICc, corrected AIC for small samples
                          level = 2, # 2 with pairwise interactions, 1 without
                          method = "h", # "h" = exhaustive search ("d" reports the candidate set size, "g" = genetic algorithm)
                          family = "binomial",
                          fitfunction = glm, # type of model (lm, glm, etc.)
                          confsetsize = 100) # keep the 100 best models
model_averaged <- model.avg(object = models_exhaust@objects[c(1:24)])
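# Optional inspection of the averaged model (a minimal check, not part of the
# original output): summary() reports full- and conditional-average
# coefficients, MuMIn::sw() the summed Akaike weights per predictor
summary(model_averaged)
MuMIn::sw(model_averaged)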
png(file='analysis/figures/Figure3.png', width = 25, height = 20, units = "cm", res = 600)
plot(effects::allEffects(models_exhaust@objects[[1]]), # effect displays for the best model
     lines = list(multiline = T),
     confint = list(style = "auto"))
dev.off()
plot(models_exhaust) # IC profile across the candidate set
weightable(models_exhaust)[1:10,] %>% # ten best models by evidence weight
  regulartable() %>% # beautifying tables
  autofit()
plot(models_exhaust, type = "s") # model-averaged importance of terms
best_model <- models_exhaust@objects[[2]]
car::Anova(best_model)
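# `train` and `test` are used below but never created in this script; a minimal
# 70/30 split as a stand-in (an assumption: the original partitioning may have
# differed). ct is converted to a factor, as multinom() expects one.
df_data$ct <- as.factor(df_data$ct)
set.seed(42) # hypothetical seed, for reproducibility only
idx <- sample(seq_len(nrow(df_data)), size = floor(0.7 * nrow(df_data)))
train <- df_data[idx, ]
test <- df_data[-idx, ]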
model <- multinom(ct ~ developed + cultivated + depth +
                    eutro + dys + oligo + RT, data = train)
model1 <- multinom(ct ~ developed + depth + eutro, data = df_data)
model2 <- multinom(ct ~ developed + depth + oligo, data = df_data)
model3 <- multinom(ct ~ developed + cultivated + depth + eutro, data = df_data)
model4 <- multinom(ct ~ developed + cultivated + depth + eutro, data = df_data) # note: same formula as model3
model5 <- multinom(ct ~ developed + cultivated + depth, data = df_data)
model6 <- multinom(ct ~ developed + depth + eutro + RT, data = df_data)
pscl::pR2(model1)["McFadden"]
pscl::pR2(model2)["McFadden"]
pscl::pR2(model3)["McFadden"]
pscl::pR2(model4)["McFadden"]
pscl::pR2(model5)["McFadden"]
pscl::pR2(model6)["McFadden"]
summary(model)
summary(model)$AIC
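# Wald z-tests on the multinomial coefficients: z = estimate / SE,
# two-tailed p = 2 * (1 - pnorm(|z|))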
z <- summary(model)$coefficients/summary(model)$standard.errors
p <- (1 - pnorm(abs(z), 0, 1)) * 2
print(p)
# https://datasciencebeginners.com/2018/12/20/multinomial-logistic-regression-using-r/
# https://stats.oarc.ucla.edu/r/dae/multinomial-logistic-regression/
## extracting coefficients from the model and exponentiate
exp(coef(model))
head(probability.table <- fitted(model))
# Predicting the values for the train dataset (using the multinom fit; the
# averaged glm object has no "class" prediction type)
train$predicted <- predict(model, newdata = train, "class")
# Building classification table
ctable <- table(train$ct, train$predicted)
# Calculating accuracy - sum of diagonal elements divided by total obs
round((sum(diag(ctable))/sum(ctable))*100, 2)
# Predicting the values for the test dataset
test$predicted <- predict(model, newdata = test, "class")
# Building classification table
ctable <- table(test$ct, test$predicted)
# Calculating accuracy - sum of diagonal elements divided by total obs
round((sum(diag(ctable))/sum(ctable))*100, 2)
pscl::pR2(model)["McFadden"]
# predicted data
data_new$prediction <- stats::predict(model_averaged, type = "response")
# create roc curve
roc_object <- pROC::roc(data_new$ct, data_new$prediction)
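# Quick diagnostic on the averaged-model ROC (not part of the original figure
# set): report the AUC and draw the curve
pROC::auc(roc_object)
plot(roc_object, print.auc = TRUE)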
train_fraction <- 0.7
reduced_data_model_iterations <- 100
# Collect per-iteration fit tables so results survive the loop
all_lake_fits <- vector("list", reduced_data_model_iterations)
# Run the specified number of model averaging iterations
for(rdmi in 1:reduced_data_model_iterations){
  # Define the data subset for modeling: a stratified sample (by ct class) for model training
  temp_lake_dat <- data_new %>%
    group_by(ct) %>%
    sample_frac(size = train_fraction) %>%
    ungroup()
  # Refit the 24 best formulas from the glmulti candidate set on the subsample
  formulas <- models_exhaust@formulas[1:24]
  n_mod <- length(formulas)
  AIC <- rep(0, n_mod)
  MODEL <- rep(NA, n_mod)
  AUC <- rep(0, n_mod)
  RSQUARED <- rep(0, n_mod)
  ACCURACY <- rep(0, n_mod)
  PVALUE <- rep(0, n_mod)
  for(i in 1:n_mod){
    fit <- glm(paste(as.character(formulas[i])),
               data = temp_lake_dat,
               family = binomial)
    MODEL[i] <- paste(as.character(formulas[i]))
AIC[i] <- fit$aic
predictpr <- predict(fit, type = "response")
ROC <- pROC::roc(temp_lake_dat$ct ~ predictpr)
temp_lake_dat$PREDICTION <- predictpr
AUC[i] <- pROC::auc(ROC)
RSQUARED[i] <- 1 - (fit$deviance/fit$null.deviance)
    temp_lake_dat_permute <- temp_lake_dat %>%
      dplyr::mutate(PREDICTION = ifelse(as.numeric(PREDICTION) < 0.65, 0, 1)) # 0.65 classification threshold
    table <- table(Reality = temp_lake_dat_permute$ct,
                   Prediction = temp_lake_dat_permute$PREDICTION)
    ACCURACY[i] <- (table[1,1] + table[2,2])/sum(table) # correct classifications / total obs
    # Permutation test: shuffle the labels nreps times to build a null AUC distribution
    nreps <- 1000
    AUC.repo <- rep(0, nreps)
for(j in 1:nreps) {
temp_lake_dat_permute$ct <- sample(temp_lake_dat_permute$ct,
size = length(temp_lake_dat_permute$ct),
replace = FALSE)
      # predictions stay fixed; only the response labels were shuffled above
ROC <- pROC::roc(temp_lake_dat_permute$ct ~ predictpr, quiet = TRUE,
levels = c(0,1), direction = "<")
AUC.repo[j] <- pROC::auc(ROC)
}
    PVALUE[i] <- length(AUC.repo[AUC.repo > AUC[i]])/length(AUC.repo) # share of null AUCs exceeding the observed AUC
}
  INDEX <- seq_len(n_mod)
lake_fits <- data.frame(INDEX, MODEL, AIC, RSQUARED, AUC, ACCURACY, PVALUE)
lake_fits$MODEL <- as.character(lake_fits$MODEL)
lake_fits$AIC <- as.numeric(lake_fits$AIC)
lake_fits$RSQUARED <- as.numeric(lake_fits$RSQUARED)
lake_fits$AUC <- as.numeric(lake_fits$AUC)
lake_fits$ACCURACY <- as.numeric(lake_fits$ACCURACY)
lake_fits$PVALUE <- as.numeric(lake_fits$PVALUE)
  lake_top_mod <- lake_fits %>%
    filter(AIC <= (min(AIC) + 2)) %>%
    filter(RSQUARED >= median(RSQUARED),
           AUC >= median(AUC))
  # Repair mangled formula strings before refitting
  lake_top_mod$MODEL <- gsub(pattern = "log00", replacement = "log10",
                             x = lake_top_mod$MODEL)
  lake_mod_fits <- map(.x = lake_top_mod$MODEL,
                       .f = ~ glm(formula = .x,
                                  family = "binomial",
                                  data = temp_lake_dat))
  # Store this iteration's results for the cross-iteration summary
  all_lake_fits[[rdmi]] <- lake_fits
}
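# A minimal cross-iteration summary (assumes the `all_lake_fits` list filled
# above): how often each formula appears and its mean subsample metrics
iteration_summary <- dplyr::bind_rows(all_lake_fits, .id = "iteration") %>%
  dplyr::group_by(MODEL) %>%
  dplyr::summarise(mean_AUC = mean(AUC),
                   mean_R2 = mean(RSQUARED),
                   n_iter = dplyr::n()) %>%
  dplyr::arrange(dplyr::desc(mean_AUC))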