analysis/scripts/07_analysis_permutation.R

library(odem.data)

setwd('~/Documents/DSI/odem.data/')

library(MuMIn)
library(broom)
library(parallel)
library(MASS)
library(factoextra)
library(cluster)
library(tidyverse)
library(LakeMetabolizer)
library(dtw)
library(zoo)
library(patchwork)
library(rnaturalearth)
library(sf)
library(ggExtra)
library(ggmosaic)
# library(mapview)
library(data.table)
library(mlr)
library(nnet)
library(glmulti)
library(performance)# checks and compares quality of models
library(effects)
library(flextable)
library(vegan)
library(pROC)
library(multiROC)


data <- read_csv('analysis/figures/data_jan19.csv', col_names = T)

world <- ne_countries(scale = "medium", returnclass = "sf")
us <- map_data("state")

hydLakes <- read_sf(dsn = "inst/extdata/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10.shp")
# data$RT <- hydLakes$Res_time[match(data$Hylak_id, hydLakes$Hylak_id)]

lake_shapes <- st_read("inst/extdata/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10.shp")

idy <- (match(data$Hylak_id,lake_shapes$Hylak_id))
lakes_df <- lake_shapes[idy,]
lakes_df$ct <- data$ct
lakes_df$lndu <- data$lndu

# ggplot(lakes_df, aes(fill = ct)) +
#   theme_minimal() +
#   geom_sf() +
#   scale_fill_brewer(type = "qual")
#
# gmap <- ggplot(lakes_df, aes(fill = ct)) +
#   geom_sf() +
#   geom_polygon(data = us, aes(x=long, y=lat,
#                               group = group), color = "black", fill = 'white',
#                size =.5, alpha = 0) +
#   scale_fill_manual(values= c('red4', 'lightblue1')) +
#   # geom_point(data = data, aes(longitude, latitude, col = lndu, shape = ct, size = depth)) +
#   coord_sf(xlim = c(-97.3, -86), ylim = c(42.6, 48.7), expand = FALSE) +
#   xlab('Longitude') + ylab('Latitude') +
#   theme_minimal(); gmap

## get model performance
all.dne <- list.files('analysis/')
all.dne_all <- all.dne[grepl('nhdhr', all.dne)]

info.df <- c()
for (idx in all.dne_all){
  if (file.exists(paste0('analysis/',idx,'/lakeinfo.txt'))){
    info.df <- rbind(info.df,read.csv(paste0('analysis/',idx,'/lakeinfo.txt')))
  }
}

lake.link <- read.csv('../landuse/lake_link.csv')
lake.information <- read.csv('../landuse/lake_information.csv')
lstm.sites <- read.csv('lstm/per_site_results.csv')

id.of.interest <- gsub(".*_","",info.df$lake_id)
id.model.information <- match(id.of.interest, lake.information$lake_nhdid)
as.factor(lake.information$lake_states[id.model.information]) %>% count()


data$fit = info.df$fit_tall[na.omit(match(data$lake,info.df$lake_id))]

# get another landuse

landuse_nlcd <- read.csv('../landuse/lts_nadp_nlcd_glcp_waste_bathy_glev.csv')
landuse <- data.frame('year' = landuse_nlcd$year,
                      'Hylak_id' = landuse_nlcd$Hylak_id,
                      'developed_nlcd' =  landuse_nlcd$developed_high_intensity + landuse_nlcd$developed_med_intensity +
                              landuse_nlcd$developed_low_intensity + landuse_nlcd$developed_open_space,

                      'cultivated_nlcd' = landuse_nlcd$cultivated_crops + landuse_nlcd$pasture_hay)

data_merged <- merge(data, landuse, by = c('Hylak_id', 'year'))

developed_interp <- c()
cultivated_interp <- c()
for (lakeid in unique(data_merged$Hylak_id)){
  df <- data_merged %>% filter(Hylak_id == lakeid)

  id.na_dev <- which(!is.na(df$developed_nlcd))
  id.na_cul <- which(!is.na(df$cultivated_nlcd))

  if (length(id.na_dev) != 0){
    interpolated_data_dev <- approx(df$year[id.na_dev], df$developed_nlcd[id.na_dev], df$year, rule = 2, method = 'constant')$y
  } else {
    interpolated_data_dev <- rep(0, length(df$year))
  }
  if (length(id.na_cul) != 0){
    interpolated_data_cul <- approx(df$year[id.na_cul], df$cultivated_nlcd[id.na_cul], df$year, rule = 2, method = 'constant')$y
  } else {
    interpolated_data_cul <-  rep(0, length(df$year))
  }


  developed_interp <- append(developed_interp, interpolated_data_dev)
  cultivated_interp <- append(cultivated_interp, interpolated_data_cul)
}

data_merged$developed_nlcd <- developed_interp
data_merged$cultivated_nlcd <- cultivated_interp

plot(data_merged$developed, data_merged$developed_nlcd)
plot(data_merged$cultivated, data_merged$cultivated_nlcd)


g.rmse.depth <- ggplot(data, aes(fit, depth, col = trophic)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_color_manual(values= c('brown', 'darkgreen', 'lightblue')) +
  xlab("RMSE (g/m3)") + ylab('Depth (m)') +
  theme_minimal(); g.rmse.depth
# g.rmse.depth <- ggMarginal(g.rmse.depth)

g.wsh.area <- ggplot(data, aes(log10(WshA), log10(area), col = trophic)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_color_manual(values= c('brown', 'darkgreen', 'lightblue')) +
  xlab("Watershed area (log10 m2)") + ylab('Lake area (log10 m2)') +
  theme_minimal(); g.wsh.area
# g.wsh.area <- ggMarginal(g.wsh.area)

g.mos <- ggplot(data) +
  geom_mosaic( aes( x = product(ct, trophic), fill = ct)) +
  xlab("RMSE (g/m3)") + ylab('Depth (m)') +
  scale_fill_manual(values= c('red4', 'lightblue1')) +
  theme_minimal()+
  theme(legend.position = 'none'); g.mos

g.lndu.wsh <- ggplot(data, aes(lndu, log10(WshA))) +
  geom_boxplot() +
  xlab("Land use") + ylab('Watershed area (log10 m2)') +
  geom_jitter(color = 'black', size = 0.4, alpha = 0.9) +
  theme_minimal(); g.lndu.wsh

g.density <- ggplot(data, aes(depth, fill = ct)) +
  geom_density(alpha = 0.5) +
  xlab("Depth (m)") + ylab('Density (-)') +
  scale_fill_manual(values= c('red4', 'lightblue1')) +
  theme_minimal() +
  theme(legend.position = 'none'); g.density

g <- ggplot(data, aes(depth, fill = trophic)) +
  geom_density(alpha = 0.25) +
  xlab("Depth (m)") + ylab('Density (-)') +
  scale_color_manual(values= c('brown', 'darkgreen', 'lightblue')) +
  theme_minimal(); g

# fig1 <- gmap / (g.rmse.depth + g.density + g.mos + g.wsh.area) + plot_layout(guides = 'collect') +
#   plot_annotation(tag_levels = 'A')

# ggsave(file = 'analysis/figures/Figure2.png', fig1, dpi = 600, width =13, height = 15)

df_data = data[,c("ct", "developed", "forest", "cultivated", "wetlands", "water", "barren",
                  "shrubland", "herbaceous",
                  "depth", "area", "elev", "eutro", "dys", "oligo", "RT", "WshA",
                  "dep_avg", "trophic")]
df_data_num = df_data[, c("developed", "forest", "cultivated", "wetlands", "water", "barren",
                          "shrubland", "herbaceous",
                  "depth", "area", "elev", "eutro", "dys", "oligo", "RT",
                  "dep_avg")]

data_merged$ct <- as.numeric(as.factor(data_merged$ct)) - 1

data_new <- data_merged %>%
  mutate(human_impact = developed_nlcd + cultivated_nlcd,
         vegetated = forest + wetlands + shrubland,
         depth = log10(depth))


data_plot <- data_merged %>%
  mutate(area = log10(area),
         depth = log10(depth),
         RT = log10(RT),
         WshA = log10(WshA),
         human_impact = developed_nlcd + cultivated_nlcd,
         ct = ifelse(ct == 0, 'High consumption', 'Low consumption'))
data_melt <- reshape2::melt(data_plot, id = 'ct')

landuse_plot <- ggplot(data_plot) +
  geom_density(aes(human_impact,  fill = ct, group = ct), alpha = 0.3) +
  scale_fill_manual(values = c('red4','lightblue1'), name = 'Cluster') +
  xlab('Human impact: developed + cultivated') +
  ylab('Density') +
  theme_minimal(base_size = 15)

area_plot <- ggplot(data_plot) +
  geom_density(aes(area,  fill = ct, group = ct), alpha = 0.3) +
  scale_fill_manual(values = c('red4','lightblue1'), name = 'Cluster') +
  xlab('log Area') +
  ylab('Density') +
  theme_minimal(base_size = 15)

depth_plot <- ggplot(data_plot) +
  geom_density(aes(depth,  fill = ct, group = ct), alpha = 0.3) +
  scale_fill_manual(values = c('red4','lightblue1'), name = 'Cluster') +
  xlab('log Depth') +
  ylab('Density') +
  theme_minimal(base_size = 15)

RT_plot <- ggplot(data_plot) +
  geom_density(aes(RT, fill = ct, group = ct), alpha = 0.3) +
  scale_fill_manual(values = c('red4','lightblue1'), name = 'Cluster') +
  xlab('log Residence time') +
  ylab('Density') +
  theme_minimal(base_size = 15)

eutro_plot <- ggplot(data_plot) +
  geom_density(aes(eutro, fill = ct, group = ct), alpha = 0.3) +
  scale_fill_manual(values = c('red4','lightblue1'), name = 'Cluster') +
  xlab('Eutrophic probability') +
  ylab('Density') +
  theme_minimal(base_size = 15)

fig2 <- landuse_plot +area_plot +depth_plot +eutro_plot +RT_plot + plot_layout(guides = 'collect')  +
  plot_annotation(tag_levels = 'A') & theme(legend.position = 'bottom')
ggsave(file = 'analysis/figures/Figure2.png', fig2,  dpi = 600, width =13, height = 8)


models_exhaust <- glmulti(ct ~ human_impact + log10(area) + (depth) +
          eutro + log10(RT),
        data   = data_new,
        # crit   = aicc,       # AICC corrected AIC for small samples
        level  = 1,          # 2 with interactions, 1 without
        method = "h",        # "d", or "h", or "g"
        family = "binomial",
        fitfunction = glm,   # Type of model (LM, GLM etc.)
        confsetsize = 100)   # Keep 100 best models

plot(effects::allEffects(models_exhaust@objects[[1]]),
     lines = list(multiline = T),
     confint = list(style = "auto"))

weightable(models_exhaust)[1:10,] %>%
  regulartable() %>%       # beautifying tables
  autofit()

plot(models_exhaust)
plot(models_exhaust, type = "s")


model_averaged <- model.avg(object = models_exhaust@objects[c(1:2)])

# predicted data
data_new$prediction <- stats::predict(model_averaged, type = "response")

# create roc curve
roc_object <- pROC::roc(data_new$ct, data_new$prediction)

table <- table(Reality = data_new$ct,
               Prediction = ifelse(as.numeric(data_new$prediction) <= 0.50, 0, 1) )
(table[1,1]+table[2+2])/sum(table) * 100

data_new = data_new %>%
  mutate(prediction_ct = ifelse(as.numeric(data_new$prediction) <= 0.50, 0, 1),
         flag = ifelse(ct == prediction_ct, T, F))
summary(data_new$flag)


landuse_plot_flag <- ggplot(data_new) +
  geom_histogram(aes(human_impact,  fill = flag, group = flag), alpha = 0.3, position="identity") +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"), name = '') +
  xlab('Human impact: developed + cultivated') +
  ylab('Density') +
  theme_minimal(base_size = 15)

area_plot_flag <- ggplot(data_new) +
  geom_histogram(aes(area,  fill = flag, group = flag), alpha = 0.3, position="identity") +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"), name = '') +
  xlab('log Area') +
  ylab('Density') +
  theme_minimal(base_size = 15)

depth_plot_flag <- ggplot(data_new) +
  geom_histogram(aes(depth,  fill = flag, group = flag), alpha = 0.3, position="identity") +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"), name = '') +
  xlab('log Depth') +
  ylab('Density') +
  theme_minimal(base_size = 15)

RT_plot_flag <- ggplot(data_new) +
  geom_histogram(aes(RT, fill = flag, group = flag), alpha = 0.3, position="identity") +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"), name = '') +
  xlab('log Residence time') +
  ylab('Density') +
  theme_minimal(base_size = 15)

eutro_plot_flag <- ggplot(data_new) +
  geom_histogram(aes(eutro, fill = flag, group = flag), alpha = 0.3, position="identity") +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"), name = '') +
  xlab('Eutrophic probability') +
  ylab('Density') +
  theme_minimal(base_size = 15)

fig4 <- landuse_plot_flag +area_plot_flag +depth_plot_flag +eutro_plot_flag +RT_plot_flag+ plot_layout(guides = 'collect')  +
  plot_annotation(tag_levels = 'A') & theme(legend.position = 'bottom')
ggsave(file = 'analysis/figures/Figure4.png', fig4,  dpi = 600, width =13, height = 8)


### permutation analysis
train_fraction <- 0.7
reduced_model_data_interation <- 5000

model_result <- models_exhaust
# Run the specified number of model averaging iterations
for(rdmi in 1:reduced_model_data_interation){

  # Define data subset for modeling and sample a fraction for model training
  temp_lake_dat <- data_new %>%
    sample_frac(size = train_fraction,
                weight = as.factor(ct))

  # model_result <- glmulti(ct ~ human_impact + log10(area) + log10(depth) +
  #                           eutro + log10(RT),
  #                         data   = temp_lake_dat,
  #                         # crit   = aicc,       # AICC corrected AIC for small samples
  #                         level  = 1,          # 2 with interactions, 1 without
  #                         method = "h",        # "d", or "h", or "g"
  #                         family = "binomial",
  #                         fitfunction = glm,   # Type of model (LM, GLM etc.)
  #                         confsetsize = 100)   # Keep 100 best models

  #num_models <- length(model_result@crits[model_result@crits <= min(model_result@crits)+2])

  num_models <- 24

  AIC <- rep(0, length(model_result@formulas[c(1:num_models)]))
  MODEL <- rep(NA, length(model_result@formulas[c(1:num_models)]))
  AUC <- rep(0, length(model_result@formulas[c(1:num_models)]))
  RSQUARED <- rep(0, length(model_result@formulas[c(1:num_models)]))
  ACCURACY <- rep(0, length(model_result@formulas[c(1:num_models)]))
  PVALUE <- rep(0, length(model_result@formulas[c(1:num_models)]))
  for(i in 1:length(model_result@formulas[c(1:num_models)])){
    fit <- glm(paste(as.character(model_result@formulas[i])),
               data = temp_lake_dat,
               family = binomial)
    MODEL[i] <- paste(as.character(model_result@formulas[i]))
    AIC[i] <- fit$aic
    predictpr <- predict(fit, type = "response")
    ROC <- pROC::roc(temp_lake_dat$ct ~ predictpr)
    temp_lake_dat$PREDICTION <- predictpr
    AUC[i] <- pROC::auc(ROC)
    RSQUARED[i] <- 1 - (fit$deviance/fit$null.deviance)

    temp_lake_dat_permute <- temp_lake_dat %>%
      dplyr::mutate(PREDICTION = ifelse(as.numeric(PREDICTION) < 0.65, 0, 1))
    table <- table(Reality = temp_lake_dat_permute$ct,
                   Prediction = temp_lake_dat_permute$PREDICTION)
    ACCURACY[i] <- (table[1,1]+table[2+2])/sum(table)

    j <- 1
    nreps <- 1000
    AUC.repo <- rep(0, nreps)

    for(j in 1:nreps) {
      temp_lake_dat_permute$ct <- sample(temp_lake_dat_permute$ct,
                                                   size = length(temp_lake_dat_permute$ct),
                                                   replace = FALSE)
      predictpr <- predict(fit, type = "response")
      ROC <- pROC::roc(temp_lake_dat_permute$ct ~ predictpr, quiet = TRUE,
                       levels = c(0,1), direction = "<")
      AUC.repo[j] <- pROC::auc(ROC)
    }
    PVALUE[i] <- length(AUC.repo[AUC.repo > AUC[i]])/length(AUC.repo)

  }
  INDEX <- seq(1:length(model_result@formulas[c(1:num_models)]))
  lake_fits <- data.frame(INDEX, MODEL, AIC, RSQUARED, AUC, ACCURACY, PVALUE)
  lake_fits$MODEL <- as.character(lake_fits$MODEL)
  lake_fits$AIC <- as.numeric(lake_fits$AIC)
  lake_fits$RSQUARED <- as.numeric(lake_fits$RSQUARED)
  lake_fits$AUC <- as.numeric(lake_fits$AUC)
  lake_fits$ACCURACY <- as.numeric(lake_fits$ACCURACY)
  lake_fits$PVALUE <- as.numeric(lake_fits$PVALUE)

  lake_top_mod <- lake_fits %>%
    filter(AIC <= (min(AIC)+2)) %>%
    filter(RSQUARED >= median(RSQUARED),
           AUC >= median(AUC))

  lake_top_mod$MODEL <- gsub(pattern = "log00", replacement = "log10",
                             x = lake_top_mod$MODEL)


  lake_mod_fits <- map(.x = lake_top_mod$MODEL,
                      .f = ~ glm(formula = .x,
                                 family = "binomial",
                                 data = temp_lake_dat))

  out_path <- "permute_odem_model"
  # Create export folder if it doesn't exist
  ifelse(!dir.exists(file.path(out_path)),
         dir.create(file.path(out_path), recursive = TRUE), FALSE)

  # Export results
  if(length(lake_mod_fits) == 1) {

    results <- tidy(lake_mod_fits[[1]])

    write.csv(x = results,
              file = paste0(out_path, "/run_",
                            rdmi, ".csv"),
              row.names = FALSE)

  } else if (length(lake_mod_fits) == 0){

    tryCatch(write.csv(x = results,
                       file = paste0(out_path, "/run_",
                                     rdmi, ".csv"),
                       row.names = FALSE), error = function(e) NULL)

  } else {

    all_average <- model.avg(lake_mod_fits)

    results <- data.frame(summary(all_average)$coefmat.subset) %>%
      rename(estimate = Estimate,
             std.error = Std..Error,
             statistic = z.value,
             p.value = Pr...z..)
    results$term <- rownames(results)

    write.csv(x = results,
              file = paste0(out_path, "/run_",
                            rdmi, ".csv"),
              row.names = FALSE)
  }

}

run_files <- list.files(path = "permute_odem_model",
                        pattern = "run_", full.names = TRUE)

run_results <- map_df(.x = run_files,
                      .f = ~ read_csv(.x) %>%
                        select(estimate, term) %>%
                        mutate(run_number = .x,
                               run_number = gsub(pattern = "permute_odem_model/",
                                                 replacement = "", x = run_number),
                               run_number = gsub(pattern = ".csv",
                                                 replacement = "", x = run_number),
                               term = gsub(pattern = ".1",
                                           replacement = "", x = term)))


paramter_counts <- run_results %>%
  # filter(estimate >= -20,
  #        estimate <= 20) %>%
  unique() %>%
  filter(term != "(Intercept)") %>%
  mutate(term = str_replace(pattern = ":", replacement = ".", string = term),
         term = ifelse(term == "lo0(area)", "area", term),
         term = ifelse(term == "lo0(RT)", "RT", term),
         term = ifelse(term == "lo0(depth)", "depth", term)) %>%
  group_by(term) %>%
  count() #%>%
  #mutate(label = paste("Number of models:", n))

run_results_filtered <- run_results %>%
  ungroup() %>%
  # filter(estimate >= -10,
  #        estimate <= 10) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = str_replace(pattern = ":", replacement = ".", string = term),
         term = ifelse(term == "lo0(area)", "area", term),
         term = ifelse(term == "lo0(RT)", "RT", term),
         term = ifelse(term == "lo0(depth)", "depth", term),
         #term = str_replace(pattern = "", replacement = "", string = term),
         est_prob = exp(estimate)/(1+exp(estimate))) %>%
  unique() %>%
  inner_join(x = .,
             y = paramter_counts) %>%
  mutate(facet_label = paste(term, "(n =", n, ")"))

complete_results <- (model_averaged$coefficients)%>%
  data.frame() %>%
  rownames_to_column() %>%
  filter(rowname == "full") %>%
  rename(#"depth" = log10.depth.,
         "RT" = log10.RT.,
         "area" = log10.area.) %>%
  pivot_longer(cols = c(human_impact:RT), names_to = "term", values_to = "estimate")

run_results_filtered_plot <- run_results_filtered %>%
  mutate(term = ifelse(term == 'human_impact', 'Human impact', term),
         term = ifelse(term == 'area', 'Area', term),
         term = ifelse(term == 'depth', 'Depth', term),
         term = ifelse(term == 'eutro', 'Eutrophic probability', term),
         term = ifelse(term == 'RT', 'Residence time', term))

complete_results_plot <- complete_results %>%
  mutate(term = ifelse(term == 'human_impact', 'Human impact', term),
         term = ifelse(term == 'area', 'Area', term),
         term = ifelse(term == 'depth', 'Depth', term),
         term = ifelse(term == 'eutro', 'Eutrophic probability', term),
         term = ifelse(term == 'RT', 'Residence time', term))

all_plot <- ggplot() +
  geom_histogram(data = run_results_filtered_plot, aes(x = estimate)) +
  #geom_label(data = paramter_counts, aes(label = label, x = 0, y = 1000)) +
  geom_vline(data = complete_results_plot, aes(xintercept = estimate)) +
  ggtitle("Distribution of subsampled estimate values") +
  #xlim(-25, 25) +
  facet_wrap(vars(term), scales = "free_x")+ xlab('Parameter estimate') + ylab('Count') +
  theme_minimal(base_size = 15)

ggsave(file = 'analysis/figures/Figure3.png',
        dpi = 600, height = 6, width = 8)
jsta/odem.data documentation built on Feb. 10, 2023, 3:56 a.m.