knitr::opts_chunk$set(echo=FALSE,
  comment = '', fig.width = 6, fig.height = 6
)



library(tidyverse)
library(pastecs)
library(factoextra)
library(cluster)
library(DataExplorer)
library(tidymodels)
library(caret)
library(broom)
library(knitr)
library(kableExtra)
library(gtsummary)
library(rlist)



load("~/GitHub/s8Regressions/data/s8_data.rda")


ps <- s8_data$paramgroup %>% unique()
study_names <- s8_data$study_name
study_names_list <- s8_data$study_name %>% unique() %>% sort()

study_id <- study_names_list %>%  as.factor() %>% as.numeric()
types <- s8_data$type %>% unique()

study_names_short <- c('sea','tac','clark','king','pierce','pos','pot','sno')


lookup <- setNames(as.list(study_names_short), study_names_list)
s8.data.df <- s8_data %>%
  mutate(short_name = dplyr::recode(study_names, !!!lookup)) %>%
  mutate(site_name = paste(sep = "-", dplyr::recode(study_names, !!!lookup), type)) %>%
  filter(!short_name %in% c("clark")) %>%
  filter(!site_name %in% c("pierce-HDR","pierce-LDR")) %>%
  filter(sample_matrix %in% c("water", "Water", "WATER")) %>%
  filter(sample_replicate_flag != "Y") %>%
  filter(paramgroup %in% c("Nutrient","Metal")) %>%
  filter(!parameter %in% c(
    "Cadmium - Water - Dissolved",
    "Biochemical Oxygen Demand - Water - Total",
    "Cadmium - Water - Total",
    "Lead - Water - Dissolved",
    # "Magnesium - Water - Dissolved","Magnesium - Water - Total",# "Chloride - Water - Total",
    # "Total Suspended Solids - Water - Total",
    "Surfactants - Water - Total",
    # "Conductivity - Water - Total","Turbidity - Water - Total",
    "Ammonia - Water - Total",
    "Mercury - Water - Total", "Mercury - Water - Dissolved", "Arsenic - Water - Dissolved",
    "Calcium - Water - Dissolved",
    "pH - Water - Total", "Calcium - Water - Total"
    # "Hardness as CaCO3 - Water - Total",
    # "Magnesium - Water - Total" %>%
  )) %>%
  # remove outliers
  group_by(parameter) %>%
  filter(!((log(new_result_value) - median(log(new_result_value))) > 3 * sd(log(new_result_value))))


s8.data.df %>%
  select(c("site_name", "parameter", "new_result_value")) %>%
  pivot_wider(names_from = site_name, values_from = new_result_value, values_fn = length) %>%
  kable(caption = "Number of sampling events") %>%
  kable_classic(full_width = F)

s8.data.df %>%
  select(c("site_name", "parameter", "nondetect_flag", "new_result_value")) %>%
  mutate(nondetect = (nondetect_flag == TRUE) %>% as.numeric()) %>%
  select(-c("nondetect_flag")) %>%
  group_by(site_name, parameter) %>%
  summarise(percent = sum(nondetect) / length(new_result_value)) %>%
  ungroup() %>%
  pivot_wider(id_cols = parameter, values_from = c(percent), names_from = site_name) %>%
  mutate_if(is.numeric, funs(paste0(round(., 2) * 100, "%"))) %>%
  kable(caption = "percent-nondetects summary") %>%
  kable_classic(html_font = "serif", full_width = T)
data.df <- s8.data.df %>%
    mutate(loc_date = paste0(site_name, "_", field_collection_start_date)) %>%
    pivot_wider(
      names_from = loc_date, id_cols = parameter,
      values_from = new_result_value,
      values_fn = mean
    ) %>%
    remove_rownames() %>%
    column_to_rownames("parameter") %>%
    log() %>%
    t() %>%
    as.data.frame() %>%
    drop_na()

data.df2 <- data.df %>%
    scale(center = TRUE, scale = TRUE) %>%
    t() %>%
    as.data.frame()



psych::pairs.panels(data.df,pch=21,cex=0.8, main='all sites',density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1)



fviz_cluster(kmeans(data.df2,3), main = "all sites", subtitle = paste("n =", ncol(data.df2)), data = data.df2, repel = TRUE, show.clust.cent = FALSE, ellipse.type = "confidence", ellipse.level = 0.90)
data.df2.5 <- data.df %>% scale() %>% as.data.frame() %>%

   rownames_to_column('event') %>%
  #pivot_longer(cols = -c(cluster,event)) %>%
  add_column(site = stringr::str_sub(.$event, 1, 7))

all_event_clusters <- kmeans(data.df %>% scale() %>% as.data.frame(),3)
data.df3<- data.df2.5 %>% add_column(cluster =as.factor(all_event_clusters$cluster))



colors <- rainbow(3)[unclass(data.df3$cluster)]
psych::pairs.panels(data.df3 %>%
                      select(-c(cluster,event)) %>%
                      select(c(site,everything() )),
                      pch=21,cex=0.8,
                    main='all sites', density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1,bg=colors)

data.df3.long <- data.df3 %>%
 # rownames_to_column('event') %>%
  pivot_longer(cols = -c(site,cluster,event))

ggplot(data.df3.long, aes(x = site, y = value))+geom_jitter(aes(color = cluster))
  #add_column(site = stringr::str_sub(.$event, 1, 7))
short_names = unique(s8.data.df$short_name)
site_names =  unique(s8.data.df$site_name)
i = 1

clus <- function(i) {

 data.df <- s8.data.df %>%
    #filter(site_name == site_names[i]) %>%
    mutate(loc_date = paste0(site_name, "_", field_collection_start_date)) %>%
    pivot_wider(
      names_from = loc_date, id_cols = parameter,
      values_from = new_result_value,
      values_fn = mean
    ) %>%
    remove_rownames() %>%
    column_to_rownames("parameter") %>%
    log() %>%
    t() %>%
    as.data.frame() %>%
    drop_na()

data.df2 <- data.df %>%
    scale(center = TRUE, scale = TRUE) %>%
    t() %>%
    as.data.frame()

km <- kmeans(data.df, 3)
data.df3<- data.df %>% add_column(cluster =as.factor(km$cluster))
#colors <-  c("red", "green", "blue")[unclass(data.df3$cluster)]
pairs.plot <- psych::pairs.panels(data.df3 %>% select(-cluster),pch=21,cex=0.8, main=site_names[i],density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1)
  # return(fviz_nbclust(data.df2, kmeans, method = "silhouette"))

  return(list(
    plot1 = pairs.plot,
    plot2 =
      fviz_cluster(kmeans(data.df2,3), main = site_names[i], subtitle = paste("n =", ncol(data.df2)), data = data.df2, repel = TRUE, show.clust.cent = FALSE, ellipse.type = "confidence", ellipse.level = 0.90)
# # outplots[i] <- fviz_cluster(km.res1, data = data.df2,repel =TRUE,ellipse.type = "convex",show.clust.cent = FALSE)
))
}
for (i in 1:length(site_names)){

  try(print(clus(i),silent = TRUE))
}


stormwaterheatmap/s8Regressions documentation built on Dec. 23, 2021, 6:38 a.m.