knitr::opts_chunk$set(echo=FALSE, comment = '', fig.width = 6, fig.height = 6 ) library(tidyverse) library(pastecs) library(factoextra) library(cluster) library(DataExplorer) library(tidymodels) library(caret) library(broom) library(knitr) library(kableExtra) library(gtsummary) library(rlist) load("~/GitHub/s8Regressions/data/s8_data.rda") ps <- s8_data$paramgroup %>% unique() study_names <- s8_data$study_name study_names_list <- s8_data$study_name %>% unique() %>% sort() study_id <- study_names_list %>% as.factor() %>% as.numeric() types <- s8_data$type %>% unique() study_names_short <- c('sea','tac','clark','king','pierce','pos','pot','sno') lookup <- setNames(as.list(study_names_short), study_names_list)
s8.data.df <- s8_data %>% mutate(short_name = dplyr::recode(study_names, !!!lookup)) %>% mutate(site_name = paste(sep = "-", dplyr::recode(study_names, !!!lookup), type)) %>% filter(!short_name %in% c("clark")) %>% filter(!site_name %in% c("pierce-HDR","pierce-LDR")) %>% filter(sample_matrix %in% c("water", "Water", "WATER")) %>% filter(sample_replicate_flag != "Y") %>% filter(paramgroup %in% c("Nutrient","Metal")) %>% filter(!parameter %in% c( "Cadmium - Water - Dissolved", "Biochemical Oxygen Demand - Water - Total", "Cadmium - Water - Total", "Lead - Water - Dissolved", # "Magnesium - Water - Dissolved","Magnesium - Water - Total",# "Chloride - Water - Total", # "Total Suspended Solids - Water - Total", "Surfactants - Water - Total", # "Conductivity - Water - Total","Turbidity - Water - Total", "Ammonia - Water - Total", "Mercury - Water - Total", "Mercury - Water - Dissolved", "Arsenic - Water - Dissolved", "Calcium - Water - Dissolved", "pH - Water - Total", "Calcium - Water - Total" # "Hardness as CaCO3 - Water - Total", # "Magnesium - Water - Total" %>% )) %>% # remove outliers group_by(parameter) %>% filter(!((log(new_result_value) - median(log(new_result_value))) > 3 * sd(log(new_result_value)))) s8.data.df %>% select(c("site_name", "parameter", "new_result_value")) %>% pivot_wider(names_from = site_name, values_from = new_result_value, values_fn = length) %>% kable(caption = "Number of sampling events") %>% kable_classic(full_width = F) s8.data.df %>% select(c("site_name", "parameter", "nondetect_flag", "new_result_value")) %>% mutate(nondetect = (nondetect_flag == TRUE) %>% as.numeric()) %>% select(-c("nondetect_flag")) %>% group_by(site_name, parameter) %>% summarise(percent = sum(nondetect) / length(new_result_value)) %>% ungroup() %>% pivot_wider(id_cols = parameter, values_from = c(percent), names_from = site_name) %>% mutate_if(is.numeric, funs(paste0(round(., 2) * 100, "%"))) %>% kable(caption = "percent-nondetects summary") %>% kable_classic(html_font = "serif", full_width = T)
data.df <- s8.data.df %>% mutate(loc_date = paste0(site_name, "_", field_collection_start_date)) %>% pivot_wider( names_from = loc_date, id_cols = parameter, values_from = new_result_value, values_fn = mean ) %>% remove_rownames() %>% column_to_rownames("parameter") %>% log() %>% t() %>% as.data.frame() %>% drop_na() data.df2 <- data.df %>% scale(center = TRUE, scale = TRUE) %>% t() %>% as.data.frame() psych::pairs.panels(data.df,pch=21,cex=0.8, main='all sites',density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1) fviz_cluster(kmeans(data.df2,3), main = "all sites", subtitle = paste("n =", ncol(data.df2)), data = data.df2, repel = TRUE, show.clust.cent = FALSE, ellipse.type = "confidence", ellipse.level = 0.90)
data.df2.5 <- data.df %>% scale() %>% as.data.frame() %>% rownames_to_column('event') %>% #pivot_longer(cols = -c(cluster,event)) %>% add_column(site = stringr::str_sub(.$event, 1, 7)) all_event_clusters <- kmeans(data.df %>% scale() %>% as.data.frame(),3) data.df3<- data.df2.5 %>% add_column(cluster =as.factor(all_event_clusters$cluster)) colors <- rainbow(3)[unclass(data.df3$cluster)] psych::pairs.panels(data.df3 %>% select(-c(cluster,event)) %>% select(c(site,everything() )), pch=21,cex=0.8, main='all sites', density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1,bg=colors) data.df3.long <- data.df3 %>% # rownames_to_column('event') %>% pivot_longer(cols = -c(site,cluster,event)) ggplot(data.df3.long, aes(x = site, y = value))+geom_jitter(aes(color = cluster)) #add_column(site = stringr::str_sub(.$event, 1, 7))
short_names = unique(s8.data.df$short_name) site_names = unique(s8.data.df$site_name) i = 1 clus <- function(i) { data.df <- s8.data.df %>% #filter(site_name == site_names[i]) %>% mutate(loc_date = paste0(site_name, "_", field_collection_start_date)) %>% pivot_wider( names_from = loc_date, id_cols = parameter, values_from = new_result_value, values_fn = mean ) %>% remove_rownames() %>% column_to_rownames("parameter") %>% log() %>% t() %>% as.data.frame() %>% drop_na() data.df2 <- data.df %>% scale(center = TRUE, scale = TRUE) %>% t() %>% as.data.frame() km <- kmeans(data.df, 3) data.df3<- data.df %>% add_column(cluster =as.factor(km$cluster)) #colors <- c("red", "green", "blue")[unclass(data.df3$cluster)] pairs.plot <- psych::pairs.panels(data.df3 %>% select(-cluster),pch=21,cex=0.8, main=site_names[i],density=FALSE,ellipses=FALSE,stars=TRUE,cex.cor=1) # return(fviz_nbclust(data.df2, kmeans, method = "silhouette")) return(list( plot1 = pairs.plot, plot2 = fviz_cluster(kmeans(data.df2,3), main = site_names[i], subtitle = paste("n =", ncol(data.df2)), data = data.df2, repel = TRUE, show.clust.cent = FALSE, ellipse.type = "confidence", ellipse.level = 0.90) # # outplots[i] <- fviz_cluster(km.res1, data = data.df2,repel =TRUE,ellipse.type = "convex",show.clust.cent = FALSE) )) }
for (i in 1:length(site_names)){ try(print(clus(i),silent = TRUE)) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.