load_all()
library(tidyverse)
library(lubridate)
library(ggplot2)
col_record <- pull_collections_active()
col_dates <- col_record %>% group_by(group) %>%
summarize(date=as.Date(max(collection_datetime)))
# remove outlier data - unrealistic mass/pc ratios (both high and low)
col_record$ratio <- col_record$dwp_mass/col_record$sumpc_total
quantile_filter <- 0.9
filter_data <- col_record %>%
filter((ratio < quantile(col_record$ratio, probs=quantile_filter, na.rm=T) &
1/ratio < quantile(1/col_record$ratio, probs=quantile_filter, na.rm=T)) |
is.na(ratio))
# remove sumpc_total > 700, screws up Poisson regression model
# remove dwp_mass > 20, focus on exceedance order of magnitude
# remvoe SF Study area Sensits
filter_data2 <- filter_data %>% filter(sumpc_total<700 & dwp_mass<20) %>%
filter(group!='SF Studies')
filter_data2$group2 <- filter_data2$group
filter_data3 <- filter_data2
filter_data3[filter_data3$group!='TwB2', ]$group <- 'Other Areas'
filter_data3$group <- factor(filter_data3$group, ordered=T)
# convert mass to flux using cox opening area constant
filter_data3$dwp_flux <- filter_data3$dwp_mass/1.2
# convert decimal flux into counts of 0.1g for use in Poisson regression
filter_data3$dwp_flux_count <- round(filter_data3$dwp_flux * 10, 0)
# add T/F exceedence flag for logistic regression
flux_threshold <- c('TwB2'=0.5, 'Other Areas'=5)
filter_data3$flag <- rep(NA, nrow(filter_data3))
for (i in 1:nrow(filter_data3)){
thresh <- flux_threshold[[as.character(filter_data3$group[i])]]
filter_data3$flag[i] <-
if_else(filter_data3$dwp_flux[i]>thresh, T, F)
}
predict_list <- vector(mode="list", length=length(unique(filter_data3$sensit)))
names(predict_list) <- unique(filter_data3$sensit)
lm_summary <- data.frame(sensit=c(), n=c(), coeff=c(), st.err=c(),
r.sq=c(), p=c(), group2=c())
for (i in names(predict_list)){
model_data <- filter_data3 %>% filter(sensit==i)
# build linear regression models
predict_list[[i]] <- lm(dwp_flux ~ sumpc_total + 0, data=model_data)
predict_list[[i]]$data <- cbind(model_data,
"fitted"=predict_list[[i]]$fitted)
lm_summary <-
rbind(lm_summary,
data.frame(sensit=i, n=nrow(model_data),
coeff=summary(predict_list[[i]])$coefficients[1],
st.err=summary(predict_list[[i]])$coefficients[2],
r.sq=summary(predict_list[[i]])$r.squared,
p=summary(predict_list[[i]])$coefficients[4],
group2=unique(model_data$group2)))
}
logistic_list <- vector(mode="list", length=2)
names(logistic_list) <- unique(filter_data3$group)
for (i in names(logistic_list)){
model_data <- filter_data3 %>% filter(group==i)
# build logistic regression models
logistic_list[[i]] <- glm(flag ~ sumpc_total, data=model_data,
family=binomial)
logistic_list[[i]]$data <- cbind(model_data,
"logistic.prob"=logistic_list[[i]]$fitted)
}
plot_data <- lm_summary[complete.cases(lm_summary), ]
p1 <- ggplot(plot_data, aes(x=n, y=r.sq)) +
geom_point(aes(color=group2)) +
ggrepel::geom_label_repel(data=filter(plot_data, r.sq<0.6),
aes(label=sensit)) +
ylab("Model Rsquared") + xlab("# of collections in model") +
theme(legend.position=c(1, 0),
legend.justification=c(1, 0))
png(file="./data/rsquared_plot.png", height=6, width=6, units="in", res=300)
print(p1)
dev.off()
save(predict_list, logistic_list, file="./data/model.RData")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.