Nothing
## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
## ----setup--------------------------------------------------------------------
library(pcpr)
suppressPackageStartupMessages({
library(dplyr) # for manipulation of queens
library(GGally) # for the ggcorr fn
library(ggplot2) # for plotting
library(lubridate) # for manipulating dates
library(magrittr) # for the pipe %>%
library(tidyr) # for pivoting queens for better plotting
})
plot_matrix <- function(D, ..., lp = "none", title = NULL) {
D <- t(D)
if (is.null(colnames(D))) colnames(D) <- paste0("C", 1:ncol(D))
data.frame(D) %>%
dplyr::mutate(x = paste0("R", 1:nrow(.))) %>%
tidyr::pivot_longer(
tidyselect::all_of(colnames(D)),
names_to = "y",
values_to = "value"
) %>%
dplyr::mutate(
x = factor(x, levels = unique(x)),
y = factor(y, levels = unique(y))
) %>%
ggplot(aes(x = x, y = y, fill = value)) +
geom_raster() +
scale_y_discrete(limits = rev) +
coord_equal() +
scale_fill_viridis_c(na.value = "white", ...) +
theme_minimal() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
legend.position = lp,
plot.margin = margin(0, 0, 0, 0),
aspect.ratio = 1
) +
ggtitle(title)
}
## ----queens raw---------------------------------------------------------------
queens
## ----plot queens--------------------------------------------------------------
plot_matrix(queens[, 2:ncol(queens)])
## ----trends-------------------------------------------------------------------
queens %>%
pivot_longer(
colnames(queens)[-1],
names_to = "chem", values_to = "concentration"
) %>%
filter(!is.na(concentration)) %>%
ggplot(aes(x = Date, y = concentration)) +
geom_line() +
geom_smooth(color = "red", formula = "y ~ x", method = "loess", span = 0.05) +
facet_wrap(~chem, scales = "free_y") +
labs(x = "Date", y = "Concentration (µg/m^3)") +
theme_bw()
## ----dates--------------------------------------------------------------------
start_date <- "2015-01-01"
queens_small <- queens %>% filter(Date >= as.Date(start_date))
queens_small
## ----raw correlation----------------------------------------------------------
ggcorr(
queens_small[, 2:ncol(queens_small)],
method = "pairwise.complete.obs",
limits = FALSE, label = FALSE, size = 5
)
## ----distributions------------------------------------------------------------
queens_small %>%
pivot_longer(
colnames(queens_small)[-1],
names_to = "chem", values_to = "concentration"
) %>%
filter(!is.na(concentration)) %>%
ggplot(aes(x = concentration)) +
geom_histogram(bins = 50) +
theme_bw() +
facet_wrap(~chem, scales = "free")
## ----preprocessing------------------------------------------------------------
queens_scaled <- queens_small %>% select(-Date)
queens_scaled[queens_scaled < 0] <- 0
queens_scaled <- data.frame(scale(queens_scaled, center = FALSE))
queens_scaled$Date <- queens_small$Date
## ----new dists----------------------------------------------------------------
queens_scaled %>%
pivot_longer(
colnames(queens_small)[-1],
names_to = "chem", values_to = "concentration"
) %>%
filter(!is.na(concentration)) %>%
ggplot(aes(x = concentration)) +
geom_histogram(bins = 50) +
theme_bw() +
facet_wrap(~chem, scales = "free")
## ----sing---------------------------------------------------------------------
D <- as.matrix(queens_scaled %>% select(-Date))
D_imputed <- impute_matrix(D, apply(D, 2, mean, na.rm = TRUE))
singular_values <- sing(D_imputed)
plot(singular_values, type = "b")
## ----gs, eval=FALSE, echo=TRUE------------------------------------------------
# eta_0 <- get_pcp_defaults(D)$eta
# # to get progress bar, could wrap this
# # in a call to progressr::with_progress({ gs <- grid_search_cv(...) })
# gs <- grid_search_cv(
# D,
# pcp_fn = rrmc,
# grid = data.frame(eta = 10^seq(-1, 2, 1) * round(eta_0, 3)),
# r = 8,
# parallel_strategy = "multisession",
# num_workers = 16,
# verbose = FALSE
# )
# r_star <- gs$summary_stats$r[1]
# eta_star <- round(gs$summary_stats$eta[1], 3)
# gs$summary_stats
## ----gs results, eval=TRUE, echo=FALSE----------------------------------------
gs <- readRDS("rds_files/applied-gs.rds")
r_star <- gs$summary_stats$r[1]
eta_star <- round(gs$summary_stats$eta[1], 3)
gs$summary_stats
## ----pcp----------------------------------------------------------------------
pcp_model <- rrmc(D, r = r_star, eta = eta_star)
L <- pcp_model$L
S <- pcp_model$S
## ----obj----------------------------------------------------------------------
plot(pcp_model$objective, type = "l")
## ----output L-----------------------------------------------------------------
plot_matrix(L)
L_rank <- matrix_rank(L)
L_rank
## ----orig corr----------------------------------------------------------------
ggcorr(
queens_small[, 2:ncol(queens_small)],
method = "pairwise.complete.obs",
limits = FALSE, label = FALSE, size = 5
)
## ----new corr-----------------------------------------------------------------
L_df <- data.frame(L)
colnames(L_df) <- colnames(queens[, 2:ncol(queens)])
ggcorr(
L_df,
method = "pairwise.complete.obs",
limits = FALSE, label = FALSE, size = 5
)
## ----sparse-------------------------------------------------------------------
sparsity(S)
hist(S)
plot_matrix(S)
## ----sparse by col------------------------------------------------------------
S_df <- data.frame(S)
colnames(S_df) <- colnames(queens_small[, 2:ncol(queens_small)])
chems_w_most_extreme_events <- sort(apply(S_df, 2, function(x) sparsity(as.matrix(x))))
chems_w_most_extreme_events
## ----K sparse events----------------------------------------------------------
S_df %>%
select(K) %>%
mutate(Date = queens_small$Date) %>%
filter(K > 0)
## ----NMF, eval=FALSE, echo=TRUE-----------------------------------------------
# library(NMF)
#
# nmf_mat <- L
# nmf_mat[nmf_mat < 0] <- 0
#
# res <- nmf(
# nmf_mat,
# rank = L_rank,
# method = "offset", nrun = 30,
# seed = 0, .opt = "vp16"
# )
#
# raw_scores <- basis(res)
# raw_loadings <- coef(res)
## ----load NMF, eval=TRUE, echo=FALSE------------------------------------------
raw_scores <- readRDS("rds_files/applied-nmf-raw-scores-W.rds")
raw_loadings <- readRDS("rds_files/applied-nmf-raw-loadings-H.rds")
## ----loadings-----------------------------------------------------------------
loadings <- data.frame(raw_loadings)
colnames(loadings) <- colnames(L_df)
loadings[["Pattern"]] <- paste("Pattern", 1:L_rank)
loadings %>%
pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>%
ggplot(aes(x = Chemical, y = Loading)) +
geom_point(size = 2) +
geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) +
facet_wrap(~Pattern, scales = "free_x") +
theme_bw() +
theme(
strip.text.x = element_text(size = 12),
title = element_text(size = 16),
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
strip.background = element_rect(fill = "white"),
axis.title.x = element_blank(),
axis.title.y = element_blank()
) +
geom_hline(yintercept = 0, linewidth = 0.2) +
ggtitle("PCP-NMF Loadings")
## ----var exp------------------------------------------------------------------
nmf_scores <- data.frame(raw_scores)
colnames(nmf_scores) <- c(
"Traffic", "Crustal Dust", "Salt", "Regional/Secondary & Tailpipe Emissions"
)
nmf_sources <- nmf_scores %>%
mutate(Date = queens_scaled$Date) %>%
pivot_longer(-Date, names_to = "Pattern", values_to = "Score")
var_explained <- nmf_sources %>%
group_by(Pattern) %>%
summarize(patsum = sum(Score)) %>%
mutate(VarianceExplained = round(100 * patsum / sum(patsum), 1)) %>%
select(-patsum) %>%
arrange(desc(VarianceExplained)) %>%
mutate(PatName = paste(Pattern, paste0("(", VarianceExplained, "% Var Exp)")))
nmf_sources <- nmf_sources %>%
mutate(
Pattern = factor(
Pattern,
levels = as.character(var_explained$Pattern),
labels = as.character(var_explained$PatName)
)
)
var_explained %>% select(-PatName)
## ----rename patterns----------------------------------------------------------
loadings %>%
mutate(Pattern = factor(
colnames(nmf_scores),
levels = as.character(var_explained$Pattern),
labels = as.character(var_explained$PatName)
)) %>%
pivot_longer(-Pattern, names_to = "Chemical", values_to = "Loading") %>%
ggplot(aes(x = Chemical, y = Loading, color = Pattern)) +
scale_color_brewer(palette = "Set1") +
geom_point(size = 2) +
geom_segment(aes(yend = 0, xend = Chemical), linewidth = 1) +
facet_wrap(~Pattern, scales = "free_x") +
theme_bw() +
theme(
strip.text.x = element_text(size = 12),
title = element_text(size = 16),
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
strip.background = element_rect(fill = "white"),
axis.title.x = element_blank(),
axis.title.y = element_blank()
) +
geom_hline(yintercept = 0, linewidth = 0.2) +
ggtitle("PCP-NMF Loadings")
## ----NMF scores---------------------------------------------------------------
yr_num <- length(unique(year(nmf_sources$Date)))
ggplot(nmf_sources, aes(Date, Score, color = Pattern)) +
scale_color_brewer(palette = "Set1") +
geom_smooth(method = "loess", span = 0.05) +
facet_wrap(~Pattern, scales = "free", ncol = 1) +
xlab("") +
ylab("Pattern Score") +
theme_classic() +
theme(
legend.position = "none",
title = element_text(size = 18),
axis.title.y = element_text(size = 18),
strip.text = element_text(size = 14),
axis.text.x = element_text(size = 13),
axis.text.y = element_text(size = 11)
) +
scale_x_date(date_labels = "%Y", date_breaks = "1 year") +
ggtitle("PCP-NMF Scores over Time")
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.