## Number of principal components to keep
npc <- 5

Regional patterns at fixed time.

The first thing we want to do is to consolidate from the 14 regions to a manageable number of single-timestep features. We'll try principal components analysis for this.

rgns <- unique(residual_load[['region']])
residual_table <- tidyr::spread(residual_load, region, residual_load)
residual_matrix <- as.matrix(residual_table[rgns])
## Set center to FALSE because the data is already centered.
resid_pca <- prcomp(residual_matrix, center=FALSE)$rotation[,1:npc])

rotpltdata <- tibble::as_tibble(resid_pca$rotation[,1:npc]) %>%
    dplyr::mutate(region=row.names(resid_pca$rotation)) %>%
    tidyr::gather('PC', 'value', -region)
pltpc <- ggplot(data=rotpltdata, aes(x=region, y=value, linetype=PC, group=PC)) + 
    geom_line(color='lightgrey') +
    ggthemes::theme_solarized_2(light=FALSE, base_size = 8)

I guess we'll need to decide how many components to keep, but for now we'll forge ahead with r npc just to see what we get.

pcseries <- dplyr::select(residual_table, EconYear, quarter, week, time) %>%

Here's a sample of two weeks in October 2006. Most of these principal components are pretty regular. There is evidently a fairly strong diurnal signal in most of the components, though the strength of those components does vary from day to day.

pltdata <- tidyr::gather(pcseries, 'PC', 'value', -EconYear, -quarter, -week, -time) %>%
    dplyr::filter(EconYear==2006, quarter==4, week<4, week>1)
ggplot(data=pltdata, aes(x=time, y=value, linetype=PC)) + geom_line(color='lightgrey') +

Here is Thanksgiving week and the week before for the same year.

pltdata <- tidyr::gather(pcseries, 'PC', 'value', -EconYear, -quarter, -week, -time) %>%
    dplyr::filter(EconYear==2006, time >= '2006-11-12', 
                  time <= '2006-11-26')
pltnov <- 
    ggplot(data=pltdata, aes(x=time, y=value, linetype=PC, color=PC)) + geom_line() +
    scale_x_datetime(date_breaks='2 days') +
    ggthemes::scale_color_solarized() +

Time series patterns

To do the time series analysis, first observe that our basic unit of analysis is the calendar week. So, let each week be a data point. For each week, we want all 168 values of the principal components for that week.

pcats <- dplyr::select(residual_table, EconYear, quarter, week, time) %>%
  dplyr::group_by(EconYear, quarter, week) %>% 
  dplyr::mutate(weekhour = timesince(time)) %>%
pcats <- cbind(pcats,$x))
## Do the analysis for just PC1 and PC2 for now
pc1weeks <- tidyr::spread(dplyr::select(pcats, EconYear, quarter, week, weekhour, PC1),
                          weekhour, PC1)
pc2weeks <- tidyr::spread(dplyr::select(pcats, EconYear, quarter, week, weekhour, PC2),
                          weekhour, PC2)
pc1mat <- as.matrix(dplyr::select(pc1weeks, -EconYear, -quarter, -week))
pc2mat <- as.matrix(dplyr::select(pc2weeks, -EconYear, -quarter, -week))
pcatime1 <- prcomp(pc1mat, center=FALSE)
pcatime2 <- prcomp(pc2mat, center=FALSE)
plot(pcatime1$rotation[,1], type='l')
plot(pcatime2$rotation[,1], type='l')

Oops. We didn't center because the data had already been standardized, but they have been standardized by region, not by hour of the week. Clearly there is a strong diurnal dependence for both regional components. Let's rerun with the observations centered.

pcatime1_ctr <- prcomp(pc1mat, center=TRUE)
pcatime2_ctr <- prcomp(pc2mat, center=TRUE)
calc_varfrac <- function(pcstr, npc=3, pctstring=FALSE) {
    pwr <- pcstr$sdev^2
    vf <- cumsum(pwr/sum(pwr))[npc]
    if(pctstring) {
      paste0(signif(100*vf,2), '%')
    else {
pltpcstruct <- function(pcstr, title='', npc=3) {
    varfrac <- calc_varfrac(pcstr)
    subtitle <- paste('Variance fraction:', calc_varfrac(pcstr, pctstring=TRUE))
    pcf <-$rotation[,1:npc, drop=FALSE])
    pcf$hour <- seq(0, nrow(pcf)-1)
    pltdata <- tidyr::gather(pcf, 'component', 'weight', -hour)
    ggplot(data=pltdata, aes(x=hour, y=weight, color=component)) + geom_line(size=1.25) +
      theme_bw() + ggtitle(title, subtitle) + ggthemes::scale_color_solarized('red') +
      scale_x_continuous(breaks=seq(0,168,24), minor_breaks=seq(0,168,24)+12)
pltpcstruct(pcatime1_ctr, 'Regional component 1')
pltpcstruct(pcatime2_ctr, 'Regional component 2')

Here we have three temporal components for each regional component. For regional component 1 (mostly CONUS), these three components account for r calc_varfrac(pcatime1_ctr, pctstring=TRUE) of the total variation in the data. In component 2 (mostly Alaska and Hawaii), the first component alone accounts for r calc_varfrac(pcatime2_ctr, npc=1, pctstring=TRUE) of the total variation. For both regional components, the first temporal component has some diurnal variation, but it is small (compared to the other components), and all of the coefficients have the same sign. In other words, this component represents a weighted sum of the departure from the mean value for the particular hour of the week. For component 2, this is almost all of the information in the time series. For component 1, the second temporal component appears to encode a difference between morning and evening values, while the third appears to encode a weekly trend.

JGCRI/eiamod documentation built on Feb. 21, 2020, 9:16 a.m.