In Sayani07/gravitas: Explore Probability Distributions for Bivariate Temporal Granularities

knitr::opts_chunk$set(
  echo = FALSE,
  collapse = TRUE,
  comment = "#>",
  fig.height = 5,
  fig.width = 8,
  fig.align = "center",
  cache = FALSE
)
library(knitr)
library(tidyverse)
library(broom)
library(lubridate)
library(lvplot)
#library(ggridges)
library(tsibble)
library(gravitas)
library(ggpubr)
library(readr)
library(kableExtra)
library(distributional)
library(ggplot2)
library(sugrrants)
library(here)
library(ggplot2)
library(patchwork)
library(scales)
library(GGally)
library(gghighlight)
#library(viridis)
#remotes::install_github("Sayani07/hakear", force = TRUE)

When there are too many possible displays

How to decide which might be the more interesting ones to display?
How to rank the most interesting ones?

These questions are answered using the functions wpd() and select_harmonies() which implements the distance metric $wpd$ (weighted pairwise distances) proposed in [@paper-hakear]. This vignette shows how to use these functions to screen and rank important cyclic granularities for a data set containing smart meter electricity usage for eight residential households in Melbourne.

Linear display of electricity usage data

The smart meter data set for eight households has been cleaned to form a tsibble containing half-hourly electricity consumption from July to December 2019 for each of the households.

elec <- read_rds(here("data/elec_all-8.rds")) %>% 
  dplyr::filter(date(reading_datetime) >= ymd("20190701"), date(reading_datetime) < ymd("20191231"), meter_id==1) %>% 
  select(-meter_id) %>% 
  rename("id" = "household_id",
         "date_time" = "reading_datetime") %>% 
  mutate(date = date(date_time))

elec %>% select(-date)

The data is plotted in different facets for each id from Jul-19 to Dec-19 in Fig a and it has been zoomed in for Sep-19 in Fig b, where a week in Sep-19 has been highlighted.

From the scales of Fig a, it is apparent that these households vary in consumption levels, but all their patterns have been squeezed with this representation.
In fig b, we can see some weekly patterns for the entire period and daily patterns for the highlighted week

elec_linear <- elec %>% 
  ggplot() +
  geom_line(aes(x = date_time, y = kwh),alpha = 0.7) +
  facet_wrap(~id, nrow = 8, labeller = "label_both",
             strip.position =  "right",
             scales = "free_y") + ggtitle("a")

elec_zoom <-  elec %>%
  as_tibble() %>% 
  filter(date >as.Date("2019-09-01") & date < (as.Date("2019-09-30"))) %>% 
  ggplot(aes(x=date_time, y = kwh)) +
  geom_line(size = 0.1, colour = "blue") +
  facet_wrap(~id, 
             scales = "free_y",
             ncol = 1,
             strip.position =  "right") +
  gghighlight(date > as.Date("2019-09-15") & date < (as.Date("2019-09-21")), unhighlighted_params = list(colour = "black")) + ggtitle("b")

elec_linear + elec_zoom

The facts that are not clear from this display are:

(1) which temporal patterns are present in each of these households? (2) How strong are the patterns, are they significant enough for further investigation? (3) Are some houses similar in few temporal patterns but different in others?

We answer these questions using the package.

Since electricity data generally have a daily, weekly and annual pattern, we consider all cyclic granularities that could be formed using temporal units "hour", "day", "week", "month", "wknd_wday".

Choose the set of granularities you are interested in using `search_gran()`.

elec %>% search_gran(highest_unit = "month",
                              filter_out = c("hhour", "fortnight"),
                     filter_in = "wknd_wday"
)

Find the = using `harmony()`

harmonies <- elec %>%
  harmony(
    ugran = "month",
    filter_in = "wknd_wday",
    filter_out = c("hhour", "fortnight")
  )

Find `wpd()` for each harmony for one customer

sm <- elec %>% filter(id==1)

all_harmony_id1 <- select_harmonies(sm,
 harmony_tbl = harmonies,
 response = kwh,
 nsamp = 20
)

tab2 <- all_harmony_id1
tab2

Find `wpd()` for each granularity for one customer

harmonies1 <- harmonies %>% mutate(facet_variable = NA)

h <- harmonies1 %>%
  select(-facet_levels) %>%
  distinct() %>%
  dplyr::mutate(facet_levels = NA)

all_harmony <- select_harmonies(sm,
 harmony_tbl = h,
 response = kwh,
 nsamp = 20
)

tab1 <- all_harmony %>% select(-facet_variable, -facet_levels)
tab1

Find `wpd()` for all ids

elec_split = elec %>% group_split(id)

elec_select_harmony = parallel::mclapply(1:8, function(x){

  data_id <-  elec_split %>% magrittr::extract2(x) %>%
    as_tsibble(index = date_time)


  hakear::select_harmonies(data_id,
                           harmony_tbl = harmonies,
                           response = kwh
  )
}, mc.cores = parallel::detectCores() - 1, mc.preschedule = FALSE, mc.set.seed = FALSE)

# OkabeIto color blind friendly scale
okabeito <- c("#D55E00", "#0072B2","#009E73", "#CC79A7", "#E69F00", "#56B4E9", "#F0E442", "#333333")

elec_harmony_all <- read_rds(here("data/elec_harmony_all.rds"))
select_split <- str_split(elec_harmony_all$select_harmony, " ", simplify = TRUE)[,2]

elec_sig_split <- elec_harmony_all %>% 
  bind_cols(select_split = select_split) %>% 
  mutate(significant = case_when(
    select_split == "***" ~ "highest",
    select_split == "**" ~ "high",
    select_split == "*" ~ "medium",
    select_split == "" ~ "low"
  )) %>% 
  mutate(
    rank = if_else(rank < 10, paste0("\\phantom{0}",rank), paste0(rank)),
    rank = paste0(rank,"\\rlap{$^{",select_split,"}$}")
  )


elec_sig_split$significant <- 
  factor(elec_sig_split$significant, levels = c("highest", "high", "medium", "low"))

elec_rank <- elec_sig_split %>% 
  select(-c(6, 7, 9, 10)) %>% 
  pivot_wider(
    names_from = id,
    values_from = rank) %>% 
  rename("facet variable" = "facet_variable",
         "x variable" = "x_variable") %>% 
  select(-facet_levels, -x_levels)

elec_rank %>% 
  kable(format = "markdown", 
        caption = "Ranking of harmonies for the eight households with significance marked for different thresholds. Rankings are different and at most three harmonies are significant for any household. The number of harmonies to explore are reduced from 42 to 3.")

Table \ref{tab:tab-rank-8} shows the rank of the harmonies for different households. The rankings are different for different households, which is a reflection of their varied behaviors. Most importantly, there are at most three harmonies that are significant for any household. This is a huge reduction in the number of potential harmonies to explore.

heatplot <- elec_sig_split %>% 
  mutate(significance_95 = if_else(significant %in% c("high", "highest"), "*", "")) %>% 
  ggplot(aes(x = x_variable,
             y = facet_variable)) +
  geom_tile(aes(fill = wpd)) + 
  #color = as.factor(significance_95)),
  #size = 0.3) +
  geom_text(aes(label = significance_95), color = "#42275a") +
  scale_fill_gradient(low = "white",high = "#ba5370") +
  #scale_fill_manual(palette = "Dark2") +
  #scale_colour_manual(values = c("white","red")) + 
  theme(legend.position = "none") +
  coord_fixed() + 
  guides(fill = guide_legend()) +
  theme_void() +
  theme_gray(base_size = 12, base_family = "Times") +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 60, hjust=1),legend.position = "bottom") + ggtitle("a") +
  theme(
    strip.background = element_blank(),
    strip.text.y  = element_blank(), plot.margin = unit(c(0, -2, 0, 0), "cm")) + ggtitle("(a)") + xlab("x") + ylab("facet") 

elec <- read_rds(here("data/elec_all-8.rds")) %>% 
  dplyr::filter(date(reading_datetime) >= ymd("20190701"), date(reading_datetime) < ymd("20191231"), meter_id==1) %>% 
  select(-meter_id) %>% 
  rename("id" = "household_id",
         "date_time" = "reading_datetime")


elec1 <- elec %>% 
  filter(id == "1") %>% 
  create_gran("day_week") %>% 
  create_gran("hour_day") %>% 
  mutate(day_week = as.numeric(day_week),
         hour_day = as.numeric(hour_day),
         id = as.numeric(id))


# data_elec_order <- elec %>% 
#   as_tibble() %>% 
#   group_by(id) %>% 
#   summarise(av = mean(kwh)) %>% 
#   arrange(av)
# 
# 
# 
# elec_zoom <-  elec %>%
#   as_tibble() %>%   
#   left_join(data_elec_order) %>%
#   arrange(av)  %>% 
#   dplyr::filter(date(date_time) > as.Date("2019-09-01") & date(date_time) < (as.Date("2019-09-30")))
# 
# elec_zoom$id = factor(elec_zoom$id, levels = data_elec_order$id)

elec_zoom <-  elec %>%
    as_tibble() %>%
    dplyr::filter(date(date_time) > as.Date("2019-09-01") & date(date_time) < (as.Date("2019-09-30"))) %>% 
  mutate(id = paste("id", id, sep = " ")) %>% 
  ggplot(aes(x=date_time, y = kwh)) +
  #geom_point(size = 0.1, colour = "black", alpha = 0.3) +
  geom_line(size = 0.5, aes(color = id), alpha = 1) +
  facet_wrap(~id, 
             scales = "free_y",
             ncol = 1,
             strip.position =  "right") + 
  xlab("Time [30m]") + 
  theme_grey() + 
  ylab("Energy demand (in Kwh)") + ggtitle("(b)") +
  theme(panel.grid.major.x = element_blank()) +
  scale_x_datetime("Date", date_labels = "%b %d",
                   breaks = "1 week",
                   date_minor_breaks = "1 day")  + theme_bw() +
  theme(panel.grid.major.x =  element_line(colour = "#A9A9A9"),
        panel.grid.minor.x =  element_line(colour = "#D3D3D3")) +
  theme(
    strip.text = element_text(size = 10, margin = margin(b = 0, t = 0))) +
  scale_color_discrete(type = okabeito)



##parallel-coordinate-plot

data_order <- elec_harmony_all %>% 
  mutate(comb = paste(facet_variable, x_variable, sep = "-")) %>% 
  group_by(comb) %>% 
  summarise(m = mean(wpd)) %>% 
  arrange(desc(m))


data_pcp <- elec_harmony_all %>% 
  mutate(wpd = as.numeric(wpd)) %>% 
  mutate(comb = paste(facet_variable, x_variable, sep = "-")) %>% 
  left_join(data_order) %>% 
  arrange(desc(m)) %>% 
  select(-c(2, 3, 4, 5, 7, 8, 10)) %>% 
  pivot_wider(., 
              names_from = comb, 
              values_from = wpd) 

parcoord <- GGally::ggparcoord(data_pcp,
                   columns = 2:ncol(data_pcp),
                   groupColumn = 1,
                   showPoints = FALSE, 
                   title = "(c)",
                   alphaLines = 1 ,
                   scale = "globalminmax"
) + ggplot2::theme_bw() +
  scale_color_discrete(type = okabeito) + 
  ggplot2::theme(
    plot.title = ggplot2::element_text(size=10)
  )+
  ggplot2::theme(axis.text.y = ggplot2::element_text(angle = 0)) +
  theme(legend.position = "bottom") +
  coord_flip() +
  xlab("") +
  ylab("wpd")


(heatplot + 
    theme(legend.position = "none") +
    facet_grid(id~.) +
  elec_zoom +
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))) +  
  parcoord + 
    plot_layout(widths = c(1, 3, 1.5), guides = "collect") &
    theme(legend.position='bottom') + 
  theme(panel.spacing = unit(0.2, "lines"))

According to Figure \ref{fig:dotplot-8}(c), for the harmony pair (dow-hod), household id 7 has the greatest value of $wpd$, while id 1 has the least. From Table \ref{tab:tab-rank-8} it can be seen that the harmony pair (dow, hod) is important for id 7, however it has been labeled as an inconsequential pair for id 1. The distribution of energy demand for both of these households, with dow as the facet and hod on the x-axis, may help explain the choice. Figure \ref{fig:gravitas-plot-8} demonstrates that for id 7, the median (black) and quartile deviation (orange) of energy consumption fluctuates for most hours of the day and days of the week, while for id 1, daily patterns are more consistent within weekdays and weekends. As a result, for id 1, it is more appropriate to examine the distributional difference solely across (dow, wdwnd), which has been rated higher in Table \ref{tab:tab-rank-8}.