knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
library(disag)
library(dplyr)
library(forcats)
library(gganimate)
library(plotly)
library(broom)
library(tidyr)
library(patchwork)
data(madagascar_malaria)

Data visualisation is a difficult problem with aggregated outputs. Even standard plots like scatter plots do not work because there are many covariate values for each response value. Simple plots, like using the aggregate output value for all covariate values don't make much sense. There are two particular problems that should be addressed. Firstly, the methods should scale to hundreds or thousands of aggregate output datapoints. Secondly, the methods should be vaguely capable of detecting the importance of nonlinearity. By this I mean, if it is the very large values of a covariate that drive the output, this should be at least in principle visable in the plots. This exludes using the group mean of the covariates, or removing "outliers". Ideally, the methods should also be able to accept weights.

p1 <- 
  ggplot(madagascar_malaria, aes(longitude, latitude, fill = LSTmean)) + 
  geom_raster() + 
  coord_equal() +
  theme_minimal() +
  scale_fill_viridis_c() +
  ggtitle('Temperature')


p2 <- 
ggplot(madagascar_malaria, aes(longitude, latitude, fill = case_rate)) + 
  geom_raster() + 
  coord_equal() +
  theme_minimal() +
  ggtitle('Case Rate')


p1 + p2
ggplot(madagascar_malaria, aes(LSTmean, case_rate, colour = as.numeric(ID))) + 
  geom_point() +
  scale_colour_viridis_c()
ggplot(madagascar_malaria, aes(LSTmean, case_rate, colour = as.numeric(ID), alpha = pop)) + 
  geom_point() +
  scale_colour_viridis_c()

While subsetting the data is definitely not ideal, we can perhaps pick out some interesting groups.

maxx <-
  madagascar_malaria %>% 
    arrange(desc(LSTmean)) %>% 
    head(10) %>% 
    pull(ID)

minx <- 
  madagascar_malaria %>% 
    arrange(LSTmean) %>% 
    head(10) %>% 
    pull(ID)



maxy <- 
  madagascar_malaria %>% 
    arrange(desc(case_rate)) %>% 
    head(10) %>% 
    pull(ID)


miny <- 
  madagascar_malaria %>% 
    arrange(case_rate) %>% 
    head(10) %>% 
    pull(ID)


rand <-
  madagascar_malaria %>% 
    filter(!(ID %in% c(maxx, minx, maxy, miny))) %>% 
    distinct(ID) %>% 
    sample_n(10) %>% 
    pull(ID)

madagascar_malaria %>% 
  filter(ID %in% c(minx, maxx, miny, maxy, rand)) %>% 
  ggplot(aes(LSTmean, case_rate, colour = as.numeric(ID), 
             alpha = log10(pop), group = factor(ID))) + 
    geom_point() +
    scale_colour_viridis_c() +
    geom_path()
ggplot(madagascar_malaria, aes(LSTmean, case_rate, colour = as.numeric(ID), alpha = pop)) + 
  geom_point()

Here are some basic ideas that I don't think work very well.

madagascar_malaria %>% 
  ggplot(aes(y = case_rate, x = LSTmean, colour = as.numeric(ID), 
             group = factor(ID))) +
    geom_point() +
    geom_path() + 
    scale_colour_viridis_c()


madagascar_malaria %>% 
  ggplot(aes(y = LSTmean, x = fct_reorder(factor(ID), case_rate), colour = case_rate, group = ID)) +
    geom_violin() +
    #geom_point(alpha = 0.5) +
    coord_flip()+
    ylab('Pixel LSTmean') +
    labs(colour = 'Disease Rate') +
    xlab('Site id (ordered by disease rate)') +
    theme(text = element_text(size = 20),
          axis.text.y = element_blank())

Another way we could consider plotting these is to summarise each aggregate output as a weighted proportion of covariates that are over a particular value. And then plot this against the aggregate output. However, we would want to sweep over various values of the threshold value. Therefore, small multiples, 3D plots and annimated plots are all potentially useful.

First we will do a single plot.

madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = weighted.mean(LSTmean > 0, w = pop),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate)) + 
    geom_point() +
    xlab("Prop LSTmean > 0")

Now try small multiples.

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.1, 0.9, length.out = 25))

madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(LSTmean > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate)) + 
    geom_point() +
    facet_wrap(~ threshold)

Lets add some bells and whistels

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.1, 0.9, length.out = 25))

madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(LSTmean > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    facet_wrap(~ threshold)

And lets try 16 classes instead of 25.

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.1, 0.9, length.out = 16))

madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(LSTmean > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    facet_wrap(~ threshold)

Finally lets try a different covariate.

thresholds_sm <- quantile(madagascar_malaria$EVI, seq(0.1, 0.9, length.out = 16))

madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(EVI > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    facet_wrap(~ threshold)

Now let's animate it.

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.9, 0.1, length.out = 100))

anim <- 
madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(LSTmean > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    transition_states(threshold) + 
    labs(title = 'threshold: {closest_state}')


gganimate::animate(anim, renderer = gifski_renderer())

Now let's animate it.

thresholds_sm <- quantile(madagascar_malaria$EVI, seq(1, 0.0, length.out = 50))

anim <- 
madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(EVI > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    transition_states(threshold)

animate(anim, renderer = gifski_renderer())

Plot of the linear model parameters.

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.9, 0.1, length.out = 100))


thresh_models_summary <- 
  madagascar_malaria %>% 
    group_by(ID) %>% 
    summarise(prop = sapply(thresholds_sm, 
                            \(x) weighted.mean(LSTmean > x, w = pop)),
              threshold = round(thresholds_sm, 3),
              case_rate = mean(case_rate)) %>% 
    group_by(threshold) %>% 
    do(fitThresh = tidy(lm(case_rate ~ prop, data = .), conf.int = TRUE)) %>% 
    unnest(fitThresh)



thresh_models_summary %>% 
  filter(term == 'prop') %>% 
  ggplot(aes(x = threshold, y = estimate)) + 
    geom_ribbon(aes(ymin = conf.low, ymax = conf.high), fill = 'steelblue') +
    geom_line()

Plot of the linear model parameters with simulated data.

sim <- madagascar_malaria

sim$pixel_case <- ((sim$LSTmean / 10) ^ 2) * sim$pop

sim$pixel_case_pois <- sapply(sim$pixel_case, function(x) rpois(1, x))

sim <- 
  sim %>% 
    group_by(ID) %>% 
    mutate(cases = sum(pixel_case_pois),
           case_rate = cases / agg_pop)

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.9, 0.1, length.out = 25))



sim %>% 
  group_by(ID) %>% 
  summarise(prop = sapply(thresholds_sm, 
                          \(x) weighted.mean(LSTmean > x, w = pop)),
            threshold = round(thresholds_sm, 3),
            case_rate = mean(case_rate)) %>% 
  ggplot(aes(x = prop, y = case_rate, colour = threshold)) + 
    geom_point() +
    geom_smooth(method = 'lm', colour = 'black') +
    facet_wrap(~ threshold)


thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.9, 0.1, length.out = 100))


thresh_models_summary <- 
  sim %>% 
    group_by(ID) %>% 
    summarise(prop = sapply(thresholds_sm, 
                            \(x) weighted.mean(LSTmean > x, w = pop)),
              threshold = round(thresholds_sm, 3),
              case_rate = mean(case_rate)) %>% 
    group_by(threshold) %>% 
    do(fitThresh = tidy(lm(case_rate ~ prop, data = .), conf.int = TRUE)) %>% 
    unnest(fitThresh)



thresh_models_summary %>% 
  filter(term == 'prop') %>% 
  ggplot(aes(x = threshold, y = estimate)) + 
    geom_ribbon(aes(ymin = conf.low, ymax = conf.high), fill = 'steelblue') +
    geom_line()

Now let's do a 3D version.

thresholds_sm <- quantile(madagascar_malaria$LSTmean, seq(0.1, 0.9, length.out = 25))


threshold_df <- 
  madagascar_malaria %>% 
    group_by(ID) %>% 
    summarise(prop = sapply(thresholds_sm, 
                            \(x) weighted.mean(LSTmean > x, w = pop)),
              threshold = round(thresholds_sm, 3),
              case_rate = mean(case_rate)) %>% 
    mutate(ID = as.numeric(ID))


fig <- plot_ly(threshold_df, x = ~prop, y = ~threshold, z = ~case_rate,
              color = ~ID, colorscale='Viridis')
#fig <- fig %>% add_markers()
fig <- fig %>% layout(scene = list(xaxis = list(title = 'Proportion'),
                     yaxis = list(title = 'Threshold value'),
                     zaxis = list(title = 'Case Rate')))

fig

htmlwidgets::saveWidget(fig, file = '../../disag_outputs/lst_3d.html')

Something about quantiles instead

N <- 25
diff <- 5
quants <- seq(0.01, 0.99, length.out = N)
quants2 <- quants[1:(N - diff)]
quants <- quants[(diff + 1):N]


in_group_quants <- 
  madagascar_malaria %>% 
    group_by(ID) %>% 
    summarise(lower = sapply(seq_along(quants), 
                            \(i) quantile(LSTmean, c(quants2[i]))),
              upper = sapply(seq_along(quants), 
                            \(i) quantile(LSTmean, c(quants[i]))),
              lower_quant = round(quants2, 2),
              upper_quant = round(quants, 2),
              case_rate = mean(case_rate))


in_group_quants %>% 
  ggplot(aes(x = lower, y = upper, colour = case_rate)) + 
    geom_point() +
    # geom_smooth(method = 'lm', colour = 'black') +
    facet_wrap(~ lower_quant) +
    scale_colour_viridis_c()

Animated quantiles instead

N <- 100
diff <- 40
quants <- seq(0.01, 0.99, length.out = N)
quants2 <- quants[1:(N - diff)]
quants <- quants[(diff + 1):N]

in_group_quants <- 
  madagascar_malaria %>% 
    group_by(ID) %>% 
    summarise(lower = sapply(seq_along(quants), 
                            \(i) quantile(LSTmean, c(quants2[i]))),
              upper = sapply(seq_along(quants), 
                            \(i) quantile(LSTmean, c(quants[i]))),
              lower_quant = round(quants2, 2),
              upper_quant = round(quants, 2),
              case_rate = mean(case_rate))

anim <- 
  in_group_quants %>% 
    ggplot(aes(x = lower, y = upper, colour = case_rate, size = case_rate)) + 
      geom_point() +
      # geom_smooth(method = 'lm', colour = 'black') +
      transition_states(lower_quant) +
      scale_colour_viridis_c()
animate(anim, renderer = gifski_renderer())
ranks <-
  madagascar_malaria %>% 
    distinct(ID, .keep_all = TRUE) %>% 
    mutate(case_rate_rank = rank(case_rate)) %>% 
    select(ID, case_rate_rank)


madagascar_malaria %>% 
  left_join(ranks) %>% 
  ggplot(aes(x = LSTmean, y = case_rate_rank, colour = case_rate))  +
    geom_point() +
    scale_colour_viridis_c(option = 'E')
ranks <-
  madagascar_malaria %>% 
    distinct(ID, .keep_all = TRUE) %>% 
    mutate(case_rate_rank = rank(case_rate)) %>% 
    select(ID, case_rate_rank)


madagascar_malaria %>% 
  left_join(ranks) %>% 
  ggplot(aes(y = LSTmean, x = case_rate_rank, colour = case_rate))  +
    geom_point() +
    scale_colour_viridis_c(option = 'E') +
    geom_smooth()
madagascar_malaria %>% 
  group_by(ID) %>% 
  summarise(
    `0.05quant` = quantile(LSTmean, 0.05),
    `0.25quant` = quantile(LSTmean, 0.25),
     `0.5quant` = median(LSTmean),
    `0.75quant` = quantile(LSTmean, 0.75),
    `0.95quant` = quantile(LSTmean, 0.95),
    case_rate = mean(case_rate)) %>% 
  ungroup %>% 
  select(-ID) %>%
  pivot_longer(-case_rate) %>% 
  ggplot(aes(value, case_rate)) + 
    facet_wrap(~ name, scale = 'free') +
    geom_point() + 
    geom_smooth()


timcdlucas/agouti documentation built on Feb. 8, 2024, 6:12 p.m.