In diazrenata/sadspace: Exploring SAD space

knitr::opts_chunk$set(echo = F)
library(drake)
library(sadspace)
library(dplyr)
library(ggplot2)
library(ggpmisc)

if(params$on_hpg) {
  diffs <- read.csv(here::here("analysis", "diff_dfs.csv"), stringsAsFactors = F)
  diffs_summary <- read.csv(here::here("analysis", "diff_summary.csv"), stringsAsFactors = F)

  diffs <- diffs %>%
    filter(nparts >1, s0 > 2, !is.na(sim1), !is.na(sim2), !is.infinite(r2), !is.infinite(r2_log)) %>%
    mutate(community_name = paste0(s0, "_", n0)) %>%
    group_by_all() %>%
    mutate(low_sim = min(sim1, sim2),
           high_sim = max(sim1, sim2)) %>%
    ungroup() %>%
    group_by(low_sim, high_sim, community_name) %>%
    mutate(dup = row_number()) %>%
    ungroup() %>%
    filter(dup == 1)


  diffs_summary <- diffs %>%
    mutate(sim_name = paste0(sim1, "_", sim2)) %>%
    group_by(community_name, s0, n0, nparts) %>%
    summarize(ncomparisons = n(),
              mean_r2 = mean(r2),
              mean_r2_log = mean(r2_log),
              mean_cd = mean(cd),
              mean_prop_off = mean(prop_off),
              mean_div = mean(div)) %>%
    ungroup() %>%
    mutate(lognparts = log(as.numeric(nparts)))

  write.csv(diffs, here::here("analysis", "diffs_handled.csv"), row.names = F)

  write.csv(diffs_summary, here::here("analysis", "diff_summary.csv"), row.names = F)
} else {
  diffs <- read.csv(here::here("analysis", "diffs_handled.csv"), stringsAsFactors = F)
  diffs_summary <- read.csv( here::here("analysis", "diff_summary.csv"), stringsAsFactors = F)
}

Number of comparisons

ggplot(diffs_summary, aes(lognparts, ncomparisons)) +
  geom_point() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  xlim(0, 50) +
  geom_hline(yintercept = c(100,  250, 500))

ggplot(diffs_summary, aes(log(s0), log(n0), color = ncomparisons)) +
  geom_point() +
  scale_color_viridis_c() +
  theme_bw()


ggplot(filter(diffs_summary, ncomparisons < 100), aes((s0), log(n0), color = ncomparisons)) +
  geom_point() +
  scale_color_viridis_c() +
  theme_bw()


ggplot(filter(diffs_summary), aes((s0), log(n0), color = log(n0/s0), shape = log(n0/s0) <= .2)) +
  geom_point() +
  scale_color_viridis_c() +
  theme_bw()

The region of interest is down where lognparts < 10. These FS also have fewer comparisons (because they are smaller). We will look at results summarized from all comparisons and from subsampled to an equal, small number of comparisons for all communities. We can use the subsampled diffs to look at density plots, etc as well.

All comparisons:

ggplot(filter(diffs_summary), aes(log(s0), log(n0), color = mean_r2, shape = lognparts >= 10)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(limits = c(-1, 1))


ggplot(filter(diffs_summary), aes(log(s0), log(n0), color = mean_r2_log, shape = lognparts >= 10)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(limits = c(-2.5, 1))

ggplot(diffs_summary, aes(log(s0), log(n0), color = mean_cd, shape = lognparts >= 10)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c()


ggplot(diffs_summary, aes(log(s0), log(n0), color = mean_prop_off, shape = lognparts >= 10)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c()


ggplot(diffs_summary, aes(log(s0), log(n0), color = mean_div, shape = lognparts >= 10)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c()

diffs_small <- diffs %>%
  mutate(sim_name = paste0(sim1, "_", sim2)) %>%
  group_by(community_name, s0, n0, nparts) %>%
  mutate(sim_index = row_number(),
         ncomparison = n()) %>%
  ungroup() %>%
  filter(sim_index <= 100, ncomparison >= 100)  %>%
  mutate(lognparts = log(as.numeric(nparts)))

diffs_small_summary <- diffs_small %>%
  group_by(community_name, s0, n0, nparts, ncomparison, lognparts) %>%
  summarize(ncomparison_included = n(),
            mean_r2 = mean(r2),
            mean_r2_log = mean(r2_log),
            mean_cd = mean(cd),
            mean_prop_off = mean(prop_off),
            mean_div = mean(div)) %>%
  ungroup()

Not shown, but the r2 relationships (log and not) get more off for very low values of mean r2 and mean r2 (log). Very low meaning, r2 < 0 and r2_log < -1.

summary_comparison <- diffs_small_summary %>%
  select(community_name, lognparts, s0, n0, mean_r2, mean_r2_log, mean_cd, mean_prop_off, mean_div) %>%
  left_join(select(diffs_summary, community_name, s0, n0, mean_r2, mean_r2_log, mean_cd, mean_prop_off, mean_div), by = c("community_name", "s0", "n0"))

ggplot(summary_comparison, aes(mean_r2.x, mean_r2.y, color = lognparts)) +
  geom_point() +
  theme_bw() + scale_color_viridis_c(limits = c(0, 10)) +
  xlim(0, 1) +
  ylim(0, 1)

ggplot(summary_comparison, aes(mean_r2_log.x, mean_r2_log.y, color = lognparts)) +
  geom_point() +
  theme_bw() + scale_color_viridis_c(limits = c(0, 10)) +
  xlim(-1, 1) +
  ylim(-1, 1) 

ggplot(summary_comparison, aes(mean_cd.x, mean_cd.y, color = lognparts)) +
  geom_point() +
  theme_bw() + scale_color_viridis_c(limits = c(0, 10)) +
  xlim(0.5,1) +
  ylim(0.5,1)


ggplot(summary_comparison, aes(mean_prop_off.x, mean_prop_off.y, color = lognparts)) +
  geom_point() +
  theme_bw() + scale_color_viridis_c(limits = c(0, 10)) +
  xlim(0, .25) +
  ylim(0, .25)


ggplot(summary_comparison, aes(mean_div.x, mean_div.y, color = lognparts)) +
  geom_point() +
  theme_bw() + scale_color_viridis_c(limits = c(0, 10)) +
  xlim(0, .2) +
  ylim(0, .2)

The summary stats from the subsampled comparisons are closely related to the non-subsampled comparisons, but there is motion when we subsample.

Subsampling allows us to plot the density plots for these various stats...

R2

For R2, I have filtered out sim comparisons where r2 < -1. r2 gets as low as -8000, but most of the variation we are interested in is in the 0-1 range.

These extreme r2 values make it impossible to see what we are looking for in these plots. They are rare and concentrated in the very smallest communities. Excluding them, if anything, makes the density estimates for those communities more pointed than they would be if they were included. I have re-subsampled the remaining draws so there are the same number of comparisons represented for every community.

Other cutoffs, including 0, don't change the qualitative outcome.

tinyr2_cutoff = -1
tinyr2_log_cutoff = -2

diffs_small <- mutate(diffs_small, more_than_10 = lognparts >= 10,
                      tinyr2 = r2 <= tinyr2_cutoff, 
                      tinyr2_log = r2_log <= tinyr2_log_cutoff) %>%
  group_by(community_name) %>%
  mutate(prop_tinyr2 = mean(tinyr2),
         prop_tinyr2_log = mean(tinyr2_log)) %>%
  ungroup()

ggplot(filter(diffs_small, prop_tinyr2 > 0), aes(log(s0), log(n0), color = prop_tinyr2)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c()

ggplot(distinct(select(diffs_small, community_name, lognparts, prop_tinyr2)), aes(x = lognparts, y = prop_tinyr2)) +
  geom_point() +
  theme_bw() +
  geom_vline(xintercept = 10)

diffs_small_r2s <- filter(diffs_small, r2 > tinyr2_cutoff) %>%
  group_by(community_name) %>%
  mutate(sim_index = row_number(),
         ncomparison_kept = n()) %>%
  ungroup() %>%
  filter(sim_index <= min(ncomparison_kept))

ggplot(diffs_small_r2s, aes(x = r2, group = community_name, color = lognparts)) +
  geom_density(size = .1) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  facet_wrap(vars(more_than_10), scales = "fixed")

Larger FS are dramatically more concentrated with high r2.

R2 with log

Similarly, I've removed draws where r2_log <= -2. Changing the cutoff doesn't change the outcome...

ggplot(filter(diffs_small, prop_tinyr2_log > 0), aes(log(s0), log(n0), color = prop_tinyr2_log)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c()

ggplot(distinct(select(diffs_small, community_name, lognparts, prop_tinyr2_log)), aes(x = lognparts, y = prop_tinyr2_log)) +
  geom_point() +
  theme_bw() +
  geom_vline(xintercept = 10)

diffs_small_r2_logs <- filter(diffs_small, r2_log > tinyr2_log_cutoff) %>%
  group_by(community_name) %>%
  mutate(sim_index = row_number(),
         ncomparison_kept = n()) %>%
  ungroup() %>%
  filter(sim_index <= min(ncomparison_kept))

ggplot(diffs_small_r2_logs, aes(x = r2_log, group = community_name, color = lognparts)) +
  geom_density(size = .1) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  facet_wrap(vars(more_than_10), scales = "fixed")

...which is, a concentration at higher r2 for larger FS.

CD

This is the coefficient of determination from a lm fitting one element of a FS to another. It is bounded 0-1.

ggplot(diffs_small, aes(x = cd, group = community_name, color = lognparts)) +
  geom_density(size = .1) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  facet_wrap(vars(more_than_10), scales = "fixed")

Again!

Proportion off

This is the proportion of individuals who are allocated differently in two FS. It is bounded 0-1.

ggplot(filter(diffs_small, log(n0/s0) >= .2), aes(x = prop_off, group = community_name, color = lognparts)) +
  geom_density(size = .1) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  facet_wrap(vars(more_than_10), scales = "fixed")

Again! This time, the large FS have fewer individuals allocated differently (proportional to the number of individuals in the community)

K-L divergence

This is an estimate of the Kullbak-Lieber (sp) divergence between two elements of the FS.

ggplot(filter(diffs_small, log(n0/s0) >= .2), aes(x = div, group = community_name, color = lognparts)) +
  geom_density(size = .1) +
  theme_bw() +
  scale_color_viridis_c(option = "plasma", end = .9) +
  facet_wrap(vars(more_than_10), scales = "fixed")