In diazrenata/sadspace: Exploring SAD space

# Hide chunk source code in the rendered report. `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
knitr::opts_chunk$set(echo = FALSE)
library(drake)
library(sadspace)
library(dplyr)
library(ggplot2)
library(ggpmisc)

# Load the precomputed tables from the project's analysis/ directory.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
# di_df <- read.csv(here::here("analysis", "di_dfs.csv"), stringsAsFactors = FALSE)
fs_size_dat <- read.csv(
  here::here("analysis", "fs_size_dat.csv"),
  stringsAsFactors = FALSE
)
di_summary_df <- read.csv(
  here::here("analysis", "di_summary_df.csv"),
  stringsAsFactors = FALSE
)

Here is the range of S and N space covered, with each point colored by the log of nparts (the number of elements in the FS):

# Sampled (S, N) combinations on log axes, one point per row of fs_size_dat,
# colored by log_nparts.
ggplot(
  data = fs_size_dat,
  mapping = aes(x = log(s0), y = log(n0), colour = log_nparts)
) +
  geom_point() +
  scale_colour_viridis_c(option = "plasma") +
  theme_bw() +
  theme(legend.position = "top")

The color scale is logarithmic, so the value up in the upper-right corner corresponds to 1.942426e+130 on the raw scale.

# log_nparts against log(N/S), drawn as one line per distinct value of
# log(s0) and colored by log(s0).
ggplot(
  data = fs_size_dat,
  mapping = aes(
    x = log(n0 / s0),
    y = log_nparts,
    group = as.factor(log(s0)),
    colour = log(s0)
  )
) +
  geom_line() +
  scale_colour_viridis_c() +
  theme_bw() +
  theme(legend.position = "top")

Note that the number of elements would continue to increase along each of these lines, except we hit the edge of the sampling space.

Here is the number of unique FS found, and the number of unique FS over the number that exist. Note that the number of draws was 5,000, so the maximum possible log(nunique) would be log(5000) ≈ 8.52. FS that found (essentially) 5,000 unique draws, i.e. those with log(nunique) ≥ 8.5, are covered with black dots in this plot:

# Sampled (S, N) combinations on log axes, colored by log_nunique.
ggplot(
  data = fs_size_dat,
  mapping = aes(x = log(s0), y = log(n0), colour = log_nunique)
) +
  geom_point() +
  # Overplot in black the rows whose log_nunique is at least 8.5; that cutoff
  # sits just below log(5000) (about 8.517), the cap implied by 5,000 draws.
  geom_point(
    data = fs_size_dat %>% filter(log_nunique >= 8.5),
    colour = "black"
  ) +
  scale_colour_viridis_c(option = "plasma", end = 0.9) +
  theme_bw() +
  theme(legend.position = "top")

Here is the number of unique samples found relative to the number possible. FS that found all the elements possible are covered with green dots:

# Sampled (S, N) combinations on log axes, colored by
# log_nunique - log_nparts (the log of the ratio nunique / nparts).
ggplot(
  fs_size_dat,
  aes(log(s0), log(n0), color = log_nunique - log_nparts)
) +
  geom_point() +
  # Overplot in green the rows where the two logs agree. They are doubles read
  # back from a CSV, so compare with a tolerance (dplyr::near) rather than
  # `==`, which can miss matches that differ only by floating-point rounding.
  geom_point(
    data = filter(fs_size_dat, near(log_nunique, log_nparts)),
    color = "green"
  ) +
  scale_color_viridis_c(
    option = "plasma",
    end = 0.8,
    begin = 0.1,
    direction = -1
  ) +
  theme_bw() +
  theme(legend.position = "top")

Skewness behavior

Both skewness and the variability in skewness decrease with N/S, and increase with S. For a given S, a higher N/S (so higher N) will have lower skew and lower variability.

I don't think it matters very much whether you look at mean or median/range or sd.

# Rows shared by every skewness plot below: s0 > 3 with finite mean_skew and
# sd_skew. is.finite() is FALSE for NA, NaN, Inf and -Inf, so it covers both
# the !is.infinite() and !is.na() checks. Computed once rather than repeating
# the same filter inside each plot.
skew_plot_dat <- di_summary_df %>%
  filter(s0 > 3, is.finite(mean_skew), is.finite(sd_skew))

# Mean skewness across (S, N) space.
ggplot(skew_plot_dat, aes(log(s0), log(n0), color = mean_skew)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# Mean skewness against log_nparts, one line per value of log(s0); the
# vertical reference line marks log_nparts = 10.
ggplot(
  skew_plot_dat,
  aes(log_nparts, mean_skew, color = log(n0), group = log(s0))
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")



# Standard deviation of skewness across (S, N) space.
ggplot(skew_plot_dat, aes(log(s0), log(n0), color = sd_skew)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# Standard deviation of skewness against log_nparts, one line per log(s0).
ggplot(
  skew_plot_dat,
  aes(log_nparts, sd_skew, color = log(n0), group = log(s0))
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")



# |sd / mean| of skewness across (S, N) space.
ggplot(
  skew_plot_dat,
  aes(log(s0), log(n0), color = abs(sd_skew / mean_skew))
) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# |sd / mean| of skewness against log_nparts, one line per log(s0).
ggplot(
  skew_plot_dat,
  aes(log_nparts, abs(sd_skew / mean_skew), color = log(n0), group = log(s0))
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")

Simpson behavior

# Rows shared by every Simpson plot below: s0 >= 2 with finite mean_simpson
# and sd_simpson. is.finite() is FALSE for NA, NaN, Inf and -Inf, so it covers
# both the !is.infinite() and !is.na() checks. Computed once rather than
# repeating the same filter inside each plot.
simpson_plot_dat <- di_summary_df %>%
  filter(s0 >= 2, is.finite(mean_simpson), is.finite(sd_simpson))

# Mean Simpson value across (S, N) space.
ggplot(simpson_plot_dat, aes(log(s0), log(n0), color = mean_simpson)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# Mean Simpson value against log_nparts, one line per value of log(s0); the
# vertical reference line marks log_nparts = 10.
ggplot(
  simpson_plot_dat,
  aes(log_nparts, mean_simpson, color = log(n0), group = log(s0))
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")



# Standard deviation of the Simpson value across (S, N) space.
ggplot(simpson_plot_dat, aes(log(s0), log(n0), color = sd_simpson)) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# Standard deviation of the Simpson value against log_nparts, one line per
# log(s0). The y aesthetic is written without the stray parentheses so the
# axis label reads "sd_simpson", matching the corresponding skewness plot.
ggplot(
  simpson_plot_dat,
  aes(log_nparts, sd_simpson, color = log(n0), group = log(s0))
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")


# |sd / mean| of the Simpson value across (S, N) space.
ggplot(
  simpson_plot_dat,
  aes(log(s0), log(n0), color = abs(sd_simpson / mean_simpson))
) +
  geom_point() +
  theme_bw() +
  scale_color_viridis_c(option = "plasma") +
  theme(legend.position = "top")

# |sd / mean| of the Simpson value against log_nparts, one line per log(s0).
ggplot(
  simpson_plot_dat,
  aes(
    log_nparts,
    abs(sd_simpson / mean_simpson),
    color = log(n0),
    group = log(s0)
  )
) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = 10) +
  scale_color_viridis_c() +
  theme(legend.position = "top")

Overlap with datasets represented in `scadsanalysis`

# Load the all_di.csv table from the project's analysis/ directory.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
sa <- read.csv(here::here("analysis", "all_di.csv"), stringsAsFactors = FALSE)

# Keep only the rows whose `singletons` flag is FALSE.
sa_sv <- sa %>%
  filter(!singletons)

# Overlay on log axes: fs_size_dat points in blue, then sa_sv points in red
# drawn on top. The second layer inherits the x/y mapping, so sa_sv must
# provide s0 and n0 columns.
ggplot(data = fs_size_dat, mapping = aes(x = log(s0), y = log(n0))) +
  geom_point(colour = "blue", alpha = 0.5) +
  geom_point(data = sa_sv, colour = "red", alpha = 0.5) +
  theme_bw()