knitr::opts_chunk$set(echo = FALSE) library(dplyr) library(scads) library(ggplot2) dataset <- "plants"
if(dataset == "plants") { portal_sad <- portalr::plant_abundance(level = "Treatment", type = "Winter Annuals", plots = "All", unknowns = F, correct_sp = T, shape = "flat", min_quads = 16) %>% filter(treatment == "control") %>% select(year, species, abundance) %>% filter(year == 1994) %>% select(-year) %>% rename(abund = abundance) %>% select(abund) %>% arrange(abund) %>% as.matrix() %>% as.vector() nleg <- 10 } else if(dataset == "rodents94") { portal_data <- isds::get_toy_portal_data(download=T, years = c(1994)) portal_sad <- portal_data %>% select(species) %>% group_by(species) %>% summarize(abund = n()) %>% ungroup() %>% select(abund) %>% arrange(abund) %>% as.matrix() %>% as.vector() nleg <- 5 } else { portal_data <- isds::get_toy_portal_data(download=T, years = c(1990:1995)) portal_sad <- portal_data %>% select(species) %>% group_by(species) %>% summarize(abund = n()) %>% ungroup() %>% select(abund) %>% arrange(abund) %>% as.matrix() %>% as.vector() nleg <- 8 } nspp = length(portal_sad) nind = sum(portal_sad) portal_sad <- data.frame( abund = portal_sad, source = "emp", rank = 1:length(portal_sad) )
fs_bank <- sample_feasibleset(s = nspp, n = nind, nsamples = 10000, distinct = TRUE) save(fs_bank, file = "fs_bank_94.Rds")
if(dataset == "plants") { load(here::here("fs_bank_plants.Rds")) nametoprint <- "Portal winter annuals 1994, 10000 draws" } else if(dataset == "rodents94") { load(here::here("fs_bank_94.Rds")) nametoprint <- "Portal control rodents 1994, 10000 draws" } else if(dataset == "small") { load(here::here("fs_bank_small.Rds")) nametoprint <- "Portal control rodents 1990-95, 100 draws" } else { load(here::here("fs_bank.Rds")) nametoprint <- "Portal control rodents 1990=95, 10000 draws" }
fs_bank_M <- fs_bank %>% as.data.frame() %>% mutate(rank = row_number()) %>% tidyr::gather(-rank, key = "sim", value = "abund") %>% mutate(sim = as.integer(substr(sim, 2, nchar(sim))))
r nametoprint
.The y-axes are abundance (on the left) and relative abundance (on the right). Each black dot is an abundance value from a vector drawn from the feasible set. The red line plots the distribution from Portal.
The black dots are semi-transparent, which makes it a little easier to see the density distribution.
The rescaled vectors (on the right) are what go into Legendre approximation. In this case, these plots should look identical, because all the draws from the feasible set have the same number of individuals as the Portal vector. I have at other times compared SADs without the total abundance constraint, where the rescaled plots could look quite different.
fs_dp <- ggplot(data = fs_bank_M, aes(x =as.factor(rank), y = abund)) + geom_point(alpha = .2, size = 1) + geom_line(data = portal_sad, aes(x = rank, y = abund), alpha = .5, color = "red") + theme_bw() + ggtitle("Abundance: draws from FS vs Portal") fs_bank_M <- fs_bank_M %>% mutate(sabund = abund / nind) portal_sad <- portal_sad %>% mutate(sabund = abund/nind) fs_sdp <- ggplot(data = fs_bank_M, aes(x =as.factor(rank), y = sabund)) + geom_point(alpha = .2, size = 1) + geom_line(data = portal_sad, aes(x = rank, y = sabund), alpha = .5, color = "red") + theme_bw() + ggtitle("Abundance: draws from FS vs Portal, rescaled") gridExtra::grid.arrange(grobs = list(fs_dp, fs_sdp), nrow = 1)
fs_leg_coeffs <- apply(as.matrix(1:ncol(fs_bank)), MARGIN = 1, FUN = function(sim_index, fs_bank_M) return(legendre_approx(filter(fs_bank_M, sim == sim_index)$sabund, nleg = nleg)$coefficients), fs_bank_M = fs_bank_M) portal_leg_coeffs <-data.frame(coeff_value = legendre_approx(portal_sad$sabund, nleg = nleg)$coefficients) %>% mutate(coeff_name = names(legendre_approx(portal_sad$sabund, nleg = nleg)$coefficients), coeff_index = row_number()) fs_leg_coeffs_M <- fs_leg_coeffs %>% as.data.frame() %>% mutate(coeff_name = row.names(fs_leg_coeffs)) %>% tidyr::gather(-coeff_name, key = "sim", value = "coeff_value") %>% mutate(sim = as.integer(substr(sim, 2, nchar(sim)))) %>% left_join(select(portal_leg_coeffs, coeff_index, coeff_name), by = "coeff_name") coeffs_plot <- ggplot(data = fs_leg_coeffs_M, aes(x = coeff_index, y = coeff_value)) + geom_point(alpha = .2, size = 1) + geom_line(data = portal_leg_coeffs, aes(x = coeff_index, y = coeff_value), alpha = .5, color = "red") + theme_bw()
The y-axis is the value of the coefficient for each Legendre polynomial. Each black dot is a value for a draw from the feasible set.
The green line marks the centroid of the coefficients for the draws from the feasible set. The red line marks the coefficients for the Portal vector.
fs_centroid <- fs_leg_coeffs_M %>% group_by(coeff_name) %>% summarize(centroid_value = mean(coeff_value)) %>% ungroup() fs_leg_coeffs_M <- fs_leg_coeffs_M %>% left_join(fs_centroid, by = "coeff_name") portal_leg_coeffs <- portal_leg_coeffs %>% left_join(fs_centroid, by = "coeff_name") coeffs_c_plot <- coeffs_plot + geom_line(data = portal_leg_coeffs, aes(x = coeff_index, y = centroid_value), color = "green") coeffs_c_plot
Here we are reconstructing the scaled SAD from the approximated coefficients. The y-axis is the scaled abundance or estimated scaled abundance. The hollow points are estimates from approximation with r nleg
polynomials, and the stars are the true values.
There is no true value for the centroid, but closest_fs
is the element of the feasible set with the lowest euclidean distance between its coefficients and the centroid coefficients. farthest_fs
is the element of the feasible set with the largest distance between its coefficients and the centroid coefficients.
fs_dist <- fs_leg_coeffs_M %>% group_by(sim) %>% summarize(centroid_dist = eucl_rows(coeff_value, centroid_value)) %>% ungroup() %>% arrange(centroid_dist) %>% mutate(sim_rank = row_number()) estimates_df <- data.frame( rank = c(1:nspp), centroid = legendre_generate(coeff_values = portal_leg_coeffs$centroid_value, nleg = nleg, nspp = nspp), closest_fs = legendre_generate(coeff_values = filter(fs_leg_coeffs_M, sim == filter(fs_dist, sim_rank == 1)$sim)$coeff_value, nleg = nleg, nspp = nspp), farthest_fs = legendre_generate(coeff_values = filter(fs_leg_coeffs_M, sim == filter(fs_dist, sim_rank == max(fs_dist$sim_rank))$sim)$coeff_value, nleg = nleg, nspp = nspp), portal = legendre_generate(coeff_values = portal_leg_coeffs$coeff_value, nleg = nleg, nspp = nspp)) %>% tidyr::gather(-rank, key = "source", value = "est_abund") true_sads <- filter(fs_bank_M, sim %in% c(filter(fs_dist, sim_rank == 1)$sim, filter(fs_dist, sim_rank == max(fs_dist$sim_rank))$sim)) %>% left_join(select(fs_dist, sim_rank, sim), by = "sim") %>% mutate(source = ifelse((sim_rank == 1), "closest_fs", "farthest_fs")) %>% select(-sim, -sim_rank) %>% bind_rows(mutate(portal_sad, source = "portal")) %>% left_join(estimates_df, by = c("rank", "source")) %>% bind_rows(data.frame(source = "centroid", stringsAsFactors = F)) ssqes <- data.frame( source = c("centroid", "closest_fs", "farthest_fs", "portal"), ssqe = c(NA, ssqe(filter(true_sads, source == "closest_fs")$est_abund, filter(true_sads, source == "closest_fs")$sabund), ssqe(filter(true_sads, source == "farthest_fs")$est_abund, filter(true_sads, source == "farthest_fs")$sabund), ssqe(filter(true_sads, source == "portal")$est_abund, filter(true_sads, source == "portal")$sabund)) ) estimates_plot <- ggplot(data = estimates_df, aes(x = rank, y = est_abund, color = source)) + geom_point(shape = 1) + geom_point(data = true_sads, aes(x = rank, y = sabund, color = source), shape = 8) + geom_label(data = ssqes, aes(x = 4, y = c(.95, .87, .8, .73), label = paste0("ssqe: ", format(signif(ssqe, 2), scientific = FALSE)), color = source), show.legend = FALSE) + scale_colour_viridis_d(end = .8, option = "C") + theme_bw() + ylim(0, 1) estimates_plot
Violin/density plot of the Euclidean distance between the centroid coefficients and the coefficients for draws from the feasible set. The violin is for all draws from the feasible set. The black dots are individual draws. The colored dots are for the closest and farthest (from the centroid) elements of the feasible set, and for Portal.
all_dist <- fs_dist %>% mutate(source1 = "fs") %>% mutate(source = NA) %>% mutate(source = ifelse(centroid_dist == max(fs_dist$centroid_dist), "farthest_fs", source)) %>% mutate(source = ifelse(centroid_dist == min(fs_dist$centroid_dist), "closest_fs", source)) %>% bind_rows(data.frame( source = "portal", source1 = "portal", sim = NA, sim_rank = NA, centroid_dist = eucl_rows(portal_leg_coeffs$coeff_value, portal_leg_coeffs$centroid_value), stringsAsFactors = F )) centroid_dist_plot <- ggplot(data = filter(all_dist, source1 != "portal"), aes(x = 0, y = centroid_dist)) + geom_violin() + geom_point(alpha = .1, size = .5) + geom_point(data = filter(all_dist, !is.na(source)), aes(x = 0, y = centroid_dist, color = source), size = 2) + theme_bw()+ scale_colour_viridis_d(begin = .2, end = .8, option = "C") centroid_dist_plot
fs_bank_M <- left_join(fs_bank_M, select(fs_dist, sim, centroid_dist), by = "sim") dist_quantiles <- quantile(fs_bank_M$centroid_dist, probs = c(0, .75, .85, .95, .975, .99, 1)) fs_bank_M <- fs_bank_M %>% mutate(qbin = vapply(as.matrix(fs_bank_M$centroid_dist), FUN = function(c_d, d_q) return((as.integer(substr(names(d_q)[max(which(d_q <= c_d))], 1, nchar(names(d_q)[max(which(d_q <= c_d))]) - 1))) / 100), d_q = dist_quantiles, FUN.VALUE = .1)) # fs_heatmap <- ggplot(data = fs_bank_M, aes(x = as.factor(qbin), y = sabund, color = qbin)) + # geom_point(size = .5, alpha = .1) + # geom_point(data = filter(fs_bank_M, qbin > .95), size = 1, alpha = 1) + # scale_colour_viridis_c(option = "plasma", end = .8) + # theme_minimal() + # theme(panel.background = element_rect(fill = "white"), # legend.position = "top", # panel.border = element_rect(fill = NA, colour = "black"), # panel.spacing.x = unit(0,"line")) + # facet_grid(cols = vars(rank), switch = "x") # fs_heatmap
fs_bank_linecloud <- fs_bank_M %>% select(sim, centroid_dist) %>% distinct() %>% arrange(desc(centroid_dist)) %>% mutate(dist_rank = row_number()) %>% right_join(fs_bank_M, by =c("sim", "centroid_dist")) %>% mutate(qbin_alpha = ifelse(qbin <= .75, .75, qbin)) linecloud <- ggplot(data = fs_bank_linecloud, aes(x = rank, y = sabund, color = as.factor(qbin), alpha = qbin_alpha, group = sim)) + geom_line(size = .1, show.legend = FALSE) + geom_line(data = filter(fs_bank_linecloud, dist_rank <= 3), aes(x = rank, y = sabund, color = as.factor(qbin)), alpha = 1, size = .6) + scale_colour_viridis_d(option = "plasma", end = .8) + theme_bw() + theme(legend.position = "top") linecloud
fs_coeffs_cloud <- left_join(fs_leg_coeffs_M, distinct(select(fs_bank_linecloud, sim, qbin, qbin_alpha, dist_rank)), by = "sim") fs_coeffs_linecloud <- ggplot(data = fs_coeffs_cloud, aes(x = coeff_index, y = coeff_value, color = as.factor(qbin), alpha = qbin_alpha, group = sim)) + geom_line(size = .1, show.legend = FALSE) + geom_line(data = filter(fs_coeffs_cloud, dist_rank <= 3), aes(x = coeff_index, y = coeff_value, color = as.factor(qbin)), alpha = 1, size = .6) + scale_colour_viridis_d(option = "plasma", end = .8) + theme_bw() + scale_x_continuous(breaks = unique(fs_coeffs_cloud$coeff_index), labels = unique(fs_coeffs_cloud$coeff_name)) + theme(legend.position = "top") fs_coeffs_linecloud
The y axis is scaled abundance. Each line marks one draw from the feasible set. The colors (qbin
) map the percentile of the Euclidean distance between the draw's coefficients and the centroid of the coefficients. Purpley-pink colors are closer to the centroid; orange to yellow are far. The 3 farthest draws are marked with thicker and less transparent lines. The red dots are the values from Portal.
emp_linecloud <- linecloud + geom_point(data = mutate(portal_sad, sim = 1, qbin = 0, qbin_alpha = 1), aes(x = rank, y = sabund), color = "#FF0000", show.legend = F) emp_linecloud
The y-axis is coefficient values, and the x axis is each coefficient. As above, each line marks the coefficients for a single draw, and lines are shaded according to the distance to the centroid. The red dots are the coefficients for Portal.
emp_coeff_linecloud <- fs_coeffs_linecloud + geom_point(data = mutate(portal_leg_coeffs, sim = 1, qbin = 0, qbin_alpha = 1), aes(x =coeff_index, y= coeff_value), color = "#FF0000", show.legend = F) emp_coeff_linecloud
It seems possible that the Legendre coefficients might just reproduce the same information that is in the abundance values. So I calculated a few things based on the centroid of the scaled abundance values rather than the centroid of the coefficients.
The yellow line is the centroid of the scaled abundance values (not the coefficients). The estimate from the centroid of the coefficients (black line and points) almost exactly matches.
raw_abund_centroid <- fs_bank_M %>% select(rank, sim, sabund) %>% group_by(rank) %>% summarize(abund = mean(sabund)) %>% ungroup() %>% mutate(source = "raw_abund_centroid") %>% bind_rows(rename(estimates_df, abund = est_abund)) raw_centroid <- ggplot(data = raw_abund_centroid, aes(x = rank, y = abund, color = source)) + geom_point() + geom_line() + theme_bw() + scale_color_viridis_d(option = "viridis") raw_centroid
Each dot is draw from the feasible set. The x axis is the distance from the centroid of the abundance values. The y axis is the distance from the centroid of the coefficients. The yellow line is 1:1. The centroid values are generally smaller than the abundance values.
fs_raw_dist <- fs_bank_M %>% select(rank, sim, sabund) %>% group_by(sim) %>% summarize(rawdist = eucl_rows(sabund, filter(raw_abund_centroid, source == "raw_abund_centroid")$abund)) %>% ungroup() %>% arrange(rawdist) %>% mutate(raw_sim_rank = row_number()) %>% left_join(fs_dist, by = "sim") raw_coeff_plot <- ggplot(data = fs_raw_dist, aes(x = rawdist, y = centroid_dist)) + geom_point() + geom_line(data = data.frame(x = seq(min(c(fs_raw_dist$rawdist, fs_raw_dist$centroid_dist)), max(c(fs_raw_dist$rawdist, fs_raw_dist$centroid_dist)), length.out = 10)), aes(x = x, y = x), color = "orange") + theme_bw() raw_coeff_plot
Each dot is draw from the feasible set. The x axis is the rank of a draw's distance from the centroid of the abundance values. The y axis is the rank of that draw's distance from the centroid of the coefficients. The yellow line is 1:1. There's a relationship, but a lot of scatter.
raw_coeff_rank_plot <- ggplot(data = fs_raw_dist, aes(x = raw_sim_rank, y = sim_rank)) + geom_point() + geom_line(data = data.frame(x = seq(min(c(fs_raw_dist$raw_sim_rank, fs_raw_dist$sim_rank)), max(c(fs_raw_dist$raw_sim_rank, fs_raw_dist$sim_rank)), length.out = 10)), aes(x = x, y = x), color = "orange") + theme_bw() raw_coeff_rank_plot
Do the coefficients, or the coefficient-centroid distances, recapitulate more longstanding metrics? I tried evar and skewness.
Higher values indicate more even vectors; note the color scale is reversed. (Based on exploration, I expect lower evenness values to be at the edges). The red dots are the Portal vector. evar
for the Portal vector is r conditionalsads::evar(portal_sad$sabund)
.
fs_bank_mets <- fs_bank_M %>% group_by(sim) %>% mutate(evar = conditionalsads::evar(sabund), skew = conditionalsads::rad_skew(sabund)) %>% ungroup() %>% mutate(skew_rounded = round(skew, digits = 4)) skr <- fs_bank_mets %>% select(sim, skew) %>% distinct() %>% arrange(skew) %>% mutate(skew_rank = row_number()) fs_bank_mets <- left_join(fs_bank_mets, skr, by = c("sim", "skew")) evar_plot <- ggplot(data = fs_bank_mets, aes(x = rank, y = sabund, color = evar, group = sim)) + geom_line(size = .1, alpha = .2) + geom_line(data = filter(fs_bank_mets, evar <= quantile(fs_bank_mets$evar, probs = .003)[1]), alpha = .5, size = .5) + scale_colour_viridis_c(option = "plasma", end = .8, direction = -1) + geom_point(data = mutate(portal_sad, sim = 1, qbin = 0, qbin_alpha = 1), aes(x = rank, y = sabund), color = "#FF0000", show.legend = F) + theme_bw() + theme(legend.position = "top") evar_plot
The y axis is abundance; colors correspond to skewness. The most and least skewed vectors are highlighted with thicker, less transparent lines. The red dots mark the Portal vector.
skew_plot <- ggplot(data = fs_bank_mets, aes(x = rank, y = sabund, color = skew, group = sim)) + geom_line(size = .1, alpha = .2) + geom_line(data = filter(fs_bank_mets, skew <= quantile(fs_bank_mets$skew, probs = .001)[1]), alpha = .5, size = .3) + geom_line(data = filter(fs_bank_mets, skew >= quantile(fs_bank_mets$skew, probs = 0.999)[1]), alpha = .5, size = .3) + scale_colour_viridis_c(option = "plasma", end = .8) + theme_bw() + theme(legend.position = "top") + geom_point(data = mutate(portal_sad, sim = 1, qbin = 0, qbin_alpha = 1), aes(x = rank, y = sabund), color = "#FF0000", show.legend = F) skew_plot skew_plot2 <- ggplot(data = fs_bank_mets, aes(x = rank, y = sabund, color = skew_rounded, group = sim)) + geom_line(size = .1, alpha = .2) + geom_line(data = filter(fs_bank_mets, skew <= quantile(fs_bank_mets$skew, probs = .001)[1]), alpha = .5, size = .3) + geom_line(data = filter(fs_bank_mets, skew >= quantile(fs_bank_mets$skew, probs = 0.999)[1]), alpha = .5, size = .3) + scale_colour_viridis_c(option = "plasma", end = .8) + theme_bw() + theme(legend.position = "top") + geom_point(data = mutate(portal_sad, sim = 1, qbin = 0, qbin_alpha = 1), aes(x = rank, y = sabund), color = "#FF0000", show.legend = F) skew_plot2 skew_plot3 <- ggplot(data = fs_bank_mets, aes(x = rank, y = sabund, color = skew_rank, group = sim)) + geom_line(size = .1, alpha = .2) + scale_colour_viridis_c(option = "plasma", end = .8) + theme_bw() + theme(legend.position = "top") + geom_point(data = mutate(portal_sad, sim = 1, qbin = 0, qbin_alpha = 1), aes(x = rank, y = sabund), color = "#FF0000", show.legend = F) skew_plot3 dominants <- fs_bank_mets %>% filter(rank == nspp) dominants_skew_plot <- ggplot(data = dominants, aes(x = sabund, y = skew)) + geom_point(alpha = .3) + theme_bw() dominants_skew_plot
The y axis is the coefficient values; colors correspond to skewness with the most and least skewed vectors highlighted. The red dots mark the Portal coefficients.
coeffs_skew <- fs_bank_mets %>% select(sim, skew) %>% distinct() %>% right_join(fs_leg_coeffs_M, by = "sim") coeffs_skew_plot <- ggplot(data = coeffs_skew, aes(x = coeff_index, y = coeff_value, color = skew, group = sim)) + geom_line(size = .1, alpha = .2) + geom_line(data = filter(coeffs_skew, skew <= quantile(coeffs_skew$skew, probs = .001)[1]), alpha = .5, size = .3) + geom_line(data = filter(coeffs_skew, skew >= quantile(coeffs_skew$skew, probs = 0.999)[1]), alpha = .5, size = .3) + scale_colour_viridis_c(option = "plasma", end = .8) + theme_bw() + theme(legend.position = "top") + geom_point(data = mutate(portal_leg_coeffs, sim = 1, qbin = 0, qbin_alpha = 1), aes(x =coeff_index, y= coeff_value), color = "#FF0000", show.legend = F) coeffs_skew_plot
There seems to be some relationship to skewness - skewness locates you in a particular part of the state space. I'm not sure what that means....
The x axis is skewness and the y axis is the distance of the coefficients to the centroid. Each dot is a draw from the feasible set.
skew_distance <- coeffs_skew %>% select(sim, skew) %>% distinct() %>% arrange(skew) %>% mutate(skew_rank = row_number()) %>% left_join(fs_dist, by = "sim") %>% rename(dist_rank = sim_rank) skd_plot <- ggplot(data = skew_distance, aes(x = skew, y = centroid_dist)) + geom_point() + theme_bw() skd_plot
The x axis is a vector's ranked skewness and the y axis is the ranked distance of the coefficients to the centroid. Each dot is a draw from the feasible set. The orange line is the 1:1 line.
skd_rank_plot <- ggplot(data = skew_distance, aes(x = skew_rank, y = dist_rank)) + geom_point() + theme_bw() + geom_line(aes(x = 1:max(skew_rank), y = 1:max(skew_rank)), color ="orange") skd_rank_plot
The red dot on the skewness plot is Portal.
skew_violin <- ggplot(data = skew_distance, aes(x = 0, y = skew)) + geom_violin() + geom_point(aes(x = 0, y = conditionalsads::rad_skew(portal_sad$sabund)), color = "red") + theme_bw() gridExtra::grid.arrange(grobs = list(skew_violin, centroid_dist_plot), nrow = 1)
Skewness is not equivalent to centroid distance.
skew_rank_percentile <- filter(skew_distance, skew <= conditionalsads::rad_skew(portal_sad$sabund))
Portal is very skewed compared to the bulk of the feasible set. Portal's skewness is r conditionalsads::rad_skew(portal_sad$sabund)
, which is more skewed than r (nrow(skew_rank_percentile) / nrow(skew_distance)) * 100
% of the feasible set.
coeffs_skew <- fs_bank_mets %>% select(sim, skew) %>% distinct() %>% right_join(fs_leg_coeffs_M, by = "sim") %>% select(sim, skew, coeff_value, coeff_index) %>% group_by(sim, skew) %>% mutate(coeff_sd = sd(coeff_value)) %>% ungroup() coeffs_skew_ranked <- coeffs_skew %>% select(sim, skew, coeff_sd) %>% distinct() %>% arrange(skew) %>% mutate(abund_skew_rank = row_number()) %>% arrange(coeff_sd) %>% mutate(coeff_sd_rank = row_number()) %>% ungroup() coeff_sd_plot <- ggplot(data = coeffs_skew, aes(x = coeff_index, y = coeff_value, color = coeff_sd, group = sim)) + geom_line(alpha = .5) + scale_color_viridis_c(end = .9) + theme_bw() coeff_sd_plot rank_sd_skew_plot <- ggplot(data = coeffs_skew_ranked, aes(x = abund_skew_rank, y = coeff_sd_rank)) + geom_point(alpha = .3) + theme_bw() rank_sd_skew_plot sd_skew_plot <- ggplot(data = coeffs_skew_ranked, aes(x = skew, y = coeff_sd)) + geom_point(alpha = .3) + theme_bw() sd_skew_plot
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.