R/08-plot-embeddings.R

# Visualizing neural network weights --------------------------------------

library(tidyverse)
library(patchwork)
library(RColorBrewer)
library(viridis)
library(Rtsne)
library(vroom)
library(lsa)
library(pbapply)


theme_set(theme_minimal() + 
            theme(panel.grid.minor = element_blank()))

bbs_species <- read_csv('data/cleaned/bbs_species.csv') %>%
  mutate(sp_lower = tolower(english))

bbs_routes <- read_csv('data/cleaned/clean_routes.csv')


weight_files <- list.files(path = 'out/weights', pattern = ".csv", full.names = TRUE)
en_files <- basename(weight_files) %>%
  gsub('_weights.csv', '', .) %>%
  gsub('_', ' ', .)



joined_df <- weight_files %>%
  lapply(read_csv) %>%
  bind_rows(.id = 'file') %>%
  mutate(file = parse_integer(file), 
         sp_lower = en_files[file]) %>%
  select(-file) %>%
  filter(par != 'p') %>%
  unite('par_h', c(par, h_dim)) %>%
  spread(par_h, value) %>%
  left_join(distinct(bbs_species, english, sp_lower)) %>%
  arrange(english) %>%
  select(-sp_lower)


# Cosine similarity analysis ----------------------------------------------

X <- joined_df %>%
  select(-english) %>%
  as.matrix
rownames(X) <- joined_df$english

sim <- cosine(t(X))

# cosine dissimilarity (distance) matrix
D_sim <- as.dist(1 - sim)

cosine_sim <- joined_df %>%
  select(english)

cosine_sim$nearest_species <- NA
cosine_sim$similarity <- NA
cosine_sim$next_nearest_species <- NA
cosine_sim$next_similarity <- NA
cosine_sim$farthest_species <- NA
cosine_sim$far_similarity <- NA

for (i in 1:nrow(cosine_sim)) {
  sim_vec <- sim[which(joined_df$english == cosine_sim$english[i]), ]
  names(sim_vec) <- joined_df$english
  
  nearest <- sort(sim_vec, decreasing = TRUE)[2]
  cosine_sim$similarity[i] <- nearest
  cosine_sim$nearest_species[i] <- names(nearest)
  
  next_nearest <- sort(sim_vec, decreasing = TRUE)[3]
  cosine_sim$next_nearest_species[i] <- names(next_nearest)
  cosine_sim$next_similarity[i] <- next_nearest
  
  farthest <- sort(sim_vec)[1]
  cosine_sim$far_similarity[i] <- farthest
  cosine_sim$farthest_species[i] <- names(farthest)
}

write_csv(cosine_sim, 'out/cosine_sim.csv')


make_species_plot <- function(sp1, sp2, legend = TRUE) {

  sp1_df <- sp1 %>%
    paste0(., '_finalnet.csv') %>%
    file.path('out', .) %>%
    read_csv %>%
    distinct(sp.bbs) %>%
    unlist %>%
    paste0('.csv') %>%
    file.path('out', 'q_dfs', .) %>%
    read_csv %>%
    select(sp.bbs, route_id, year, psi) %>%
    mutate(sp_lower = gsub("_", " ", sp1)) %>%
    left_join(distinct(bbs_species, sp_lower, english)) %>%
    rename(sp1_psi = psi) %>%
    select(route_id, year, ends_with("psi"), english)
  
  sp2_df <- sp2 %>%
    paste0(., '_finalnet.csv') %>%
    file.path('out', .) %>%
    read_csv %>%
    distinct(sp.bbs) %>%
    unlist %>%
    paste0('.csv') %>%
    file.path('out', 'q_dfs', .) %>%
    read_csv %>%
    select(sp.bbs, route_id, year, psi) %>%
    mutate(sp_lower = gsub("_", " ", sp2)) %>%
    left_join(distinct(bbs_species, sp_lower, english)) %>%
    rename(sp2_psi = psi) %>%
    select(route_id, year, ends_with("psi"), english)
  
  sp1_name <- unique(sp1_df$english)
  sp2_name <- unique(sp2_df$english)
  
  l1_labels <- read_csv('data/cleaned/routes.csv') %>%
    distinct(route_id, L1_KEY) %>%
    mutate(level_1_ecoregion = tools::toTitleCase(tolower(L1_KEY)), 
           level_1_ecoregion = gsub("[0-9]", "", level_1_ecoregion), 
           level_1_ecoregion = trimws(level_1_ecoregion))
  l1_order <- count(l1_labels, level_1_ecoregion) %>% arrange(-n)
  l1_labels$level_1_ecoregion <- factor(l1_labels$level_1_ecoregion, 
                                        levels = l1_order$level_1_ecoregion)
  
  p <- sp1_df %>%
    select(-english) %>%
    full_join(select(sp2_df, -english)) %>%
    left_join(l1_labels) %>%
    left_join(read_csv('data/cleaned/clean_routes.csv')) %>%
    filter(group != 'test', level_1_ecoregion != 'Water') %>%
    arrange(level_1_ecoregion) %>%
    ggplot(aes(x = sp1_psi, y = sp2_psi, 
               group = route_id, color = level_1_ecoregion)) + 
    geom_path(alpha = .2, size = .5) + 
    xlab(substitute(paste(sp1_name), 
                    list(sp1_name = sp1_name))) + 
    ylab(substitute(paste(sp2_name), 
                    list(sp2_name = sp2_name))) + 
    scale_color_brewer(palette = 'Paired', '') + 
    guides(colour = guide_legend(override.aes = list(alpha = 1, size = 1)))
  if (!legend) {
    p <- p + theme(legend.position = 'none')
  }
  p
}

make_pair_plot <- function(ref_sp, legend = TRUE, title = "", annotate = TRUE) {
  cosine_df <- cosine_sim %>%
    filter(english == ref_sp)
  
  focal_lower <- tolower(ref_sp) %>%
    gsub(' ', '_', .)
  nearest_lower <- tolower(cosine_df$nearest_species) %>%
    gsub(' ', '_', .)
  next_nearest_lower <- tolower(cosine_df$next_nearest_species) %>%
    gsub(' ', '_', .)
  farthest_lower <- tolower(cosine_df$farthest_species) %>%
    gsub(' ', '_', .)
  
  p_near <-  make_species_plot(sp1 = nearest_lower, sp2 = focal_lower, 
                               legend = FALSE) +
    ggtitle(title)
  
  p_far <- make_species_plot(sp1 = farthest_lower, sp2 = focal_lower, 
                             legend = legend) + 
    theme(axis.text.y = element_blank()) + 
    ylab("")
  
  if (annotate) {
    p_near <- p_near + 
      annotate(geom = "text", x = -Inf, y = Inf, hjust = 0, vjust = 1,
               label = paste("Similarity:", round(cosine_df$similarity, 2)))
    p_far <- p_far + 
      annotate(geom = "text", x = Inf, y = Inf, hjust = 1, vjust = 1,
               label = paste("Similarity:", round(cosine_df$far_similarity, 2)))
  }
  
  p <- p_near + p_far
  p
}

p <- make_pair_plot("Mourning Dove", legend = FALSE, title = "(a)") / 
  make_pair_plot("Eurasian Collared-Dove", title = "(b)") / 
  make_pair_plot("Bald Eagle", legend = FALSE, title = "(c)")
p

ex_p <- make_pair_plot("Mourning Dove", legend = FALSE, title = "", annotate = FALSE)
ggsave(filename = "slides/visec2020/figs/ex-assoc.pdf", width = 7, height = 3)


r_dpi <- 150


pwidth <- 9
pheight <- 6
p %>%
  {
    ggsave(filename = 'fig/occupancy_scatter.jpg', plot = ., 
           width = pwidth, height = pheight,
           dpi = r_dpi);
    ggsave(filename = 'fig/occupancy_scatter.pdf', plot = ., 
           width = pwidth, height = pheight);
  }



# Parse route embeddings --------------------------------------------------

route_embeddings <- vroom('out/route_embeddings.csv')
route_embeddings$route_id <- vroom('out/american_robin_finalnet.csv')$route_id[route_embeddings$row_idx + 1]

psi0_embeddings <- route_embeddings %>%
  filter(par == 'psi0') %>%
  select(-t, -row_idx) %>%
  unite(par_h_dim, par, h_dim) %>%
  spread(par_h_dim, value)

embed_wide <- route_embeddings %>%
  filter(par != 'psi0') %>%
  select(-row_idx) %>%
  unite(par_h_dim, par, h_dim, t) %>%
  spread(par_h_dim, value) %>%
  left_join(read_csv('data/cleaned/clean_routes.csv')) %>%
  arrange(Longitude, Latitude) %>%
  left_join(psi0_embeddings) %>%
  left_join(distinct(bbs_routes, route_id, group)) %>%
  filter(group != 'test')

unique_embed <- embed_wide %>%
  select(starts_with('gamma'), starts_with('phi_'), 
         starts_with('psi0_'), starts_with('p_')) %>%
  distinct

set.seed(0)
r_tsne <- unique_embed %>%
  Rtsne(verbose = TRUE, pca = FALSE, 
        num_threads = parallel::detectCores())

colnames(r_tsne$Y) <- c("d1", "d2")
raw_tsne_df <- as_tibble(r_tsne$Y) 
tsne_df <- unique_embed %>%
  bind_cols(raw_tsne_df) %>%
  right_join(embed_wide) %>% 
  left_join(vroom('data/cleaned/clean_routes.csv') %>%
              distinct(route_id, Longitude, Latitude, L1_KEY, L2_KEY)) %>%
  mutate(level_1_ecoregion = tools::toTitleCase(tolower(L1_KEY)), 
         level_1_ecoregion = gsub("[0-9]", "", level_1_ecoregion), 
         level_1_ecoregion = trimws(level_1_ecoregion))

pt_size <- .7
route_tsne_p1 <- tsne_df %>%
  filter(!(level_1_ecoregion %in% c('Hudson Plain', 'Water'))) %>%
  ggplot(aes(d1, d2, color = level_1_ecoregion)) + 
  geom_point(size = pt_size * 1.5, alpha = .5) +
  scale_color_brewer(palette = 'Paired', '') + 
  xlab("t-SNE dimension 1") + 
  ylab("t-SNE dimension 2") + 
  guides(colour = guide_legend(override.aes = list(alpha = 1, size = 1.5))) + 
  theme(legend.position = 'left')
route_tsne_p2 <- tsne_df %>%
  select(d1, d2, PC1, starts_with("c_")) %>%
  gather(var, value, -d1, -d2) %>%
  group_by(var) %>%
  mutate(value = c(scale(value))) %>%
  ungroup %>%
  mutate(var = case_when(
           var == 'c_elevation' ~ 'Elevation', 
           var == 'c_lat' ~ 'Latitude', 
           var == 'c_lon' ~ 'Longitude',
           var == 'c_road_den' ~ 'Road density',
           var == 'c_dist_shore' ~ 'Distance from coast',
           TRUE ~ var
         )) %>%
  ggplot(aes(d1, d2, color = value)) + 
  geom_point(size = pt_size, alpha = .5) +
  scale_color_gradient2("Z-score", mid = 'grey95') + 
  xlab("") + 
  ylab("") + 
  facet_wrap(~var)
route_tsne_plot <- route_tsne_p1 + ggtitle("(a)") + 
  route_tsne_p2 + ggtitle("(b)") + 
  plot_layout(nrow = 1, widths = c(1, 1.5))
route_tsne_plot
tsne_w <- 11
tsne_h <- 3.5
route_tsne_plot  %>%
  {
    ggsave(filename = 'fig/route_tsne.jpg', plot = ., 
           width = tsne_w, height = tsne_h,
           dpi = r_dpi);
    ggsave(filename = 'fig/route_tsne.pdf', plot = ., 
           width = tsne_w, height = tsne_h);
  }
mbjoseph/neuralecology documentation built on Nov. 6, 2022, 3:48 p.m.