In karinorman/biodivTS: What the Package Does (One Line, Title Case)

knitr::opts_chunk$set(echo = TRUE)

library(tidyverse)
library(furrr)
library(FD)
library(biodivTS)

source(here::here("R", "dbFD_mine.R"))

pins::board_register_github(repo = "karinorman/biodivTS", branch = "master")

Get FD metrics for each study

FD function:

get_fd <- function(obs_data, filter_plot, ...){
  require(FD)

  data <- obs_data %>%
    filter(plot_id == filter_plot)

  fs::dir_create(paste0(here::here("data"), "/fd_attempt"))
  write_tsv(data.frame(), paste0(here::here("data"), "/fd_attempt/", filter_plot, ".tsv"))

  # #check which column the abundaces weights are stored in (abundance or biomass)
  # if (sum(data$year_abun) == 0) {weight <- "year_biomass"} else {weight <- "year_abun"}

  species_mat <- data %>%
    select(year, id, weight) %>%
    arrange(id) %>%
    pivot_wider(names_from = id, values_from = weight, values_fill = 0)

  years <- select(species_mat, year)

  trait_mat <- data %>%
    select(-c(biotimeName, taxa, id_species, plot, year, weight, 
              traitName, scientificName, plot_id, study_id)
    ) %>%
    distinct() %>%
    select_if(~ length(unique(na.omit(.))) > 1)  #remove columns that have the same value for all columns

  #get binary variables so they can be excluded from rescaling 
  bin_vars <- map(trait_mat, ~ all(na.omit(.) %in% 0:1))

  trait_mat <- trait_mat %>%
    mutate_at(vars(-c(id, names(bin_vars[bin_vars == TRUE])), -ends_with("5cat")), 
              list(~as.numeric(scale(.)))) %>% #rescale variables, not binary or categorical
    arrange(id) %>%
    column_to_rownames("id")

  fs::dir_create(paste0(here::here("data"), "/fd_output"))

  #the convexhull c code creates an output file of vertices, in order to not have multiple threads writing
  #to the same file we have to create individual working directories
  tmp_dir <- paste0(here::here("data", "fd_output", "tmp"), "/tmp_", filter_plot)
  fs::dir_create(tmp_dir)
  setwd(tmp_dir)

  fd_data <- dbFD_mine(trait_mat, select(species_mat, -year), w.abun = TRUE, ...)
  fd_out <- as_tibble(fd_data$fd_met)[,1:8] %>%
    cbind(years, .) %>%
    mutate(plot_id = unlist(filter_plot), trait_count = dim(trait_mat)[2])

    write_tsv(fd_out, paste0(here::here("data"), "/fd_output/", filter_plot, "_fd.tsv"))
    write_tsv(fd_data$pca_traits, paste0(here::here("data"), "/fd_output/", filter_plot, "_traits.tsv"))

  setwd(here::here())
  print(filter_plot)
  #return(fd_out)
}

get_fd_safe <- possibly(get_fd, NA)

First pass at calculating FD metrics from FD package:

data <- pins::pin_get("biotime-traits", board = "github")

plots <- data %>%
  pull(plot_id) %>%
  unique()

system.time(fd <- map(plots, get_fd_safe, obs_data = data, m = "min", corr = "cailliez"))

The dataset is too big to copy to each parallel thread, so we split the dataset into chunks

#number of chunks to split original dataset into 
num_groups = 4

plot_sizes <- data %>%
  count(plot_id) %>%
  arrange(desc(n)) %>%
  mutate(group = rep_len(1:num_groups, dim(.)[1]))

get_parallel_FD <- function(df, group_num){
  group_plots <- plot_sizes %>% filter(group == group_num)
  group_data <- df %>% filter(plot_id %in% group_plots$plot_id)

  future::plan("multiprocess", workers = 4)
  fd <- future_map(group_plots$plot_id, get_fd_safe, obs_data = group_data, 
                   m = "min", corr = "cailliez") %>%
    rbind()

  print(group_num)
  return(fd)
}

fd <- map(1:num_groups, get_parallel_FD, df = data)

Read data back in, excluding sites/plots as they become a problem:

files <-  dir(here::here("data", "fd_output"), "*_fd.tsv")

fd_data <- files %>%
  paste0(here::here("data", "fd_output"), "/", .) %>%
  map_dfr(~read_tsv(.x, col_types = "dddddddddcd")) #%>% mutate(plot_id = as.character(plot_id)))

remain <- data %>%
  filter(!plot_id %in% fd_data$plot_id)

plot_sizes <- remain %>%
  count(plot_id) 

get_fd_error <- safely(get_fd)
system.time(fd <- map(plot_sizes$plot_id, get_fd_error, obs_data = data, m = "min", corr = "cailliez"))
system.time(fd <- map(plot_sizes$plot_id, get_fd_safe, obs_data = data,  m = "min", corr = "cailliez"))

Format FD table with null values

null_table <- pins::pin_get("null-table", board = "github")

fd_table <- fd_data %>% 
  select(year, plot_id, richness = nbsp, everything(), -sing.sp) %>%
  gather(metric, value, contains("F"), RaoQ) %>%
  left_join(biotime_traits %>% 
              select(plot_id, study_id, plot) %>%
                       distinct()) %>%
  left_join(null_table %>%
              select(plot_id, year, richness, metric, mean, sd), 
            by = c("year", "plot_id", "richness", "metric")) %>%
  mutate(SES = (value - mean)/sd)

usethis::use_data(fd_table)
pins::pin(fd_table, board = "github")