R/combineSamps.R

Defines functions combineSamps

Documented in combineSamps

#' \code{combineSamps} - helper function used inside \code{tempSampPost}; see \code{tempSampPost} and \code{applySamp} for details
#' @export
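#' @examples
#' \dontrun{
#' # Hypothetical call, for illustration only: the path, species name and
#' # parameter values below are invented and not taken from the package
#' samp <- combineSamps(species = "Bombus terrestris",
#'                      indata = "model-outputs/",
#'                      keep_iter = NULL,
#'                      region = "GB",
#'                      sample_n = 999,
#'                      tolerance = 5,
#'                      combined_output = TRUE,
#'                      minObs = 50,
#'                      scaleObs = "global",
#'                      t0 = 1990,
#'                      tn = 2015,
#'                      filetype = "rds",
#'                      iter = NULL)
#' }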

combineSamps <- function(species, 
                         indata, 
                         keep_iter, 
                         region, 
                         sample_n, 
                         tolerance,
                         combined_output,
                         minObs, 
                         scaleObs,
                         t0, 
                         tn, 
                         filetype,
                         iter) { 
  
  # set up defaults
  out_dat <- NULL
  out_meta <- NULL
  raw_occ <- NULL
  nRec_glob <- NA
  nRec_reg <- NA
  nRec <- NA
  yrs <- NA
  gaps <- NULL
  rot <- NULL
  first <- NA
  last <- NA
  gap <- NA
  firstMod <- NA
  lastMod <- NA
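  # name of the monitored occupancy node in the BUGS output for the chosen region,
  # e.g. (hypothetically) region = "ENGLAND" gives "psi.fs.r_ENGLAND"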
  REGION_IN_Q <- paste0("psi.fs.r_", region)
  
  # load_rdata function
  # loads an .rdata file and returns the object stored inside it
  load_rdata <- function(fileName) {
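    # load() restores the saved object into this function's local environment; get() then
    # returns it by name (excluding the fileName argument), since its name is not known in advance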
    load(fileName)
    get(ls()[ls() != "fileName"])
  }
  
  if(!is.null(keep_iter)) {
    
    min_iter <- iter[1] # minimum iteration number
    max_iter <- iter[2] # maximum iteration number
    
    # chained models
    try(out_dat <- load_rdata(paste0(indata, species, "_", max_iter, "_1.rdata")))  # where the data is stored for JASMIN models
    try(out_meta <- load_rdata(paste0(indata, species, "_", min_iter, "_1.rdata"))) # where metadata is stored for JASMIN models
    
    # some models are nested within the objects, e.g., out_dat$out$model
    if(!is.null(out_dat) & is.null(out_dat$model)) out_dat <- out_dat$out 
    
  } else {
    
    # non-chained models
    
    if(filetype == "rds") {
      
      try(out_dat <- readRDS(paste0(indata, species, ".rds")))
      out_meta <- out_dat
      
    }
    
    else if(filetype == "rdata") {
      
      try(out_dat <- load_rdata(paste0(indata, species, ".rdata")))
      out_meta <- out_dat
      
    }
    
  }
  
  if(!is.null(out_dat$model) & !is.null(out_meta)) { # there is a model object to read from with metadata
    
    # retrieve input data
    dat <- out_meta$model$data() 
    
    # non-temporally explicit observation dataframe
    dat_df <- data.frame(year = dat$Year, # year
                         rec = dat$y, # records
                         site = dat$Site) # sites
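    
    # note: dat$Year holds year indices starting at 1 for out_meta$min_year, so calendar
    # years are converted below with year - (min_year - 1); e.g. (hypothetical values) if
    # min_year = 1970 and t0 = 1990, then t0 corresponds to year index 21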
    
    if(scaleObs == "global") {
      
      # subset to temporal window
      dat_glob <- dat_df[dat_df$year >= (t0 - (out_meta$min_year - 1)) & dat_df$year <= (tn - (out_meta$min_year - 1)), ]
      
      # number of global observations within the time window t0 to tn
      nRec_glob <- sum(dat_glob$rec)
      
    } else { # regional scale metadata
      
      # region vs region_aggs
      if(region %in% out_meta$regions) { # explicit region - region
        
        # sites within selected region
        region_site <- dat[[paste0("r_", region)]][dat$Site]
        
      } else { # aggregate region - region_aggs
        
        # this gives you the names of the regions that make up the region_agg
        region_aggs <- unlist(out_meta$region_aggs[region])
        
        # sites within selected aggregate region (i.e., sites across all nested regions)
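        # (region_site is 1 where a site falls in any constituent region, assuming the
        # constituent regions do not overlap)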
        region_site <- rowSums(sapply(region_aggs, function(x) dat[[paste0("r_", x)]][dat$Site]))
        
      }
      
      dat_reg <- cbind(dat_df, region_site) # sites included within region
      
      # subset to temporal window
      dat_reg <- dat_reg[dat_reg$region_site == 1 & dat_reg$year >= (t0 - (out_meta$min_year - 1)) & dat_reg$year <= (tn - (out_meta$min_year - 1)), ]
      
      # number of observations within the region within the time window t0 to tn
      nRec_reg <- sum(dat_reg$rec)
      
    }
    
  }
  
  if(scaleObs == "global") {
    # global number of observations
    nRec <- nRec_glob
  } else {
    # regional number of observations
    nRec <- nRec_reg
  }
  
  print(paste0("load: ", species, ", ", scaleObs, " records: ", nRec))
  
  if(nRec >= minObs & # there are enough observations globally or in region
     region %in% c(out_meta$regions, names(out_meta$region_aggs)) & # the region or region_agg is listed for the species
     !is.null(out_dat$model) # there is a model object to read from
  ) { # the conditions are met
    
    if(!is.null(keep_iter)) {
      
      # chained models
      
      if(max_iter <= 20000) {
        
        # chained models run for <= 20,000 iterations: the three chains are stored in separate files and must be combined
        out_dat1 <- NULL
        out_dat2 <- NULL
        out_dat3 <- NULL
        
        try(out_dat1 <- load_rdata(paste0(indata, species, "_", max_iter, "_1.rdata"))) # where occupancy data is stored for JASMIN models 
        raw_occ1 <- data.frame(out_dat1$BUGSoutput$sims.list[REGION_IN_Q])
        try(out_dat2 <- load_rdata(paste0(indata, species, "_", max_iter, "_2.rdata"))) # where occupancy data is stored for JASMIN models 
        raw_occ2 <- data.frame(out_dat2$BUGSoutput$sims.list[REGION_IN_Q])
        try(out_dat3 <- load_rdata(paste0(indata, species, "_", max_iter, "_3.rdata"))) # where occupancy data is stored for JASMIN models 
        raw_occ3 <- data.frame(out_dat3$BUGSoutput$sims.list[REGION_IN_Q])
        
        if(!is.null(out_dat1) & !is.null(out_dat2) & !is.null(out_dat3)) # if all models loaded correctly
          raw_occ <- rbind(raw_occ1, raw_occ2, raw_occ3)
        
      } else {
        
        # chained models run for > 20,000 iterations (e.g. 32,000): the chains are already combined in out_dat
        raw_occ <- data.frame(out_dat$BUGSoutput$sims.list[REGION_IN_Q])
        
      }
        
    } else {
      
      # non-chained models
      raw_occ <- data.frame(out_dat$BUGSoutput$sims.list[REGION_IN_Q])
      
    }
    
    if(!is.null(raw_occ)) {
      
      # check whether the number of sims is enough to sample 
      # first calculate the difference between n.sims and sample_n.
      # positive numbers indicate we have more than we need
      if(!is.null(keep_iter)) {
        
        # chained models - sims from three chains
        diff <- (out_dat$BUGSoutput$n.sims * 3) - sample_n
        
      } else {
        
        diff <- out_dat$BUGSoutput$n.sims - sample_n
        
      }
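      
      # illustrative example (hypothetical numbers): with 3999 sims in the loaded object,
      # three chains give 11997 sims; if sample_n = 1000 and tolerance = 100, diff = 10997
      # exceeds the tolerance, so the posterior is subsampled down to sample_n rows below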
      
      if(diff > tolerance) {
        
        # we have more sims in the model than we want, so we need to sample them
        raw_occ <- raw_occ[sample(1:nrow(raw_occ), sample_n), ]
        
      } else if(abs(diff) <= tolerance) {
        
        # the number of sims is very close to the target, so no need to sample
        print(paste("no sampling required: n.sims =", out_dat$BUGSoutput$n.sims))
        
      } else {
        
        stop("Error: Not enough iterations stored. Choose a smaller value of sample_n")
        
      }
      
      colnames(raw_occ) <- paste("year_", out_meta$min_year:out_meta$max_year, sep = "")
      
      # label the retained iterations; seq_len(nrow()) avoids a length mismatch when the
      # number of sims is within tolerance of, but not exactly equal to, sample_n
      raw_occ$iteration <- seq_len(nrow(raw_occ))
      raw_occ$species <- species
      
      if(combined_output != TRUE) {
        # write per-species samples to disk; output_path is assumed to be available from the
        # enclosing tempSampPost environment (it is not an argument of combineSamps)
        write.csv(raw_occ,
                  file = paste0(output_path, species, "_sample_", sample_n, "_post_", REGION_IN_Q, ".csv"),
                  row.names = FALSE)
      } 
      
      out1 <- raw_occ
      
      if(scaleObs == "global") 
        datm <- dat_glob # temporally explicit global scale metadata
      else
        datm <- dat_reg # temporally explicit regional scale metadata
      
      first <- min(datm$year[datm$rec == 1]) + (out_meta$min_year - 1)
      last <- max(datm$year[datm$rec == 1]) + (out_meta$min_year - 1)
      
      firstMod <- t0
      
      lastMod <- tn
      
      yrs <- sort(unique(datm$year[datm$rec == 1]), decreasing = FALSE)
      
      if (length(yrs) > 1) {
        
        # gaps (in years) between successive years with records
        for (i in 1:(length(yrs) - 1)) {
          gaps <- c(gaps, yrs[i + 1] - yrs[i])
        }
      }
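      
      # equivalently, the gaps could be computed in one step as gaps <- diff(yrs)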
      
      if (!is.null(gaps)) {
        
        gap <- max(gaps)
        
      } else {
        gap <- 1
      } 
      
      # metadata data frame
      out2 <- data.frame(species, nRec_glob, nRec_reg, first, last, gap, firstMod, lastMod)
      
      # add rules of thumb metrics
      if(!is.null((attr(out_meta, "metadata")$analysis$spp_Metrics)))
        rot <- as.data.frame(attr(out_meta, "metadata")$analysis$spp_Metrics)
      else # if model doesn't have rule of thumb data
        rot <- data.frame(median = NA, P90 = NA, visits_median = NA, visits_P90 = NA, prop_list_one = NA, prop_repeats_grp = NA, prop_abs = NA)
      
      # EqualWt and HighSpec decision trees (see https://www.biorxiv.org/content/10.1101/813626v1.full)
      rot$EqualWt <- ifelse(rot$prop_abs >= 0.990, rot$P90 >= 3.1, rot$P90 >= 6.7)
      rot$HighSpec <- ifelse(rot$prop_abs >= 0.958, rot$P90 >= 9.5, rot$P90 >= 29)
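      
      # note: if the rule-of-thumb metrics are missing (all NA above), EqualWt and HighSpec will also be NA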
      
      # join rules of thumb data
      out2 <- cbind(out2, rot)
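      
      # the returned list holds the posterior occupancy samples (one row per retained
      # iteration, one column per year) and the one-row metadata summary for the species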
      
      print(paste("Sampled:", species))
      
      return(list(out1, out2))
      
    } else {
      
      print(paste("Error loading model:", species)) # this can happen for JASMIN models if one of the models in the chain doesn't load properly
      
      return(NULL)
      
    }
    
  } else {
    
    # informative messages
    if(!is.na(nRec) & nRec < minObs) 
      print(paste("Dropped (lack of observations):", species)) 
    else if(!is.na(nRec) & !region %in% c(out_meta$regions, names(out_meta$region_aggs)))
      print(paste("Dropped (region or region_aggs not present for species):", species))
    else print(paste("Error loading model:", species))
    
    return(NULL)
  }
}