R/hurdleModel.R

Defines functions hurdleModel

Documented in hurdleModel

#' Run binomial logistic models to predict species' occurrences and gaussian CRFs
#' to predict species' local abundances.
#'
#'@importFrom magrittr %>%
#'
#'@param flyway A \code{character} string representing the name of the specified flyway to model
#'@param n_reps Positive \code{integer} representing the number of times to repeat
#'10-fold \code{\link[glmnet]{cv.glmnet}} regularized regressions (default is \code{10})
#'@param n_cores Positive integer stating the number of processing cores to split the job across.
#'Default is \code{parallel::detect_cores() - 1}
#'
#'@details This function runs the hurdle model by first estimating predictors of species' occurrence
#'probability using \code{lassoBinomial} and then fitting a conditional random field to species'
#'scaled abundances using \code{lassoGaussian}. Each of these models is run \code{n_reps} times
#'to allow for uncertainty in coefficient estimates. Predictions are then used to estimate model fits and
#'predict community network \emph{B}'os and species' eigencentrality in each site. All models are
#'run for sites within a unique bioregion, ensuring that network metrics reflect variation among
#'sites with similar climate and species compositions.
#'
#'@export
#'
hurdleModel = function(flyway, n_reps, n_cores){

  if(missing(n_reps)){
    n_reps <- 10
  }

# Split into regions
unique_regions <- all_regions %>%
                             dplyr::filter(Site.group == flyway) %>%
                             dplyr::select(Flyway.region.unique.id) %>%
                             dplyr::distinct()

unique_regions <- as.vector(unique_regions$Flyway.region.unique.id)

#### Run separate models for each region within a flyway ####
region_mods <- lapply(seq_len(length(unique_regions)), function(j){

  region_rows <- which(all_regions$Site.group == flyway &
                         all_regions$Flyway.region.unique.id == unique_regions[j])

  # Extract abundance and binary occurrence data for the specified region
  region_abund <- BBS.abundances[region_rows, ]
  region_bin <- BBS.occurrences[region_rows, ]

  # Extract coordinates and altitude to ensure their inclusion in spatial network models
  coords <- all_regions %>%
    dplyr::select(ID, Latitude, Longitude, Year, Altitude)
  coords <- coords[region_rows, ]

  # Remove species occurring in fewer than 15% of observations
  low_occur_cols <- which((colSums(region_bin) / nrow(region_bin)) < 0.15)

  if(length(low_occur_cols) == 0){
    region_bin <- region_bin
    region_abund <- region_abund

  } else {

    region_bin <- region_bin[, -low_occur_cols]
    region_abund <- region_abund[, -low_occur_cols]
  }

  # Species that are too common cannot be assessed in occurrence models
  high_occur_cols <- which((colSums(region_bin) / nrow(region_bin)) > 0.9)

  if(length(high_occur_cols) == 0){
    region_bin_outcome <- region_bin
    region_abund_predict <- region_abund

  } else {

    region_bin_outcome <- region_bin[, -high_occur_cols]
    region_abund_predict <- region_abund[, -high_occur_cols]
  }

  # Remove un-needed rows from covariates data and extract climate / landcover covariates
  # note, year is not included as temporal autocorrelation will be
  # difficult to detect without at least 20 - 25 years worth of data
  # see (Teller et al., 2016) at
  # https://besjournals.onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12486)
  covs_test <- c("Latitude", "Prop.forest",
                 "Prop.shrubland",
                 "Prop.grassland",
                 "Prop.wetland",
                 "Prop.anthro.urban",
                 "Prop.anthro.cropland",
                 "Tot.precip.yr",
                 "Precip.seas.yr",
                 "Mean.temp.yr",
                 "Mean.min.temp.yr",
                 "Mean.max.temp.yr",
                 "Temp.seas.yr",
                 "Tot.precip.spr",
                 "Mean.temp.spr",
                 "Max.temp.spr",
                 "Min.temp.spr",
                 "Mean.min.temp.spr",
                 "Mean.max.temp.spr",
                 "Altitude", "NDVI")

  covariates <- all_regions[region_rows, covs_test]

  # Remove predictors with strong collinearity (based on pearson correlations)
  covs_remove <- caret::findCorrelation(cor(covariates,
                                            use = "na.or.complete"),
                                        cutoff = .99, exact = T)
  removed_covs <- names(covariates)[covs_remove]

  if(length(covs_remove) == 0){
    covariates <- covariates

  } else {
    covariates <- covariates[, -covs_remove]
  }

  # A small number of year * site observations may still have missing landcover values
  # here we remove these observations from all datasets
  row_has_na <- apply(covariates, 1, function(x){any(is.na(x))})
  covariates <- as.data.frame(covariates[!row_has_na, ])
  region_bin <- as.data.frame(region_bin[!row_has_na, ])
  region_bin_outcome <- as.data.frame(region_bin_outcome[!row_has_na, ])
  region_abund <- as.data.frame(region_abund[!row_has_na, ])
  region_abund_predict <- as.data.frame(region_abund_predict[!row_has_na, ])
  coords <- coords[!row_has_na, ]

  # Bin very large counts at the 99th percentile (these likely add no extra insight, only noise)
  top_count <- ceiling(quantile(as.vector(as.matrix(region_abund)), probs = 0.99))

  region_abund[region_abund > top_count] <- top_count
  region_abund_predict[region_abund_predict > top_count] <- top_count

  # Scale all continuous covariates so magnitudes of effects can be compared
  covariates <- data.frame(covariates %>%
                            dplyr::mutate_if(is.numeric, funs(as.vector(scale(.)))))

  #### Run LASSO occurrence and abundance models ####
  cat('Fitting binary occurrence model ...\n')
  occurrence_mod <- lassoBinomial_comm(outcome_data = region_bin_outcome,
                                       count_data = region_abund_predict,
                                       covariates = covariates,
                                       n_reps = n_reps, n_cores = n_cores)

  cat('Fitting gaussian abundance model ...\n')
  abundance_mod <- lassoAbund_comm(outcome_data = region_abund,
                                   binary_data = region_bin,
                                   covariates = covariates,
                                   n_reps = n_reps, n_cores = n_cores)

  #### Calculate predictive metrics for the binomial model ####
  cat('Calculating binary model fit ...\n')
  occurrence_mets <- lassoBinomial_metrics(outcome_data = region_bin_outcome,
                                           count_data = region_abund_predict,
                                           covariates = covariates,
                                           lassoBinomial = occurrence_mod,
                                           n_cores = n_cores)

  #### Predict network centrality and local network beta diversity (B'os) ####
  cat('Calculating predicted abundances ...\n')
  abund_predictions <- MRFcov::predict_MRF(data = cbind(region_abund, covariates),
                                           MRF_mod = abundance_mod, n_cores = n_cores)

  cat('Predicting species centralities ...\n')
  abundance_cent <- MRFcov::predict_MRFnetworks(data = cbind(region_abund, covariates),
                                                MRF_mod = abundance_mod, metric = "eigencentrality",
                                                cutoff = 2,
                                                omit_zeros = TRUE,
                                                n_cores = n_cores,
                                                cached_predictions = abund_predictions)

  network_metrics <- cbind(abundance_cent, covariates)

  cat('Predicting network beta diversities ...\n')
  abundance_adj <- MRFcov::predict_MRFnetworks(data = cbind(region_abund, covariates),
                                               MRF_mod = abundance_mod, metric = 'adjacency',
                                               cutoff = 2,
                                               omit_zeros = TRUE,
                                               n_cores = n_cores,
                                               cached_predictions = abund_predictions)

  beta_os_primes <- networkBetaOS(abundance_adj, n_cores = n_cores)

  network_metrics$beta_os <- beta_os_primes

  rm(abundance_cent, abundance_adj)

  #### Assess model fit from abundance model ####
  cat('Calculating gaussian abundance model fit ...\n')
  abund_metrics <- MRFcov::cv_MRF_diag(data = cbind(region_abund, covariates),
                                      n_nodes = nrow(abundance_mod$direct_coefs),
                                      n_folds = 10, n_cores = 1, family = 'poisson',
                                      compare_null = FALSE, plot = FALSE,
                                      cached_model = list(mrf = abundance_mod),
                                      cached_predictions = list(predictions = abund_predictions),
                                      sample_seed = 1)
  cat('Finished regional run ...\n')
  list(occurrence_mod = occurrence_mod,
       occurrence_spec = quantile(occurrence_mets$mean_specificity,
                                  probs = c(0.025, 0.5, 0.975),
                                  na.rm = TRUE),
       occurrence_sens = quantile(occurrence_mets$mean_sensitivity,
                                  probs = c(0.025, 0.5, 0.975),
                                  na.rm = TRUE),
       abundance_mod = abundance_mod,
       network_metrics = network_metrics,
       abund_Rsquared = quantile(abund_metrics$Rsquared,
                                 probs = c(0.025, 0.5, 0.975),
                                 na.rm = TRUE),
       coordinates = coords,
       removed_covs = removed_covs,
       top_count = top_count)
})

names(region_mods) <- unique_regions
return(region_mods)
}
nicholasjclark/BBS.occurrences documentation built on July 19, 2020, 8:31 p.m.