S4DM.R
In S4DM: Small Sample Size Species Distribution Modeling

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----get data, fig.show='hold'------------------------------------------------


# Load libraries

library(BIEN)
library(geodata)
library(ggplot2)
library(S4DM)
library(sf)
library(terra)
library(tidyterra)

# Make a temporary directory to store climate data

  temp <- tempdir()

# Get some occurrence data

  #tv <- BIEN_occurrence_species(species = "Trillium vaseyi")
  data("sample_points")
  
  
# Get environmental data
# To make things a bit faster and easier, we'll limit ourselves to the 2 variables (mean temperature and annual precipitation)

  
  # env <- worldclim_global(var = "bio",
  #                          res = 10,
  #                          path = temp)
  # env <- env[[c(1,12)]]

  env <- rast(system.file('ex/sample_env.tif', package="S4DM"))  
  
# And we'll rescale the variables as well

  env <- scale(env)

# Just to take a look to make sure we didn't mess anything up

plot(env)


## ----rangebagging, echo=FALSE, results='asis'---------------------------------

data("sample_points")

sample_points_rangebagged <- 
make_range_map(occurrences = sample_points[c("longitude","latitude")],
               env = env,
               presence_method = "rangebagging",
               background_method = "none",
               background_buffer_width = 100000)


#Lets see what it looks like

# convert to polygon for easier visualization

sample_points_rangebagged_polygon <-
  sample_points_rangebagged |>
  as.polygons() |>
  st_as_sf()

# get a bbox for plotting

sample_points_bbox <-
sample_points_rangebagged_polygon |>
  st_buffer(dist = 500000) |>
  st_bbox()

#Now, we'll plot the standardized temperature raster, along with the occurrence records and the range map 

ggplot(env)+
  geom_raster(mapping = aes(x=x,y=y,fill=wc2.1_10m_bio_1))+
  scale_fill_viridis_c(name="Temp. C", na.value = "transparent")+
  scale_x_continuous(expand=c(0,0),
                     limits = c(sample_points_bbox[1],sample_points_bbox[3]))+
    scale_y_continuous(limits = c(sample_points_bbox[2],sample_points_bbox[4]),
                     expand=c(0,0))+
  theme_bw()+
  geom_sf(data = sample_points_rangebagged_polygon,
          fill = "grey",
          size=2,
          alpha=0.5)+
    geom_point(data = sample_points,
             mapping = aes(x=longitude,y=latitude))




## -----------------------------------------------------------------------------


# Here, we'll use the same data as before for Trillium vaseyi.

#First, we'll select the background data

sample_points_bg <- get_env_bg(coords = sample_points[c("longitude","latitude")],
                    env = env,
                    width = 50000,
                    standardize = TRUE) #note that we used a small set of background points to expedite model fitting

# The returned object 'xs_bg' contains two objects:
  # 1) sample_points_bg$env a matrix of environmental covariates. This is what we need for modeling.
  # 2) sample_points_bg$bg_cells a vector containing the environmental raster cell IDs that are present in tv_bg$env. This is useful for mapping the results.

# Next, we get the presence data:

  sample_points_presence <- get_env_pres(coords = sample_points[c("longitude","latitude")],
                              env = env,
                              env_bg = sample_points_bg)

#The returned object 'tv_presence' contains two objects:

  # 1) tv_presence$env a matrix of environmental covariates. This is what we need for modeling.
  # 2) tv_presence$occurrence_sf a sf object containing the coordinate data. This is useful for conducting spatially stratified cross-validation.




# Now, we can fit the model.  Previously we used rangebagging, this time we'll use a simple KDE estimation

  sample_points_kde_kde <- fit_plug_and_play(presence = sample_points_presence$env,
                                  background = sample_points_bg$env,
                                  method = "kde")


# The object that was returned is of the class "pnp_model", which is essentially a list of model fits and associated metadata.

# To view this data on a map, we can project it to the background data we used in fitting (or we could project to a new location entirely). In either case, we use the function `project_plu_and_play`.


  sample_points_kde_kde_predictions <- project_plug_and_play(pnp_model = sample_points_kde_kde,
                                                  data = sample_points_bg$env)



#Now we can make a blank raster to store our predictions
  
  sample_points_kde_kde_raster <- env[[1]]

  values(sample_points_kde_kde_raster) <- NA

#Add our predictions to the raster

  sample_points_kde_kde_raster[sample_points_bg$bg_cells] <-
    sample_points_kde_kde_predictions

#Now, we can plot our raster

  plot(sample_points_kde_kde_raster,
       xlim = c(sample_points_bbox[1],sample_points_bbox[3]),
       ylim = c(sample_points_bbox[2],sample_points_bbox[4]))
  points(sample_points[c("longitude","latitude")])
  

## ----thresholding-------------------------------------------------------------

#To threshold this continuous raster to yield a binary raster

sample_points_kde_kde_raster <- sdm_threshold(prediction_raster = sample_points_kde_kde_raster,
                                   occurrence_sf = sample_points_presence$occurrence_sf,
                                   quantile = 0.05,
                                   return_binary = T)


# As before, we'll plot this on top of temperature and occurrence records to see how well we did

  # convert to polygon for easier visualization
  
    sample_points_kde_kde_polygon <-
      sample_points_kde_kde_raster |>
      as.polygons()|>
      st_as_sf()

# Now, we'll plot the standardized temperature raster, along with the occurrence records and the range map 

  ggplot(env)+
    geom_raster(mapping = aes(x=x,y=y,fill=wc2.1_10m_bio_1))+
    scale_fill_viridis_c(name="Temp. C", na.value = "transparent")+
    scale_x_continuous(expand=c(0,0),
                       limits = c(sample_points_bbox[1],
                                  sample_points_bbox[3]))+
      scale_y_continuous(limits = c(sample_points_bbox[2],
                                    sample_points_bbox[4]),
                       expand=c(0,0))+
    theme_bw()+
    geom_sf(data = sample_points_kde_kde_polygon,
            fill = "grey",
            size=2,
            alpha=0.5)+
      geom_point(data = sample_points,
               mapping = aes(x=longitude,y=latitude))




## -----------------------------------------------------------------------------

# We'll rely on the same data as last time for simplicity.
# Since we're using different methods for estimating the presence and background distributions, we need to specify these separately:

sample_points_gaussian_kde <- fit_plug_and_play(presence = sample_points_presence$env,
                                     background = sample_points_bg$env,
                                     presence_method = "gaussian",
                                     background_method = "kde")

sample_points_gaussian_kde_predictions <- project_plug_and_play(pnp_model = sample_points_gaussian_kde,
                                                data = sample_points_bg$env)

# Now, we again convert everything to a raster and then to a polygon

  sample_points_gaussian_kde_raster <- env[[1]]

  values(sample_points_gaussian_kde_raster) <- NA

  sample_points_gaussian_kde_raster[sample_points_bg$bg_cells] <-  sample_points_gaussian_kde_predictions

# Now, we can plot our raster
  
  plot(sample_points_gaussian_kde_raster,
       xlim = c(sample_points_bbox[1],sample_points_bbox[3]),
       ylim = c(sample_points_bbox[2],sample_points_bbox[4]))
  points(sample_points[c("longitude","latitude")])
  

## ----thresholding gk----------------------------------------------------------
# To threshold this continuous raster to yield a binary raster

  sample_points_gaussian_kde_raster <- sdm_threshold(prediction_raster =
                                                       sample_points_gaussian_kde_raster,
                                     occurrence_sf = sample_points_presence$occurrence_sf,
                                     quantile = 0.05,
                                     return_binary = T)


# As before, we'll plot this on top of temperature and occurrence records to see how well we did


# Convert the raster to a polygon for visualization

  # convert to polygon for easier visualization
  
    sample_points_gaussian_kde_polygon <-
      sample_points_gaussian_kde_raster |>
      as.polygons()|>
      st_as_sf()

# Now, we'll plot the standardized temperature raster, along with the occurrence records and the range map 

  ggplot(env)+
    geom_raster(mapping = aes(x = x, y = y, fill = wc2.1_10m_bio_1))+
    scale_fill_viridis_c(name="Temp. C", na.value = "transparent")+
    scale_x_continuous(expand=c(0,0),
                       limits = c(sample_points_bbox[1],
                                  sample_points_bbox[3]))+
      scale_y_continuous(limits = c(sample_points_bbox[2],
                                    sample_points_bbox[4]),
                       expand=c(0,0))+
    theme_bw()+
    geom_sf(data = sample_points_gaussian_kde_polygon,
            fill = "grey",
            size=2,
            alpha=0.5)+
      geom_point(data = sample_points,
               mapping = aes(x=longitude,y=latitude))



## ----maxnet-------------------------------------------------------------------


  sample_points_maxnet <-
  fit_density_ratio(presence = sample_points_presence$env,
                    background = sample_points_bg$env,
                    method = "maxnet")
  
  
  sample_points_maxnet_predictions <-
    project_density_ratio(dr_model = sample_points_maxnet,
                          data = sample_points_bg$env)


#Now, we again convert everything to a raster and then to a polygon
  
  sample_points_maxnet_raster <- env[[1]]

  values(sample_points_maxnet_raster) <- NA

  sample_points_maxnet_raster[sample_points_bg$bg_cells] <-
    sample_points_maxnet_predictions


#Now, we can plot our raster
  
    plot(sample_points_maxnet_raster,
       xlim = c(sample_points_bbox[1],
                sample_points_bbox[3]),
       ylim = c(sample_points_bbox[2],
                sample_points_bbox[4]))
  points(sample_points[c("longitude","latitude")])





## ----ulsif--------------------------------------------------------------------


  sample_points_ulsif <-
  fit_density_ratio(presence = sample_points_presence$env,
                    background = sample_points_bg$env,
                    method = "ulsif")
  
  
  sample_points_ulsif_predictions <-
    project_density_ratio(dr_model = sample_points_ulsif,
                          data = sample_points_bg$env)


#Now, we again convert everything to a raster and then to a polygon
  
  sample_points_ulsif_raster <- env[[1]]

  values(sample_points_ulsif_raster) <- NA

  sample_points_ulsif_raster[sample_points_bg$bg_cells] <-
    sample_points_ulsif_predictions


#Now, we can plot our raster
  
    plot(sample_points_ulsif_raster,
       xlim = c(sample_points_bbox[1],
                sample_points_bbox[3]),
       ylim = c(sample_points_bbox[2],
                sample_points_bbox[4]))
  points(sample_points[c("longitude","latitude")])


## ----model evaluation---------------------------------------------------------

sample_points_gaussian_gaussian_fit <-
  evaluate_range_map(occurrences = sample_points[c("longitude","latitude")],
                     env = env,
                     presence_method = "gaussian",
                     background_method = "gaussian")


#Rather than looking at all of the results, we'll focus on just a few:

sample_points_gaussian_gaussian_fit$fold_results[c('testing_AUC','testing_sensitivity','testing_specificity')]

#The AUC gives us an overall idea of the discriminatory ability of the model, while the sensitivity and specificity tell us how well it discriminates presence vs. background points (respectively).