knitr::opts_chunk$set(echo=TRUE)
knitr::opts_chunk$set(message=FALSE)
knitr::opts_chunk$set(cache=TRUE)

Preliminaries

# Make sure we're in the right place
if (!file.exists("rasvec.Rmd")) stop("This demo must be run from the same directory as the file")

Species data

The R dismo package contains facilities for downloading biodiversity data and constructing species distribution models. This will illustrate pulling data from raster layers using vector (points in this case) locations.

Let's try pulling occurrence records for the Golden-Cheeked Warbler, a threatened bird that breeds near Austin.

pacman::p_load("dismo")
pacman::p_load("tigris")
pacman::p_load("sp")
pacman::p_load("rgeos")

# Since this is slow, I saved the result and reload here
# gcwa = gbif("Setophaga", "chrysoparia", removeZeros = TRUE)
gcwa = readRDS("gcwa.rds")

# Grab the boundary of Texas from the US Census
states = tigris::states(TRUE)
texas = subset(states, NAME == 'Texas')

epsg_3083 = "+proj=aea +lat_1=27.5 +lat_2=35 +lat_0=18 +lon_0=-100 +x_0=1500000 +y_0=6000000 +ellps=GRS80 +datum=NAD83 +units=m +no_defs"
epsg_4326 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs "

# Transform the boundary and simplify for rgeos
texas = sp::spTransform(texas, CRS(epsg_3083))
texas = gSimplify(texas, 10000)     # simplify to 10km scale -- rgeos is very slow
texas = sp::spTransform(texas, CRS(epsg_4326))

Let's plot it and convert the coordinates to sp class.

# Construct the sp object
xy = cbind(gcwa$lon, gcwa$lat)
xy = xy[is.finite(xy[,1]) & is.finite(xy[, 2]), ]
gcwa.sp = SpatialPoints(xy, CRS(epsg_4326))

# Now plot
tigris::plot(texas)
points(gcwa.sp, pch = 19, col = "steelblue")

Let's get rid of the points outside of Texas

pacman::p_load("rgeos")

i = gWithin(gcwa.sp, texas, byid = TRUE)  # Drop points outside
gcwa.sp = gcwa.sp[i, ]

plot(texas); points(gcwa.sp, pch = 19, col = "steelblue")

We can pull down worldclim data and use for modeling.

pacman::p_load("raster")

if (!file.exists("bioclim_texas.tif")) # Just to avoid redoing this
{
  # We only want the worldclim tiles that covers texas
  wcdat1 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5,
                   lon = bbox(texas)[1], lat = bbox(texas)[2])
  wcdat1 = crop(wcdat1, texas)

  # Need to get the adjacent worldclim tiles
  wcdat2 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5,
                   lon = bbox(texas)[1], lat = bbox(texas)[4])
  wcdat2 = crop(wcdat2, texas)

  wcdat3 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5,
                   lon = bbox(texas)[3], lat = bbox(texas)[2])
  wcdat3 = crop(wcdat3, texas)

  wcdat4 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5,
                   lon = bbox(texas)[3], lat = bbox(texas)[4])
  wcdat4 = crop(wcdat4, texas)

  # Merge them together (and crop just to be sure -- not really needed)
  wcdat = crop(merge(wcdat1, wcdat2, wcdat3, wcdat4), texas)

  # Save the result
  writeRaster(wcdat, "bioclim_texas", "GTiff")
}

Now let's get the data out. This is a bit slow.

# Read the data from local file
# Avoids re-downloading
wcdat = brick("bioclim_texas.tif")

bioclim.df = extract(wcdat, gcwa.sp, df = TRUE)

# What we have now are the values of the raster layers at each of the observation points
head(bioclim.df)

# bind the points to the bioclim data
gcwa_bioclim.spdf = SpatialPointsDataFrame(gcwa.sp, bioclim.df)
gcwa_bioclim.spdf = remove.duplicates(gcwa_bioclim.spdf)

Let's get the elevation layer for plotting.

# plot points over the bioclim data
# layer one is elevation
elev = subset(wcdat, 1)
elev = mask(elev, texas)

Now we can plot

plot(elev, xaxs = "r", yaxs = "r")
lines(texas)
points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")

With these data, we can build a distribution model that predicts the occurrence of the species in places outside of the training set. We need to generate a collection of pseudo-absences. I'll use a buffer to separate them from the known locations.

gcwa_buffer = gBuffer(gcwa_bioclim.spdf, width = 0.2) # This is crazy slow in rgeos

This shows the points with the buffer. Its better to buffer in a Cartesian coordinate system, but this is sufficient for our use.

plot(elev, xaxs = "r", yaxs = "r")
lines(texas); lines(gcwa_buffer)
points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")

Now we get a bunch of random locations not too near the warbler locations. We create a polygon that includes the area of Texas but excludes our buffer areas.

not_gcwa = gDifference(texas, gcwa_buffer)
rpts = spsample(not_gcwa, nrow(gcwa_bioclim.spdf), "random")
plot(texas); points(rpts, pch = 19, cex = 0.2)
points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")

Now we get the bioclim variables for each of our random points.

background.df = extract(wcdat, rpts, df = TRUE)
background_bioclim.spdf = SpatialPointsDataFrame(rpts, background.df)
background_bioclim.spdf = remove.duplicates(background_bioclim.spdf)

It will help to put more meaningful names on the columns.

names(background_bioclim.spdf) = make.names(c("ID", "Annual Mean Temperature", "Mean Diurnal Range", "Isothermality", "Temperature Seasonality", "Max Temperature of Warmest Period", "Min Temperature of Coldest Period", "Temperature Annual Range", "Mean Temperature of Wettest Quarter", "Mean Temperature of Driest Quarter", "Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter", "Annual Precipitation", "Precipitation of Wettest Period", "Precipitation of Driest Period", "Precipitation Seasonality", "Precipitation of Wettest Quarter", "Precipitation of Driest Quarter", "Precipitation of Warmest Quarter", "Precipitation of Coldest Quarter"))
names(gcwa_bioclim.spdf) = names(background_bioclim.spdf)

For modeling, we will combine the warbler locations with the random location and code the warbler locations with a 1 and the random locations with a 0. We call this new variable "Indic" for indicator of presence. We also drop the old ID column.

dismod.df = rbind(cbind(Indic = 1, gcwa_bioclim.spdf@data[, -1]),
                  cbind(Indic = 0, background_bioclim.spdf@data[, -1]))

Now we are ready to model! There are many many methods for this problem. We could choose for example a generalized linear model (logistic regression) or a machine learning algorithm like random forests (TM).

First, we'll try logistic regression.

mod1 = glm(Indic ~ ., data = dismod.df, family = binomial())
summary(mod1)
names(wcdat) = names(gcwa_bioclim.spdf)[-1]
all_texas = as.data.frame(wcdat)  # big!!!
mod1.pred = predict(mod1, all_texas, "response")  # This takes awhile to run
md1pre.rast = rasterFromXYZ(cbind(coordinates(wcdat), mod1.pred))
plot(md1pre.rast)
lines(texas)
points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")

Let's try a support vector machine algorithm. This is a particularly powerful way to classify data.

# pacman::p_load(kernlab)
# dismod.df$Indic = as.factor(dismod.df$Indic)
# mod1 = ksvm(Indic ~ ., data = dismod.df)
# print(mod1)

It would be nice to output probabilities rather than a predicted classification, but at least on my workstation, that used too much memory.

#all_texas = as.data.frame(wcdat)  # big!!!
#mod1.pred = predict(mod1, all_texas)  # This takes awhile to run


thk686/keitt.ssi.2019 documentation built on June 28, 2019, 1:28 p.m.