knitr::opts_chunk$set(echo=TRUE) knitr::opts_chunk$set(message=FALSE) knitr::opts_chunk$set(cache=TRUE)
# Make sure we're in the right place if (!file.exists("rasvec.Rmd")) stop("This demo must be run from the same directory as the file")
The R dismo package contains facilities for downloading biodiversity data and constructing species distribution models. This will illustrate pulling data from raster layers using vector (points in this case) locations.
Let's try pulling occurrence records for the Golden-Cheeked Warbler, a threatened bird that breeds near Austin.
pacman::p_load("dismo") pacman::p_load("tigris") pacman::p_load("sp") pacman::p_load("rgeos") # Since this is slow, I saved the result and reload here # gcwa = gbif("Setophaga", "chrysoparia", removeZeros = TRUE) gcwa = readRDS("gcwa.rds") # Grab the boundary of Texas from the US Census states = tigris::states(TRUE) texas = subset(states, NAME == 'Texas') epsg_3083 = "+proj=aea +lat_1=27.5 +lat_2=35 +lat_0=18 +lon_0=-100 +x_0=1500000 +y_0=6000000 +ellps=GRS80 +datum=NAD83 +units=m +no_defs" epsg_4326 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs " # Transform the boundary and simplify for rgeos texas = sp::spTransform(texas, CRS(epsg_3083)) texas = gSimplify(texas, 10000) # simplify to 10km scale -- rgeos is very slow texas = sp::spTransform(texas, CRS(epsg_4326))
Let's plot it and convert the coordinates to sp
class.
# Construct the sp object xy = cbind(gcwa$lon, gcwa$lat) xy = xy[is.finite(xy[,1]) & is.finite(xy[, 2]), ] gcwa.sp = SpatialPoints(xy, CRS(epsg_4326)) # Now plot tigris::plot(texas) points(gcwa.sp, pch = 19, col = "steelblue")
Let's get rid of the points outside of Texas
pacman::p_load("rgeos") i = gWithin(gcwa.sp, texas, byid = TRUE) # Drop points outside gcwa.sp = gcwa.sp[i, ] plot(texas); points(gcwa.sp, pch = 19, col = "steelblue")
We can pull down worldclim data and use for modeling.
pacman::p_load("raster") if (!file.exists("bioclim_texas.tif")) # Just to avoid redoing this { # We only want the worldclim tiles that covers texas wcdat1 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5, lon = bbox(texas)[1], lat = bbox(texas)[2]) wcdat1 = crop(wcdat1, texas) # Need to get the adjacent worldclim tiles wcdat2 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5, lon = bbox(texas)[1], lat = bbox(texas)[4]) wcdat2 = crop(wcdat2, texas) wcdat3 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5, lon = bbox(texas)[3], lat = bbox(texas)[2]) wcdat3 = crop(wcdat3, texas) wcdat4 = getData("worldclim", path = tempdir(), var = "bio", res = 0.5, lon = bbox(texas)[3], lat = bbox(texas)[4]) wcdat4 = crop(wcdat4, texas) # Merge them together (and crop just to be sure -- not really needed) wcdat = crop(merge(wcdat1, wcdat2, wcdat3, wcdat4), texas) # Save the result writeRaster(wcdat, "bioclim_texas", "GTiff") }
Now let's get the data out. This is a bit slow.
# Read the data from local file # Avoids re-downloading wcdat = brick("bioclim_texas.tif") bioclim.df = extract(wcdat, gcwa.sp, df = TRUE) # What we have now are the values of the raster layers at each of the observation points head(bioclim.df) # bind the points to the bioclim data gcwa_bioclim.spdf = SpatialPointsDataFrame(gcwa.sp, bioclim.df) gcwa_bioclim.spdf = remove.duplicates(gcwa_bioclim.spdf)
Let's get the elevation layer for plotting.
# plot points over the bioclim data # layer one is elevation elev = subset(wcdat, 1) elev = mask(elev, texas)
Now we can plot
plot(elev, xaxs = "r", yaxs = "r") lines(texas) points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")
With these data, we can build a distribution model that predicts the occurrence of the species in places outside of the training set. We need to generate a collection of pseudo-absences. I'll use a buffer to separate them from the known locations.
gcwa_buffer = gBuffer(gcwa_bioclim.spdf, width = 0.2) # This is crazy slow in rgeos
This shows the points with the buffer. Its better to buffer in a Cartesian coordinate system, but this is sufficient for our use.
plot(elev, xaxs = "r", yaxs = "r") lines(texas); lines(gcwa_buffer) points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")
Now we get a bunch of random locations not too near the warbler locations. We create a polygon that includes the area of Texas but excludes our buffer areas.
not_gcwa = gDifference(texas, gcwa_buffer) rpts = spsample(not_gcwa, nrow(gcwa_bioclim.spdf), "random") plot(texas); points(rpts, pch = 19, cex = 0.2) points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")
Now we get the bioclim variables for each of our random points.
background.df = extract(wcdat, rpts, df = TRUE) background_bioclim.spdf = SpatialPointsDataFrame(rpts, background.df) background_bioclim.spdf = remove.duplicates(background_bioclim.spdf)
It will help to put more meaningful names on the columns.
names(background_bioclim.spdf) = make.names(c("ID", "Annual Mean Temperature", "Mean Diurnal Range", "Isothermality", "Temperature Seasonality", "Max Temperature of Warmest Period", "Min Temperature of Coldest Period", "Temperature Annual Range", "Mean Temperature of Wettest Quarter", "Mean Temperature of Driest Quarter", "Mean Temperature of Warmest Quarter", "Mean Temperature of Coldest Quarter", "Annual Precipitation", "Precipitation of Wettest Period", "Precipitation of Driest Period", "Precipitation Seasonality", "Precipitation of Wettest Quarter", "Precipitation of Driest Quarter", "Precipitation of Warmest Quarter", "Precipitation of Coldest Quarter")) names(gcwa_bioclim.spdf) = names(background_bioclim.spdf)
For modeling, we will combine the warbler locations with the random location and code the warbler locations with a 1 and the random locations with a 0. We call this new variable "Indic" for indicator of presence. We also drop the old ID column.
dismod.df = rbind(cbind(Indic = 1, gcwa_bioclim.spdf@data[, -1]), cbind(Indic = 0, background_bioclim.spdf@data[, -1]))
Now we are ready to model! There are many many methods for this problem. We could choose for example a generalized linear model (logistic regression) or a machine learning algorithm like random forests (TM).
First, we'll try logistic regression.
mod1 = glm(Indic ~ ., data = dismod.df, family = binomial()) summary(mod1)
names(wcdat) = names(gcwa_bioclim.spdf)[-1] all_texas = as.data.frame(wcdat) # big!!! mod1.pred = predict(mod1, all_texas, "response") # This takes awhile to run md1pre.rast = rasterFromXYZ(cbind(coordinates(wcdat), mod1.pred))
plot(md1pre.rast) lines(texas) points(gcwa_bioclim.spdf, pch = 19, cex = 0.2, col = "red")
Let's try a support vector machine algorithm. This is a particularly powerful way to classify data.
# pacman::p_load(kernlab) # dismod.df$Indic = as.factor(dismod.df$Indic) # mod1 = ksvm(Indic ~ ., data = dismod.df) # print(mod1)
It would be nice to output probabilities rather than a predicted classification, but at least on my workstation, that used too much memory.
#all_texas = as.data.frame(wcdat) # big!!! #mod1.pred = predict(mod1, all_texas) # This takes awhile to run
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.