knitr::opts_chunk$set(echo = TRUE)

Setting Up the Package

First we must install the R package before we can use it.

library(devtools)
devtools::install_github('kaiyaprovost/subsppLabelR',force=F)
library(subsppLabelR)

Let us download some data. We will use Phainopepla nitens as a test species: it has two subspecies that are relatively allopatric. Although the species has hundreds of thousands of points, for now we will only download 2000.

spp = "Phainopepla nitens"
subsppList = c("lepida","nitens")
pointLimit=2000
dbToQuery=c("gbif")

print("Downloading species occurrences")
listFromSubspeciesOcc = subsppLabelR::subspeciesOccQuery(
  spp = spp,
  subsppList = subsppList,
  pointLimit = pointLimit,
  dbToQuery = dbToQuery
)

Now we have returned an object with 2000 occurrences that are not labeled, ~1700 that are labeled as lepida, and ~70 labeled as nitens. Let's do the labeling procedure.

## label the data by subspeces
print("Labeling data by subspecies")
labeledLoc = subsppLabelR::labelSubspecies(subsppOccList = listFromSubspeciesOcc)
labeledLoc = labeledLoc[!(is.na(labeledLoc$longitude)),]
labeledLoc = labeledLoc[!(is.na(labeledLoc$latitude)),]

head(labeledLoc)

This has added a column to the data with the occurrences labeled by subspecies. Lets plot these data. First though, we need a raster layer to plot against. We can either import a raster as "bgLayer", or we can create one from the Phainopepla data. For this example we will create a raster layer. We will then plot the data.

max_long = max(labeledLoc$longitude,na.rm=T)
min_long = min(labeledLoc$longitude,na.rm=T)
max_lat = max(labeledLoc$latitude,na.rm=T)
min_lat = min(labeledLoc$latitude,na.rm=T)
palette(c("red","blue","black"))

ext = raster::extent(c(min_long,max_long,min_lat,max_lat))
bgLayer = raster::raster(ext=ext,nrow=100,ncol=100,vals=0)

subsppNames = unique(labeledLoc$subspecies)

raster::plot(bgLayer,
             col = "grey",
             colNA = "darkgrey",
             main = spp)
points(labeledLoc$longitude,
       labeledLoc$latitude,
       col = as.numeric(as.factor(labeledLoc$subspecies)),
       pch = as.numeric(as.factor(labeledLoc$subspecies)))
legend(
  "top",
  legend = as.factor(unique(labeledLoc$subspecies)),
  bty = "n",
  col = as.numeric(as.factor(unique(labeledLoc$subspecies))),
  pch = as.numeric(as.factor(unique(labeledLoc$subspecies)))
)

As you can see with this example, there is an outlier that is far away from the rest of the distribution. Otherwise, the red and blue dots (which are labeled subspecies) appear to roughly correspond with the known distributions.

Lets do anomaly detection now to remove the outliers and then update the raster.

print("Starting anomaly detection for whole species")

purged_list = c()
kept_list = c()
list_of_anomalies = c()
epsilon = 1e-6

for (i in 0:length(c(subsppNames))) {
  if (i == 0) {
    #print("full")
    #print(nrow(labeledLoc))
    detectedLocs = subsppLabelR::detectSpatialOutliers(localities = labeledLoc, epsilon = epsilon)
  }
  else {
    name = subsppNames[[i]]
    if (name != "unknown") {
      #print(name)
      subset = labeledLoc[labeledLoc$subspecies == name,]
      #print(nrow(subset))
      detectedLocs = subsppLabelR::detectSpatialOutliers(localities = subset, epsilon = epsilon)
    }
  }
  purged = detectedLocs[[1]]
  anomalies = detectedLocs[[2]]
  kept = detectedLocs[[3]] ## DO NOT KEEP THIS
  #print(length(anomalies))

  purged_list = rbind(purged_list, purged)
  list_of_anomalies = c(list_of_anomalies, anomalies)
  kept_list = rbind(kept_list, kept)
}
rows_purged = sort(unique(as.integer(rownames(purged_list))))
head(rows_purged)

We have identified some rows to remove as anomalies for the whole species. Now let us remove them and plot where they are, color coded by subspecies.

print(paste(
  "Removing",
  length(rows_purged),
  "of",
  length(labeledLoc[, 1]),
  "detected anomalies"
))
removed = labeledLoc[(rows_purged),]
labeledLoc = labeledLoc[-(rows_purged),]

raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = paste("Anomalies")
)
points(labeledLoc$longitude,
       labeledLoc$latitude,
       col = "lightgrey",
       pch = 0)
points(removed$longitude,
       removed$latitude,
       col = as.factor(removed$subspecies))
legend(
  "top",
  legend = as.factor(unique(removed$subspecies)),
  pch = 1,
  bty = "n",
  col = as.factor(unique(removed$subspecies))
)

These six outliers come from three localities it seems.

We also will do a quick clean up to make sure that none of the subspecies are now only represented by a single individual, and also to alter the raster file so that it is no longer so wide.

## removing single individual subspecies
print("Removing single-individual subspecies")

for (sub in unique(labeledLoc$subspecies)) {
  #print(sub)
  rows = (nrow(labeledLoc[labeledLoc$subspecies == sub,]))
  if (rows <= 1) {
    print(sub)
    labeledLoc = labeledLoc[labeledLoc$subspecies != sub,]
  }
}
subsppNames = unique(labeledLoc$subspecies)

max_long = max(labeledLoc$longitude,na.rm=T)
min_long = min(labeledLoc$longitude,na.rm=T)
max_lat = max(labeledLoc$latitude,na.rm=T)
min_lat = min(labeledLoc$latitude,na.rm=T)
palette(c("red","blue","black"))

ext = raster::extent(c(min_long,max_long,min_lat,max_lat))
bgLayer = raster::raster(ext=ext,nrow=100,ncol=100,vals=0)
raster::plot(bgLayer,
             col = "grey",
             colNA = "darkgrey",
             main = spp)
points(labeledLoc$longitude,
       labeledLoc$latitude,
       col = as.numeric(as.factor(labeledLoc$subspecies)),
       pch = as.numeric(as.factor(labeledLoc$subspecies)))
legend(
  "topright",
  legend = as.factor(unique(labeledLoc$subspecies)),
  bty = "n",
  col = as.numeric(as.factor(unique(labeledLoc$subspecies))),
  pch = as.numeric(as.factor(unique(labeledLoc$subspecies)))
)

Now let us plot the density maps for the species. We will only retain the top 5% most dense points (the 95th percentile) for each subspecies.

xmax = max_long 
xmin = min_long 
ymax = max_lat
ymin = min_lat
quantile=0.95

#total_range = bgLayer
#raster::values(total_range)[!is.na(raster::values(bgLayer))] = NA

print("Building species kernel density maps")

densityRasters = lapply(subsppNames, function(subspp) {
  print(subspp)
  #subspp="relicta"
  locs = labeledLoc[labeledLoc$subspecies == subspp,]
  #print(head(locs))
  dens = subsppLabelR::subspeciesDensityMap(
    localities = locs,
    quantile = quantile,
    xmin = xmin,
    xmax = xmax,
    ymin = ymin,
    ymax = ymax,
    total_range= bgLayer
  )
  if (is.null(dens)) {
    dens = NA
  }

  if ((length((raster::unique(dens,na.last=NA)))) == 0) {
    dens = NA
  }

  names(dens) = subspp
  return(dens)
})
#print("done density")
names(densityRasters) = subsppNames

## remove failed ones
densityRasters = densityRasters[!(is.na(densityRasters))]
subsppNames = names(densityRasters)

Lets plot these data.

par(mfrow=c(1,3))
for (i in 1:length(densityRasters)) {
  print(i)
  name = names(densityRasters)[[i]]
  raster::plot(
    bgLayer,
    col = "grey",
    colNA = "darkgrey",
    legend=F,
    main = paste("Density, subspp:", name)
  )
  raster::plot(densityRasters[[i]],
               add = T,
               col = viridis::viridis(99))
}

And for fun let's look at how the data vary when we change the density we retain, from 5% to 10% (the 90th percentile)

quantile=0.90

print("Building species kernel density maps -- top 10 %")

densityRasters_10 = lapply(subsppNames, function(subspp) {
  print(subspp)
  locs = labeledLoc[labeledLoc$subspecies == subspp,]
  dens = subsppLabelR::subspeciesDensityMap(
    localities = locs,
    quantile = quantile,
    xmin = xmin,
    xmax = xmax,
    ymin = ymin,
    ymax = ymax,
    total_range= bgLayer
  )
  if (is.null(dens)) {
    dens = NA
  }

  if ((length((raster::unique(dens,na.last=NA)))) == 0) {
    dens = NA
  }

  names(dens) = subspp
  return(dens)
})
#print("done density")
names(densityRasters_10) = subsppNames

## remove failed ones
densityRasters_10 = densityRasters_10[!(is.na(densityRasters_10))]
subsppNames = names(densityRasters_10)


par(mfrow=c(2,3))
for (i in 1:length(densityRasters)) {
  print(i)
  name = names(densityRasters)[[i]]
  raster::plot(
    bgLayer,
    col = "grey",
    colNA = "darkgrey",
    legend=F,
    main = paste("Density for 5%, subspp:", name)
  )
  raster::plot(densityRasters[[i]],
               add = T,
               col = viridis::viridis(99))
}
for (i in 1:length(densityRasters_10)) {
  print(i)
  name = names(densityRasters_10)[[i]]
  raster::plot(
    bgLayer,
    col = "grey",
    colNA = "darkgrey",
    legend=F,
    main = paste("Density for 10%, subspp:", name)
  )
  raster::plot(densityRasters_10[[i]],
               add = T,
               col = viridis::viridis(99))
}

Now we can see where the bulk of the data are found for these ranges. Most of the unknown samples we downloaded appear to be in subspecies lepida. Comparing the top 5% and top 10% shows us how choosing cutoffs can impact our downstream analyses. We will continue for now with the top 5%.

Next let us convert these maps to polygons.

## convert to polygons
print("Converting density maps to polygons")
## can't handle if there's no data in previous step

densityPolygons = lapply(densityRasters, function(dens) {
  #print(names(dens))
  densPol = subsppLabelR::densityMapToPolygons(densityMap = dens)
  return(densPol)
})

raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = paste("Polygons, untrimmed", name)
)
for (i in 1:length(densityPolygons)) {
  raster::plot(
    densityPolygons[[i]],
    add = T,
    border = i,
    lwd = ((3 * i) / 3)
  )
}
legend(
  "top",
  legend = names(densityPolygons),
  bty = "n",
  fill = rgb(0, 0, 0, 0),
  border = 1:length(densityPolygons)
)

We must now check the polygons for overlaps.

## check overlaps between polygons

print("Checking Overlaps of Polygons and Removing Overlaps")

densityPolygons_trim1 = subsppLabelR::polygonTrimmer(polygonList = densityPolygons, namesList = subsppNames)

for (i in 1:length(densityPolygons_trim1)) {
  name = names(densityPolygons_trim1)[[i]]
  raster::plot(
    bgLayer,
    col = "grey",
    colNA = "darkgrey",
    main = paste("Polygon, subspp:", name)
  )
  raster::plot(densityPolygons_trim1[[i]],
               add = T,
               col = viridis::viridis(99))
}

raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = paste("Polygon, subspp:", name)
)
for (i in 1:length(densityPolygons_trim1)) {
  name = names(densityPolygons_trim1)[[i]]
  raster::plot(
    densityPolygons_trim1[[i]],
    add = T,
    border = i,
    lwd = ((3 * i) / 3)
  )
}
legend(
  "top",
  legend = names(densityPolygons_trim1),
  bty = "n",
  fill = rgb(0, 0, 0, 0),
  border = 1:length(densityPolygons_trim1)
)

These polygons look good now, considering we are missing a lot of data from the center of the distribution. Now lets locate the unknown points relative to those polygons and do some clean up.

densityPolygons_trim = densityPolygons_trim1

print("Locating points relative to polygons")

polyLocations = labeledLoc

for (slotA in 1:length(subsppNames)) {
  for (slotB in 1:length(subsppNames)) {
    if (subsppNames[[slotA]] != "unknown" &&
        subsppNames[[slotB]] != "unknown" && slotA != slotB) {
      #print(paste(slotA,slotB,sep=" "))
      polyLocations = subsppLabelR::locatePolygonPoints(
        test_points = polyLocations,
        polygonA = densityPolygons_trim[[slotA]],
        polygonB = densityPolygons_trim[[slotB]],
        nameA = subsppNames[[slotA]],
        nameB = subsppNames[[slotB]],
        setcoord = T
      )

    }

  }

}

print ("Cleaning up duplicate columns")
colsToDelete = c()

#print(polyLocations)

for (colNumA in 5:length(colnames(polyLocations))) {
  for (colNumB in 6:length(colnames(polyLocations))) {
    if (colNumA < colNumB) {
      #print(paste("compare",colNumA,colNumB,sep=" "))
      if (identical(polyLocations[[colNumA]], polyLocations[[colNumB]])) {
        #print("identical, deleting")
        colsToDelete = c(colsToDelete, colNumB)
      }
    }
  }
}
if (!(is.null(colsToDelete))) {
  polyLocations = polyLocations[,-colsToDelete]
}

head(polyLocations)

Now we will label the occurrences based on those polygons and flag anything that looks suspect.

print("Matching subspecies")
checked = subsppLabelR::subspeciesMatchChecker(locfile = polyLocations, subsppNames =
                                                 subsppNames)
loc_suspect = checked$suspect
loc_good = checked$good
pol = densityPolygons_trim

And the final output:

par(mfrow=c(1,1))

raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = "Unknown or Suspect Poimts"
)
points(loc_suspect$longitude,loc_suspect$latitude)
raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = "Good Points by Subspecies"
)
points(loc_good$longitude,loc_good$latitude,col=as.numeric(as.factor(loc_good$assigned)))

raster::plot(
  bgLayer,
  col = "grey",
  colNA = "darkgrey",
  main = "Final Polygons by Subspecies"
)
lapply(1:length(pol),FUN=function(i){
  raster::plot(pol[[i]],add=T,border=as.numeric(as.factor(names(pol)))[i])
})

Putting this all together in one function looks like this:

db = subsppLabelR::databaseToAssignedSubspecies(spp="Phainopepla nitens",
                                                subsppList = c("lepida","nitens"),
                                                pointLimit=2000,
                                                dbToQuery=c("gbif"),
                                                quantile=0.95,
                                                xmin=-125,
                                                xmax=-60,
                                                ymin=10,
                                                ymax=50,
                                                plotIt=T, bgLayer=raster::raster(ext=raster::extent(c(xmin,xmax,ymin,ymax)),nrow=100,ncol=100,vals=0),
                                                outputDir="~/")
print(head(db$loc_suspect))
print(head(db$loc_good))
print(head(db$pol))

Note that when you run this script, it will automatically generate the images as PNG files in the directory specified in the function. Here that is the home directory ("~/").



kaiyaprovost/subsppLabelR documentation built on March 17, 2024, 5:09 p.m.