# Author: Kyle Taylor <kyle.taylor@pljv.org>
# Year : 2017
# Description : various tasks related to formatting and parsing raw IMBCR data
# into Spatial* and Unmarked* objects.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Define some useful local functions for manipulating IMBCR data
#
#' hidden function that greps for the bird (four-letter) code field name
birdcode_fieldname <- function(df=NULL){
return(names(df)[grepl(tolower(names(df)),pattern="^bird")])
}
#' hidden function that greps for the common name field name
commonname_fieldname <- function(df=NULL){
return(names(df)[grepl(tolower(names(df)),pattern="^c.*.[.]n.*.")])
}
#' hidden function that greps for the distance field name
distance_fieldname <- function(df=NULL){
return(names(df)[grepl(tolower(names(df)),pattern="^rad")])
}
#' hidden function that greps for the transect field name
transect_fieldname <- function(df=NULL){
return(names(df)[grepl(tolower(names(df)),pattern="^tran")])
}
#' hidden function that greps for the timeperiod field name
timeperiod_fieldname <- function(df=NULL){
return(names(df)[grepl(tolower(names(df)),pattern="^tim")])
}
#' built-in (hidden) function that will accept the full path of a shapefile
#' and parse the string into something that rgdal can understand (DSN + Layer).
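#' A minimal usage sketch (the shapefile path below is hypothetical):
#' @examples
#' \dontrun{
#' parts <- OpenIMBCR:::parseLayerDsn("/some/dir/transects.shp")
#' layer <- parts[1] # "transects"
#' dsn   <- parts[2] # "/some/dir"
#' }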
parseLayerDsn <- function(x=NULL){
path <- unlist(strsplit(x, split="/"))
layer <- gsub(path[length(path)],pattern="\\.shp$",replacement="")
dsn <- paste(path[1:(length(path)-1)],collapse="/")
return(c(layer,dsn))
}
#' built-in (hidden) function that accepts the full path of a shapefile and reads it using rgdal::readOGR
#' @param path argument provides the full path to an ESRI Shapefile
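#' A minimal usage sketch (the path below is hypothetical; assumes rgdal is installed):
#' @examples
#' \dontrun{
#' transects <- OpenIMBCR:::readOGRfromPath("/some/dir/transects.shp")
#' }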
readOGRfromPath <- function(path=NULL, verbose=F){
path <- OpenIMBCR:::parseLayerDsn(path)
layer <- path[1]
dsn <- path[2]
return(rgdal::readOGR(dsn,layer,verbose=verbose))
}
#' strip non-essential characters and spaces from a species common name in a field
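#' @examples
#' # e.g., strip_common_name("Baird's Sparrow") returns "bairdssparrow"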
strip_common_name <- function(x) tolower(gsub(x,pattern=" |'|-",replacement=""))
#' recursively find all files in folder (root) that match the pattern (name)
#' @export
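#' A hedged example (the search pattern and folder are hypothetical):
#' @examples
#' \dontrun{
#' csv_files <- recursive_find_file(name="IMBCR.*csv$", root="~/Projects")
#' }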
recursive_find_file <- function(name=NULL,root=Sys.getenv("HOME")){
if(is.null(name)){
return(NULL)
} else {
return(list.files(root,pattern=name,recursive=T,full.names=T))
}
}
#' parse a source CSV (published by BCR) for IMBCR data and return
#' as a SpatialPointsDataFrame to the user for post-processing or
#' conversion to unmarked data.frame objects for modeling
#' @export
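#' A minimal sketch, assuming "raw_imbcr_data.csv" is a hypothetical local copy
#' of the raw IMBCR table:
#' @examples
#' \dontrun{
#' s <- imbcr_df_to_spatial_points(filename="raw_imbcr_data.csv", write=FALSE)
#' }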
imbcr_df_to_spatial_points <- function(filename=NULL,outfile=NULL,
write=F, calc_spatial_covs=T){
if(is.null(outfile) && write && !is.character(filename)){
stop("cannot write shapefile to disk without an outfile= or filename.ext to parse")
}
# sanity-check to see if our output shapefile already exists
s <- recursive_find_file(outfile)[1]
if(!is.null(s) && !is.na(s)){
s <- OpenIMBCR:::readOGRfromPath(s)
# Parse ALL BCR raw data tables into a single table
} else {
if(inherits(filename,"data.frame")){
t <- filename
} else {
if(length(filename)==1){
t <- read.csv(filename)
# legacy support : rbind across multiple CSV files
} else {
t <- lapply(recursive_find_file(
name=filename
),
read.csv
)
t <- do.call(rbind, t)
}
}
names(t) <- tolower(names(t))
#
# iterate over each UTM zone in the table, creating SpatialPoints
# projected to a focal UTM. Then merge all of the zones together into
# a single shapefile with an arbitrary CRS.
#
s <- list()
for (zone in unique(na.omit(t$ptvisitzone))){
s[[length(s) + 1]] <- na.omit(t[t$ptvisitzone == zone,])
s[[length(s)]] <- sp::SpatialPointsDataFrame(
coords = data.frame(
x = s[[length(s)]]$ptvisiteasting,
y = s[[length(s)]]$ptvisitnorthing
),
data = s[[length(s)]],
proj4string = sp::CRS(raster::projection(
paste("+init=epsg:269", zone, sep = "")
))
)
# based on : http://gis.stackexchange.com/questions/155328/merging-multiple-spatialpolygondataframes-into-1-spdf-in-r
row.names(s[[length(s)]]) <-
paste(
letters[length(s)],
row.names(s[[length(s)]]),
sep="."
)
}
# merge our segments and convert to a consistent
# popular CRS
s <- do.call(
sp::rbind.SpatialPointsDataFrame,
lapply(
s,
FUN=sp::spTransform,
sp::CRS(raster::projection("+init=epsg:2163"))
))
s$FID <- 1:nrow(s)
# calculate spatial (lat/lon) covariates for each station?
if(calc_spatial_covs){
# calculate lat/lon covariates in WGS84
coords <- sp::spTransform(s, sp::CRS("+init=epsg:4326"))@coords
coords <- cbind(coords, coords^2, log10(coords+361))
colnames(coords) <- c("lon","lat","lon_2","lat_2","ln_lon","ln_lat")
s@data <- cbind(s@data,coords)
rm(coords)
}
# write to disk -- and allow some wiggle-room on filename conventions
if(write){
rgdal::writeOGR(
s,
".",
ifelse(is.null(outfile),gsub(filename,pattern="\\.csv$",replacement=""),outfile),
driver="ESRI Shapefile",
overwrite=T
)
}
}
return(s)
}
#' hidden function that will calculate the centroids of a USNG
#' (SpatialPolygons) and return only unique (non-duplicated)
#' units
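#' A minimal sketch, assuming 'usng_units' is a hypothetical SpatialPolygons
#' object projected in meters:
#' @examples
#' \dontrun{
#' unique_units <- OpenIMBCR:::drop_overlapping_units(units=usng_units)
#' }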
drop_overlapping_units <- function(units=NULL){
# warn if units don't appear to be projected in
# meters, which would make our rounding below
# inappropriate
if(!grepl(raster::projection(units), pattern="units=m")){
warning("units= polygons do not appear to be metric -- we might over-estimate potential overlap")
}
duplicated <- duplicated(round(
rgeos::gCentroid(units, byid=T)@coords
))
return(
units[!duplicated , ]
)
}
#' accepts a named raster stack of covariates, an IMBCR SpatialPointsDataFrame,
#' and a species common name and returns a formatted unmarked distance data.frame
#' that can be used for model fitting with unmarked.
#' @export
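#' A hedged sketch -- 'covs' (a named raster stack) and 'imbcr_pts' (output of
#' imbcr_df_to_spatial_points) are hypothetical inputs built elsewhere:
#' @examples
#' \dontrun{
#' # spp= should match the common.name field as it appears in your table
#' umdf <- build_unmarked_distance_df(
#'   r=covs,
#'   s=imbcr_pts,
#'   spp="ring-necked pheasant"
#' )
#' }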
build_unmarked_distance_df <- function(r=NULL, s=NULL, spp=NULL,
vars=c("doy","starttime"),
fun=mean,
d=c(0,100,200,300,400,500,600,700,800)){
# do our covariates in r=raster stack occur in our IMBCR data.frame object?
if(sum(names(r) %in% names(s@data))<raster::nlayers(r)){
s <- suppressWarnings(raster::extract(r,s,sp=T))
s$doy <- as.numeric(strftime(as.POSIXct(as.Date(as.character(s$date), "%m/%d/%Y")),format="%j")) # convert date -> doy
s@data <- s@data[,!grepl(names(s@data),pattern="FID")]
}
# kludging to select the covariates specified in vars= that we will aggregate
# and use at the transect level
if(!is.null(vars)){
vars <- append(names(r), vars)
} else {
vars <- names(r)
}
# parse our dataset for records of the focal species (spp=)
t <- s[s$common.name == spp,]@data
# build a distance table
distances <- data.frame(distance=t$radialdistance,transect=t$transectnum)
y <- unmarked::formatDistData(distances, distCol="distance",transectNameCol="transect",dist.breaks=d)
# build a target matrix
stateCovariates <- matrix(NA,ncol=length(vars),nrow=length(levels(t$transectnum))) # e.g., 300 total transects x n state covariates
rownames(stateCovariates) <- levels(t$transectnum)
# aggregate by field
for(i in 1:length(vars)){
stateCovariates[,i] <- aggregate(s@data[,vars[i]], by=list(Category=s@data$transectnum), FUN=fun, na.rm=T)[,2]
}
# specify covariate names
colnames(stateCovariates) <- vars
stateCovariates <- data.frame(stateCovariates) # unmarked expects this as a data.frame
# format our training data as umf
return(unmarked::unmarkedFrameDS(y=as.matrix(y), siteCovs=stateCovariates, survey="point", dist.breaks=d, unitsIn="m"))
}
#' kludging to back-fill any transect stations in an imbcr data.frame
#' that were sampled, but where a focal species wasn't observed, with
#' NA values.
#' @export
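#' A hedged sketch, assuming 'imbcr_pts' is a SpatialPointsDataFrame returned by
#' imbcr_df_to_spatial_points() (the species code follows the RNEP example used
#' elsewhere in this file):
#' @examples
#' \dontrun{
#' scrubbed <- scrub_imbcr_df(imbcr_pts, four_letter_code="RNEP")
#' }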
scrub_imbcr_df <- function(df,
allow_duplicate_timeperiods=F,
four_letter_code=NULL){
LARGE_CLUSTER_SIZE=100
# throw-out any lurking 88 values, count before start values, and
# -1 distance observations
df <- df[!df@data[, timeperiod_fieldname(df)] == 88, ]
df <- df[!df@data[, timeperiod_fieldname(df)] == -1, ]
df <- df[!df@data[, distance_fieldname(df)] == -1, ]
# build an empty (NA) data.frame for our species that we will populate with
# valid values iteratively
df_final <- unique(df@data[,c('transectnum', 'year', 'point')])
# add-in 6 minute periods for each station sampled
df_final <- df_final[ sort(rep(1:nrow(unique(df@data[,c('transectnum', 'year', 'point')])), 6)), ]
df_final <- cbind(df_final, data.frame(timeperiod=1:6))
# drop in NA values for our species of interest
df_final[, OpenIMBCR:::birdcode_fieldname(df)] <- four_letter_code
df_final$cl_count <- NA
df_final[ , OpenIMBCR:::distance_fieldname(df)] <- NA
# iterate over df_final, pulling matches for our species of interest as we go
columns_retained <- c('transectnum', 'year', 'point', 'timeperiod', 'birdcode', 'radialdistance', 'cl_count')
# prepare to parallelize our large df operation
cl <- parallel::makeCluster(LARGE_CLUSTER_SIZE)
parallel::clusterExport(
cl,
varlist=c("df", "df_final","columns_retained"),
envir=environment()
)
df_final <- do.call(rbind, parallel::parLapply(
cl=cl,
X=1:nrow(df_final),
fun=function(i){
query <- df_final[i, c('transectnum', 'year', 'point', 'timeperiod', 'birdcode')]
match <- merge(df@data, query, all=F)
if(nrow(match) == 0){
# return some sane defaults if there was no match
df_final[ i, 'radialdistance' ] <- NA
df_final[ i, 'cl_count' ] <- 0
return(df_final[i, columns_retained])
} else {
# the return here could have multiple rows in the same minute period
return(match[ , columns_retained ])
}
}
))
# clean-up
parallel::stopCluster(cl);
rm(cl);
# return to user
return(df_final)
}
#' accepts a formatted IMBCR SpatialPointsDataFrame and builds an
#' unmarkedFrameGDS data.frame that we can use for modeling with
#' the unmarked package.
#' @export
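#' A hedged sketch -- 'scrubbed' is a hypothetical data.frame returned by
#' scrub_imbcr_df(), and the covariate names passed to covs= are made up:
#' @examples
#' \dontrun{
#' umf <- build_unmarked_gds(
#'   df=scrubbed,
#'   distance_breaks=c(0,100,200,300,400,500,600,700,800),
#'   covs=c("grass_area","shrub_area")
#' )
#' }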
build_unmarked_gds <- function(df=NULL,
numPrimary=1,
distance_breaks=NULL,
covs=NULL,
unitsIn="m",
summary_fun=median,
drop_na_values=T
){
if(inherits(df, "Spatial")){
df <- df@data
}
# determine distance breaks / classes, if needed
if(is.null(distance_breaks)){
distance_breaks = df$distance_breaks
distance_classes = append(sort(as.numeric(unique(
df$dist_class))),
NA
)
} else {
distance_classes = append(1:(length(distance_breaks)-1), NA)
}
# parse our imbcr data.frame into transect-level summaries
# in a format that unmarked::gdistsamp can work with
transects <- unique(df[,transect_fieldname(df)])
# pool our transect-level observations
transects <- do.call(rbind,
lapply(
transects,
FUN=pool_by_transect_year,
df=df, breaks=distance_breaks,
covs=covs
)
)
# bug fix : drop entries with NA values before attempting PCA or quantile pruning
if(drop_na_values){
transects <- transects[ !as.vector(rowSums(is.na(transects)) > 0) , ]
transects <- transects[ ,!grepl(colnames(transects), pattern="_NA")]
}
# build our unmarked frame and return to user
return(unmarked::unmarkedFrameGDS(
# distance bins
y=transects[,grepl(names(transects),pattern="distance_")],
# covariates that vary at the site (transect) level
siteCovs=transects[,!grepl(colnames(transects),pattern="distance_")],
# not used (covariates at the site-year level)
yearlySiteCovs=NULL,
survey="point",
unitsIn=unitsIn,
dist.breaks=distance_breaks,
numPrimary=numPrimary # should be kept at 1 (no within-season visits)
))
}
#' hidden function used to clean up an unmarked data.frame (umdf) by dropping
#' any NA columns attributed by scrub_imbcr_df(), mean-centering (scaling) our site
#' covariates (but not sampling effort!), and doing some optional quantile filtering
#' that drops covariates with low variance -- a useful 'significance pruning'
#' precursor to principal components analysis. We prefer dropping the NA bin here
#' (rather than in scrub_imbcr_df) so that we retain an accurate account of total
#' sampling effort up to this point.
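#' A minimal sketch, assuming 'umf' is an unmarkedFrameGDS from build_unmarked_gds():
#' @examples
#' \dontrun{
#' umf <- OpenIMBCR:::scrub_unmarked_dataframe(umf, normalize=TRUE, prune_cutoff=0.05)
#' }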
scrub_unmarked_dataframe <- function(x=NULL, normalize=T, prune_cutoff=NULL){
row.names(x@y) <- NULL
row.names(x@siteCovs) <- NULL
x@y <- x@y[,!grepl(colnames(x@y), pattern="_NA")]
x@obsToY <- matrix(x@obsToY[,1:ncol(x@y)],nrow=1)
# do some quantile pruning of our input data, selectively dropping
# an arbitrary number of variables based on a user-specified
# low-variance threshold
if(!is.null(prune_cutoff)){
# e.g., what is the total variance for each cov across all sites?
# drop those standardized variables with < prune_cutoff=0.05 variance
effort_field <- if(any(grepl(colnames(x@siteCovs), pattern="effort"))){
"effort"
} else {
NULL
}
vars_to_scale <- colnames(x@siteCovs)[
!grepl(tolower(colnames(x@siteCovs)), pattern="effort")
]
# bug-fix : only try to prune numeric variables
is_numeric <- apply(
x@siteCovs[1,vars_to_scale],
MARGIN=2,
FUN=function(x) !is.na(suppressWarnings(as.numeric(x)))
)
if(length(vars_to_scale)!=sum(is_numeric)){
warning(paste(
"the following input variables are not numeric and cannot be",
"filtered by quantile and will not be pruned:",
paste(
vars_to_scale[!is_numeric],
collapse=", "
)
))
vars_to_scale <- vars_to_scale[is_numeric]
}
# calculate relative variance across all sites for each variable (column)
variance <- apply(
x@siteCovs[,vars_to_scale],
MARGIN=2,
FUN=function(x) ( (x - min(x)) / (max(x)-min(x)) ) # quick min-max normalize
)
# min-max will return NA on no variance (e.g., divide by zero)
variance[is.na(variance)] <- 0
variance <- apply(
variance,
MARGIN=2,
FUN=var
)
# drop variables that don't meet our a priori variance threshold
dropped <- as.vector(variance < quantile(variance, p=prune_cutoff))
if(sum(dropped)>0){
warning(paste(
"prune_cutoff dropped these variables due to very small variance: ",
paste(colnames(x@siteCovs[,vars_to_scale])[dropped], collapse=", "),
sep=""
))
keep <- unique(c(
names(is_numeric[!is_numeric]),
effort_field,
vars_to_scale[!dropped]
))
x@siteCovs <- x@siteCovs[, keep]
}
}
# normalize our site covariates?
if(normalize){
# don't try to normalize non-numeric values -- drop these as site covs
x@siteCovs <-
x@siteCovs[ , as.vector(unlist(lapply(x@siteCovs[1,], FUN=is.numeric)))]
# don't normalize the "effort" field
vars_to_scale <- colnames(x@siteCovs)[
!grepl(tolower(colnames(x@siteCovs)), pattern="effort")
]
# scaling call
x@siteCovs[,vars_to_scale] <- as.data.frame(
scale(x@siteCovs[,vars_to_scale])
)
# sanity check : do some variable pruning based on variance
# from our normalization step -- drop variables with low variance
# from consideration and report dropped variables to user
dropped <- as.vector(unlist(lapply(x@siteCovs[1,], FUN=is.na)))
if(sum(dropped)>0){
warning(paste(
"scale() dropped these variables due to very small variance: ",
paste(colnames(x@siteCovs)[dropped], collapse=", "),
sep=""
))
x@siteCovs <- x@siteCovs[,!dropped]
}
}
return(x)
}
#' will generate a uniform vector grid within a polygon. Typically
#' this is a 250 meter grid, but the size of each
#' grid cell is arbitrary. Will return units as SpatialPolygons.
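#' A minimal sketch, assuming 'usng_unit' is a single, metric-projected USNG
#' polygon (hypothetical object):
#' @examples
#' \dontrun{
#' stations <- OpenIMBCR:::polygon_to_fishnet_grid(usng_unit, res=250, centers=TRUE)
#' }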
polygon_to_fishnet_grid <- function(
usng_unit=NULL,
res=250,
x_offset=0,
y_offset=0,
clip=F,
centers=F
){
MIN_UNIT_SIZE = rgeos::gArea(usng_unit) * 0.03
ZIPPER_UNIT_SIZE = 600245.2 # ~60% of a full USNG unit
N_SUBUNITS = round(rgeos::gArea(usng_unit) / res^2)
# sanity check our unit size
if ( rgeos::gArea(usng_unit) < ZIPPER_UNIT_SIZE ){
warning("dropping zipper grid unit")
return(NA)
}
# helper that builds a 2-D rotation matrix for 'sf' geometries (see the commented-out rotation step below)
rotate <- function(a) matrix(c(cos(a), sin(a), -sin(a), cos(a)), 2, 2)
# get the bounding-box point coords for our unit -- the slot
# handling is sloppy here and may break in the future
ul <- which.max(usng_unit@polygons[[1]]@Polygons[[1]]@coords[,2])
ul <- usng_unit@polygons[[1]]@Polygons[[1]]@coords[ul,]
ur <- which.max(usng_unit@polygons[[1]]@Polygons[[1]]@coords[,1])
ur <- usng_unit@polygons[[1]]@Polygons[[1]]@coords[ur,]
ll <- which.min(usng_unit@polygons[[1]]@Polygons[[1]]@coords[,1])
ll <- usng_unit@polygons[[1]]@Polygons[[1]]@coords[ll,]
# dig up old Pythagoras and solve for the hypotenuse -- we will
# use this to solve for our angle of rotation
opp <- ul[2]-ur[2]
adj <- ur[1]-ul[1]
hyp <- sqrt(opp^2 + adj^2)
# here's the angle of rotation associated with each 250 m
# grid unit
theta <- asin(opp/hyp) * 57.29578 # convert radians-to-degrees
#print(theta)
# build-out our grid using per-unit specifications
grd <- try(sf::st_make_grid(
usng_unit,
n=c(sqrt(N_SUBUNITS),sqrt(N_SUBUNITS)),
what="polygons",
crs=sp::CRS(raster::projection(usng_unit)),
square=T
))
if( inherits(grd, "try-error") ){
warning(
"sf::st_make_grid failed to generate subunits from parent unit ",
"features -- returning NA"
)
return(NA)
}
if( length(grd) < N_SUBUNITS){
warning(
"sf::st_make_grid failed to generate enough subunits -- ",
"returning NA"
)
return(NA)
}
# if we aren't using the n= specification for st_make_grid,
# we'll need to rotate our units so they are spatially
# consistent with our parent unit
# grd_rot <- (grd - sf::st_centroid(sf::st_union(grd))) *
# rotate(theta * pi / 180) + sf::st_centroid(sf::st_union(grd))
# testing: the n= specification can accommodate our rotation for us?
grd_rot <- grd
grd_rot <- try(sf::as_Spatial(grd_rot))
if( inherits(grd_rot, 'try-error') ){
warning(
"sf::as_Spatial may have encountered some failed geometries ",
"in our generated subgrid -- here is a print-out of the geometries: ",
as.character(as.data.frame(grd))
)
return(NA)
}
# restore our projection to the adjusted grid
raster::projection(grd_rot) <- raster::projection(usng_unit)
# make sure we clip any boundaries for grid units so the grid is
# fully consistent with the larger polygon unit
if(clip){
grd_rot <- rgeos::gIntersection(
grd_rot,
usng_unit,
byid=T
)
# drop any slivers from our intersect operation
slivers <- sapply(
sp::split(grd_rot, seq_len(nrow(grd_rot))),
FUN=rgeos::gArea
) < MIN_UNIT_SIZE
grd_rot <- grd_rot[!slivers,]
}
# attribute station id's in a funky zig-zag pattern typically
# used with IMBCR sampling
if(centers){
grd_rot <- rgeos::gCentroid(
grd_rot,
byid=T
)
# iterate over rows (latitude bands) from north to south (decreasing latitude)
rows <- sort(unique(round(sp::coordinates(grd_rot)[,2])), decreasing=T)
coords_order <- vector()
for(row in rows){
coords <- round(sp::coordinates(grd_rot))
y_coords <- coords[,2]
# sort this row by decreasing longitude values
coords_order <- append(
coords_order,
names(sort(coords[y_coords == row , 1], decreasing=T))
)
}
grd_rot <- sp::SpatialPointsDataFrame(
sp::coordinates(grd_rot)[coords_order,],
data=data.frame(station=seq_len(N_SUBUNITS))
)
raster::projection(grd_rot) <- raster::projection(usng_unit)
}
# last sanity check
if( length(grd_rot) < N_SUBUNITS){
warning("failed to generate enough subunits for polygon features")
return(NA)
}
return(grd_rot)
}
#' generate a uniform fishnet grid for a variable polygon dataset containing
#' one or more geometries. Will fix the rotation of the grid using the bounding
#' box of the polygon dataset.
#' @export
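#' A hedged example -- 'usng_units' is a hypothetical SpatialPolygonsDataFrame
#' of 1-km sampling units:
#' @examples
#' \dontrun{
#' subgrids <- generate_fishnet_grid(units=usng_units, res=250)
#' }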
generate_fishnet_grid <- function(units=NULL, res=251){
units <- sp::split(units, 1:nrow(units))
e_cl <- parallel::makeCluster(parallel::detectCores()*0.75)
parallel::clusterExport(e_cl, varlist=c('res'), envir=environment())
units <- parallel::parLapply(
cl = e_cl,
X = units,
fun = function(unit){
return(
OpenIMBCR:::polygon_to_fishnet_grid(unit, res=res)
)
}
)
# clean-up our parallelized operation
parallel::stopCluster(e_cl); rm(e_cl);
# accept that some zipper units may produce NA values that we drop here
SUCCEEDED <- sapply(
units,
FUN=function(x) inherits(x, 'SpatialPolygons')
)
# drop the failures and return to user a single SpatialPolygonsDataFrame
units <- do.call(
sp::rbind.SpatialPolygons,
units[SUCCEEDED]
)
# force consistent naming of our polygon ID's for SpatialPolygonsDataFrame()
units@polygons <- lapply(
X=1:length(units@polygons),
function(i){ f <- units@polygons[[i]]; f@ID <- as.character(i); return(f) }
)
return(
sp::SpatialPolygonsDataFrame(units, data=data.frame(id=1:length(units@polygons)))
)
}