options(warn = -1, error=traceback)
require(OpenIMBCR)
require(glmulti)
require(unmarked)
AIC_RESCALE_CONST <- 100000
AIC_SUBSTANTIAL_THRESHOLD <- 8
CORRELATION_THRESHOLD <- 0.65
calc_transect_summary_detections <- function(s=NULL, name=NULL, field='est_abund'){
return(OpenIMBCR:::calc_route_centroids(
s = s,
four_letter_code = toupper(name),
use='est_abund'
))
}
quadratics_to_keep <- function(m){
quadratic_terms <- grepl(names(coefficients(m)), pattern = "[)]2")
# test : are we negative and are we a quadratic term?
keep <- (coefficients(m) < 0) * quadratic_terms
quadratic_terms <- names(coefficients(m))[keep == 1]
quadratic_terms <- gsub(quadratic_terms, pattern = "[)]2", replacement = ")")
# test: are we a positive linear term and a negative quadratic
direction_of_coeffs <- coefficients(m)/abs(coefficients(m))
steps <- seq(2, length(direction_of_coeffs), by = 2)
keep <- names(which(direction_of_coeffs[steps] + direction_of_coeffs[steps+1] == 0))
quadratic_terms <- gsub(keep, pattern = "[)]1", replacement = ")")
# test are both our linear and quadratic terms negative? drop if so
if (length(quadratic_terms) > 0){
return(quadratic_terms)
} else {
return(NULL)
}
}
pred_hn_det_from_distance <- function(x=NULL, dist=NULL){
param <- exp(coef(x, type = "det"))
return(as.vector(unmarked:::gxhn(x=dist, param)))
}
calc_emp_dispersion_statistic <- function(x = NULL, bs=999999){
observed <- sd(x)/round(mean(x))
predicted <- median(sapply(
X=bs,
FUN=function(i){
predicted <- rpois(n = length(x), round(mean(x)))
return( sd(predicted)/round(mean(predicted)) )
}))
return(round(observed/predicted, 2))
}
calc_intercept_statistics <- function(x=NULL){
load(x); # from an rdata file
PLJV_AREA = 646191.287039 # in square kilometers
return(data.frame(
spp=strsplit(r_data_file, split="_")[[1]][1],
mean_p_det=mean(per_obs_det_probabilities, na.rm=T),
upper_pred=max(unmarked::predict(intercept_m, type="state")[,1]),
lower_pred=min(unmarked::predict(intercept_m, type="state")[,1]),
mean_pred=mean(unmarked::predict(intercept_m, type="state")[,1]),
median_pred=median(unmarked::predict(intercept_m, type="state")[,1]),
max_detections=max(rowSums(detections$y)),
n_hat=mean(unmarked::predict(intercept_m, type="state")[,1])*PLJV_AREA
))
}
#' wrapper function for calc_intercept_statistics that will accept a
#' vector of rdata filenames and build a summary table of results for
#' all birds
calc_descriptive_statistics_rdata_files <- function(x=NULL){
cat(" -- processing:")
descriptive_statistics <- do.call(rbind, lapply(
X=r_data_files,
FUN=function(x){
cat(paste("[",which(r_data_files %in% x),"]",sep=""))
return(
calc_intercept_descriptive_statistics(x)
)
}
))
cat("\n")
write.csv(
descriptive_statistics,
"intercept_model_descriptive_statistics.csv",
row.names=F
)
}
refit_glm <- function(m, s=NULL, effort=NULL){
glm(formula=m$formula,
offset=log(effort),
family=poisson,
data=s@data)
}
#
# MAIN
#
# Read-in our IMBCR transect data
argv <- commandArgs(trailingOnly = T)
cat(" -- fitting a model for :", argv[2], "\n")
r_data_file <- tolower(paste(
tolower(argv[2]),
"_imbcr_pois_glm_workflow_",
gsub(format(Sys.time(), "%b %d %Y"), pattern = " ", replacement = "_"),
".rdata",
sep = ""
))
# Do we have lurking output in the CWD?
if (file.exists(r_data_file)) {
cat(" -- found existing rdata file:", r_data_file, "(skipping)\n");
stop();
}
s <- OpenIMBCR:::scrub_imbcr_df(
OpenIMBCR:::imbcrTableToShapefile(
"/global_workspace/imbcr_number_crunching/results/RawData_PLJV_IMBCR_20161201.csv"
),
four_letter_code = toupper(argv[2])
)
detections <- OpenIMBCR:::calc_dist_bins(s)
effort <- as.vector(OpenIMBCR:::calc_transect_effort(s))
if(sum(s$radialdistance>0, na.rm=T) < 180){
cat(" -- not enough detection to fit a model (quitting)\n")
quit("no")
}
# fit an intercept-only detection function in unmarked
umdf <- unmarked::unmarkedFrameDS(
y=as.matrix(detections$y),
siteCovs=data.frame(effort=effort),
dist.breaks=detections$breaks,
survey="point",
unitsIn="m"
)
intercept_m <- unmarked::distsamp(
formula = ~1 ~1+offset(log(effort)),
data = umdf,
se = T,
keyfun = "halfnorm",
unitsOut = "kmsq",
output = "abund"
)
per_obs_det_probabilities <- round(sapply(
s$radialdistance,
function(x) pred_hn_det_from_distance(intercept_m, dist=x)),
2
)
# calculate an estimate of abundance (accounting for p-det)
s$est_abund <- round(s$cl_count > 0 / per_obs_det_probabilities)
s <- calc_transect_summary_detections(
s=s,
name=toupper(argv[2]),
field='est_abund'
)
# keep a local copy around so we don't lose it while re-scaling
est_abund <- s$est_abund
# read-in habitat covariates
units <- OpenIMBCR:::readOGRfromPath(argv[1])
# spatial join of transect detection data with habitat covariates
# summarized by-unit
s <- OpenIMBCR:::spatial_join(s, units)
# add latitude and longitude
cat(" -- calculating spatial covariates\n")
coords <- as.data.frame(sp::spTransform(s,"+init=epsg:4326")@coords)
colnames(coords) <- c("lon","lat")
s@data <- cbind(s@data, coords)
coords <- as.data.frame(rgeos::gCentroid(sp::spTransform(units,"+init=epsg:4326"), byid=T)@coords)
colnames(coords) <- c("lon","lat")
units@data <- cbind(units@data, coords)
# define the covariates we are going to use in our analysis
vars <- c("grass_ar","shrub_ar","crp_ar","wetland_ar","pat_ct", "map", "mat")
#vars <- c("grass_ar","shrub_ar","crp_ar","wetland_ar","pat_ct", "lat", "lon")
#vars <- c("grass_ar","shrub_ar","crp_ar","wetland_ar","pat_ct")
# ensure a consistent scale for our input data (we will use this a lot)
s@data <- s@data[, vars]
s@data <- s@data[, sapply(s@data[1,], FUN=is.numeric)]
m_scale <- scale(s@data)
s@data <- as.data.frame(scale(s@data))
s$effort <- effort
# make sure the scale of the units we are predicting into is consistent
# with the training data
units@data <- as.data.frame(
scale(units@data[,vars], attr(m_scale, "scaled:center"), attr(m_scale, "scaled:scale"))
)
# tack-on a fake effort variable for predict()
units$effort <- median(effort)
# drop strongly-correlated variables
x_cor_matrix <- cor(s@data[,vars])
cor_threshold <- abs(x_cor_matrix)>CORRELATION_THRESHOLD
passing <- vector()
failing <- vector()
for(i in 1:ncol(cor_threshold)){
not_correlated <- sum(which(cor_threshold[,i])!=i) == 0
if(not_correlated){
passing <- append(passing, colnames(cor_threshold)[i])
} else {
failing <- append(failing, colnames(cor_threshold)[i])
}
}
rm(cor_threshold);
if(length(failing)>0){
cat("-- these variables dropped due to colinearity problems:", failing,"\n")
dev.new();
corrplot::corrplot(x_cor_matrix)
vars <- unlist(strsplit(readline("enter a comma-separated list of vars you want to keep:"), split=","))
} else {
vars <- passing
}
# fit our full model
m <- glm(
formula=paste(
"est_abund~",
paste(paste("poly(",vars,",2,raw=T)",sep=""), collapse="+",sep=""),
"+offset(log(effort))",
sep=""
),
family=poisson,
data=s@data
)
# drop un-intuitive quadratics
quadratics <- quadratics_to_keep(m)
if(length(quadratics)>0){
vars <- vars[!as.vector(sapply(
vars,
FUN=function(p){ sum(grepl(x=quadratics, pattern=p))>0
}))]
# use AIC to justify our proposed quadratic terms
for(q in quadratics){
lin_var <- gsub(
q,
pattern=", 2,",
replacement=", 1,"
)
m_lin_var <- AIC(glm(
formula=paste(
"est_abund~",
paste(
c(paste("poly(", paste(vars, ", 1, raw=T)", sep=""), sep=""),
lin_var,quadratics[!(quadratics %in% q)]), collapse="+"),
"+offset(log(effort))",
sep=""
),
family=poisson,
data=s@data
))+AIC_RESCALE_CONST
m_quad_var <- AIC(glm(
formula=paste(
"est_abund~",
paste(
c(paste("poly(", paste(vars, ", 1, raw=T)", sep=""), sep=""), quadratics),
collapse="+"
),
"+offset(log(effort))",
sep=""
),
family=poisson,
data=s@data
))+AIC_RESCALE_CONST
# if we don't improve our AIC with the quadratic by at-least 8 aic units
# (pretty substatial support), keep the linear version
if( m_lin_var-m_quad_var < AIC_SUBSTANTIAL_THRESHOLD){
quadratics <- quadratics[!(quadratics %in% q)]
vars <- c(
vars,
gsub(
gsub(lin_var, pattern="poly[(]", replacement=""),
pattern=", [0-9], raw.*=*.T[)]",
replacement="")
)
}
}
vars <- c(paste("poly(", paste(vars, ", 1, raw=T)", sep=""), sep=""), quadratics)
# refit our full model minus un-intuitive quadratics
m <- glm(
formula=as.formula(paste(
"est_abund~",
paste(vars, collapse="+"),
sep=""
)),
family=poisson,
offset=log(effort),
data=s@data
)
}
# use model selection with interactions across our candidate variables
tests <- glmulti::glmulti(
m,
intercept=T,
family=poisson,
offset=log(effort),
level=1,
plotty=F)
# predict across our full run dataset
predicted <- units
# are there multiple top models to work from?
if(sum((tests@crits)-min(tests@crits) < AIC_SUBSTANTIAL_THRESHOLD) > 1){
models <- which((tests@crits)-min(tests@crits) < AIC_SUBSTANTIAL_THRESHOLD)
weights <- tests@crits[models]
weights <- OpenIMBCR:::akaike_weights(weights)
models <- lapply(
X=models,
FUN=function(x){
m <- tests@objects[[x]]
return(refit_glm(m, s, effort))
})
models <- do.call(cbind, lapply(
X=models,
FUN=function(x){
m <- predict(
x,
offset=log(effort),
family=poisson,
type="response",
newdata=units@data
)
}))
predicted@data <- data.frame(
pred=apply(models, MARGIN=1, FUN=weighted.mean, weights=weights)
)
rm(models)
# if there was only one top model, averaging won't work
} else {
# re-fit our standard model
tests <-
glm(formula=tests@objects[[1]]$formula,
offset=log(effort),
family=poisson,
data=s@data)
predicted@data <- data.frame(
pred=as.vector(floor(suppressWarnings(predict(
tests,
newdata=units@data,
offset=log(effort),
type="response")))
)
)
}
if(inherits(tests, 'glm')){
# censor any predictions greater than K (max)
cat(
" -- number of sites with prediction greater than predicted max(K):",
sum(predicted$pred > max(round(predict(tests, type="response")))),
"\n"
)
predicted$pred[( predicted$pred > max(round(predict(tests, type="response"))) )] <-
max(round(predict(tests, type="response")))
} else {
# censor any predictions greater than K (max)
cat(
" -- number of sites with prediction greater than predicted max(K):",
sum(predicted$pred > max(round(predict(tests@objects[[1]], type="response")))),
"\n"
)
predicted$pred[( predicted$pred > max(round(predict(tests@objects[[1]], type="response"))) )] <-
max(round(predict(tests@objects[[1]], type="response")))
}
predicted$pred[predicted$pred<1] <- 0
rgdal::writeOGR(
predicted,
".",
tolower(paste(argv[2],
"_imbcr_pois_glm_prediction_",
gsub(format(Sys.time(), "%b %d %Y"), pattern=" ", replacement="_"),
sep="")
),
driver="ESRI Shapefile",
overwrite=T
)
save(
compress=T,
list=ls(),
file=r_data_file
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.