#' @export
#'
#' @title F.efficiency.model
#'
#' @description Estimate the efficiency model from a data frame containing
#' efficiency trials.
#'
#' @param obs.eff.df A data frame with at least variables \code{batchDate} and
#' \code{efficiency}, where \code{efficiency} is \code{NA} for all days
#' requiring an estimate.
#'
#' @param plot A logical indicating if raw efficiencies and the model(s)
#' should be plotted.
#'
#' @param max.df.spline The maximum degrees of freedom allowed for splines.
#'
#' @param plot.file The name of the file prefix under which output is to be
#' saved. Set to \code{NA} to plot to the plot window.
#'
#' @return A data frame with all observed and imputed \code{efficiency} values,
#' where variable \code{gam.estimated} identifies days with imputed values.
#'
#' @section Efficiency model method:
#'
#'
#' \itemize{
#' \item{Less than \code{eff.min.spline.samp.size} trials :
#' A "weighted average constant model with bias correction." This model
#' uses constant efficiency over the season, and estimates it
#' using a ratio-of-means bias-corrected ("ROM+1") average. For each
#' trap, estimated efficiency is \deqn{\frac{(\sum nCaught)
#' + 1}{(\sum nReleased) + 1}{(sum(nCaught) + 1) / (sum(nReleased) + 1)}}. Values
#' for \code{nCaught} and \code{nReleased} come from function
#' \code{F.get.releases}. }
#'
#'
#' \item{\code{eff.min.spline.samp.size} trials or more : A "B-spline model."
#' This model starts by estimating a constant logistic regression where
#' recaptures (i.e., \code{nCaught}) is the number of "successes" and releases
#' (i.e., \code{nReleased}) is number of "trials". Assuming this constant
#' model is successful, the method estimates a series of increasingly complex
#' b-spline logistic regression models until AIC is minimized or model
#' estimation fails (failure to converge or estimates at boundary). B-spline
#' models, in general, divide the date range into intervals by adding 'knots'.
#' Between 'knots', b-spline models fit cubic polynomials in a way that
#' connects smoothly at knots (refer to b-spline methods for details).
#'
#' The first (lowest order) b-spline model fitted contains zero knots and
#' therefore estimates a cubic model. Assuming that model was successful and
#' that AIC improved relative to the constant model, the method adds one knot
#' at the median date and re-estimates. If that model was successful and AIC
#' improved relative to the previous model, the method adds another knot at
#' the (1/(knots+1))-th quantiles of date and re-estimates. The method
#' containues to add knots until one or more of the following conditions
#' happen: (1) AIC does not improve, (2) estimation fails somehow, or (3) the
#' maximum number of knots (i.e., \code{max.df.spline-3}) is fitted.
#'
#' Using the default value of \code{max.df.spline}, the efficiency model is
#' either constant (intercept-only), cubic, or b-spline with one interval
#' knot.
#'
#' When the best logistic regression model is constant (intercept-only),
#' estimated efficiency is the ratio-of-means estimator WITHOUT the "+1" bias
#' correction. With many efficiency trial, the "+1" bias correction is tiny
#' and inconsequential. The exact efficiency model used at each subsite is
#' listed in the campR log file.
#'
#' The \eqn{\beta}s from the final logistic regression are saved for use in
#' bootstrapping by function \code{F.boostrap.passage}. Modeled efficiencies
#' are used for all days, even if a particular day contained an efficiency
#' trial.
#'
#' All dates outside the efficiency trial season use the mean of estimates
#' within the season. This means the efficiency model can vary within a
#' season, but is always constant before the first and after the last
#' efficiency trial.}
#'
#' }
#'
#' @seealso \code{F.get.releases}, \code{F.bootstrap.passage}, \code{reMap},
#' \code{reMap2}
#'
#' @author WEST Inc.
#'
#' @examples
#' \dontrun{
#' # ---- Fit an efficiency model for each unique trapPositionID
#' # ---- in data frame obs.eff.df.
#' F.efficiency.model( obs.eff.df, plot=T, max.df.spline=4, plot.file=NA)
#' }
F.efficiency.model <- function( obs.eff.df, plot=T, max.df.spline=4, plot.file=NA ){
# obs.eff.df <- eff
# plot <- plot
# max.df.spline <- 4
# plot.file <- plot.file
#save.image("C:/Users/jmitchell/Desktop/save/rbdd.RData")
ans <- NULL
enhmodel <- attr(obs.eff.df,"enhmodel")
min.date <- attr(obs.eff.df,"min.date")
max.date <- attr(obs.eff.df,"max.date")
site <- attr(obs.eff.df,"site")
subsites <- attr(obs.eff.df,"subsites")
site.name <- attr(obs.eff.df,"site.name")
catch.subsites <- attr(obs.eff.df,"catch.subsites")
# ---- If we are using enhanced efficiency trials, we should always have an efficiency model,
# ---- unless a trap is new for the year defined by original min.date and max.date. Define
# ---- traps vector appropriately so the looping works appropriately.
traps <- sort(unique(as.character(obs.eff.df$TrapPositionID)))
fits <- all.X <- all.ind.inside <- all.dts <- obs.data <- eff.type <- vector("list", length(traps))
names(fits) <- traps
names(all.X) <- traps
names(all.dts) <- traps
names(all.ind.inside) <- traps
names(obs.data) <- traps
names(eff.type) <- traps
# ---- Obtain necessary variables from the global environment.
time.zone <- get("time.zone",envir=.GlobalEnv)
# ---- Set this up here, so it can evaluated in the if below when enhmodel == FALSE.
doOldEff <- rep(TRUE,length(traps))
names(doOldEff) <- traps
# ---- We know traps from immediately above.
#load("//lar-file-srv/Data/PSMFC_CampRST/ThePlatform/CAMP_RST20160601-DougXXX-4.5/R-Interface/campR/data/betas.rda")
data(betas,envir=environment())
betas <- betas[betas$subsiteID %in% traps,]
# ---- If betas has zero rows, then we know this is a new subsite, and this is the only
# ---- trap we care about. So, this means we probably have no enhanced efficiency
# ---- model for it, nor data in the environmental covariate database. So, flip enhmodel
# ---- to FALSE and skip all the enhmodel stuff.
iChangedYourChoice <- 0
if(nrow(betas) == 0){
if(enhmodel == TRUE){
iChangedYourChoice <- 1
}
enhmodel <- FALSE
}
# ---- Decide if we're going to use enhanced efficiency.
if(enhmodel == TRUE){
# ---- Get stuff we need to fit the enhanced efficiency models.
# ---- Get together covariates. We need to query for days before and after the first and last
# ---- eff trial date, so we need to make the obs.eff.df "bigger."
big.obs.eff.df <- F.assign.batch.date(data.frame(EndTime=seq(as.POSIXct(min.date,format="%Y-%m-%d",tz=time.zone) - 90*24*60*60,
as.POSIXct(max.date,format="%Y-%m-%d",tz=time.zone) + 90*24*60*60,by="2 hours")))
big.obs.eff.df$EndTime <- NULL
big.obs.eff.df <- data.frame(batchDate=unique(big.obs.eff.df$batchDate))
all.Dates <- expand.grid(TrapPositionID=traps,batchDate=big.obs.eff.df$batchDate)
all.Dates <- all.Dates[!(all.Dates$batchDate %in% obs.eff.df$batchDate),]
otherCols <- names(obs.eff.df)[!(names(obs.eff.df) %in% c("TrapPositionID","batchDate"))]
all.Dates[otherCols] <- NA
# ---- Update 6/21/2018. We now have a makeFake_release.df.R program that finds exactly
# ---- when we need a fake e-trial to get things to go. What can sometimes happen is that
# ---- given a short time frame, say, a month, we have ONE trap with good spline days during
# ---- that time frame, but another that has none. We don't want to fit anything for that
# ---- second trap -- we have no spline enh eff for those days. This means we cannot
# ---- possible have a real trial on those days either. So turn this section off.
# # ---- It could be that all.Dates has more traps than obs.eff.df, if we have fish in catch
# # ---- from a trap on which there were no eff trials this year. Find these traps.
# extraTraps <- unique(all.Dates$TrapPositionID)[!(unique(all.Dates$TrapPositionID) %in% unique(obs.eff.df$TrapPositionID))]
#
# # ---- If we have extraTraps, add in rows to obs.eff.df, so the rbind below creates the
# # ---- same number of rows for each trap, as expected.
# if(length(extraTraps) > 0){
# if(is.factor(obs.eff.df$TrapPositionID)){
# theFirst <- as.character(droplevels(obs.eff.df$TrapPositionID[1]))
# } else {
# theFirst <- obs.eff.df$TrapPositionID[1]
# }
# for(i in 1:length(extraTraps)){
# extra.obs.eff.df <- data.frame(TrapPositionID=extraTraps[i],
# batchDate=obs.eff.df[obs.eff.df$TrapPositionID == theFirst,]$batchDate,
# nReleased=NA,
# nCaught=NA,
# bdMeanNightProp=NA,
# bdMeanMoonProp=NA,
# bdMeanForkLength=NA,
# efficiency=NA,
# thisIsFake=NA)
# obs.eff.df <- rbind(obs.eff.df,extra.obs.eff.df)
# }
# }
# ---- Combine the original obs.eff.df from eff with all.Dates, so we have bigger date ranges
# ---- for use with splines.
obs.eff.df <- rbind(obs.eff.df,all.Dates)
obs.eff.df <- obs.eff.df[order(obs.eff.df$TrapPositionID,obs.eff.df$batchDate),]
# ---- Get covariate data on bigger obs.eff.df.
stuff <- getTogetherCovarData(obs.eff.df,min.date,max.date,traps,enhmodel=TRUE)
# ---- Unpack 'stuff' so that we have the dbCovar dataframes available for plotting below.
obs.eff.df <- stuff$obs.eff.df
obs.eff.df$covar <- NULL # This only hold meaning when building enhanced eff models.
dbDisc <- stuff$dbDisc
dbDpcm <- stuff$dbDpcm
dbATpF <- stuff$dbATpF
dbTurb <- stuff$dbTurb
dbWVel <- stuff$dbWVel
dbWTpC <- stuff$dbWTpC
dbLite <- stuff$dbLite
dbFlPG <- stuff$dbFlPG
dbTpPG <- stuff$dbTpPG
dbPerQ <- stuff$dbPerQ
# ---- Create a means to identify traps for which we end up lacking data.
doOldEff <- rep(FALSE,length(traps))
names(doOldEff) <- traps
# ---- Run over individual traps.
for(trap in traps){
# ---- Objects created in non-efficiency models -- need something for bootstrapping.
df <- obs.eff.df[ is.na(obs.eff.df$TrapPositionID) | (obs.eff.df$TrapPositionID == trap), ]
ind <- !is.na(df$efficiency)
# ---- Get the temporal spline basis matrix goods.
#here <- "L:/PSMFC_CampRST/ThePlatform/CAMP_RST20160601-DougXXX-4.5/R-Interface/campR/inst/enhEffStats" # for testing before you have the package.
here <- paste0(find.package("campR"),"/enhEffStats") # <- for when you have a package.
.tmpDataEnv <- new.env(parent=emptyenv()) # not exported
# ---- If a trap is new this year, we won't have enhanced efficiency prior-fit info.
# ---- See if we have that stuff waiting around.
if(file.exists(paste0(here,"/",site,"_",trap,"_splineCoef.RData"))){
load(paste0(here,"/",site,"_",trap,"_splineCoef.RData"),envir=.tmpDataEnv)
load(paste0(here,"/",site,"_",trap,"_splineDays.RData"),envir=.tmpDataEnv)
load(paste0(here,"/",site,"_",trap,"_splineBegD.RData"),envir=.tmpDataEnv)
load(paste0(here,"/",site,"_",trap,"_splineEndD.RData"),envir=.tmpDataEnv)
load(paste0(here,"/",site,"_",trap,"_fit.RData"),envir=.tmpDataEnv)
splineCoef <- .tmpDataEnv$splineCoef
splineDays <- .tmpDataEnv$splineDays
splineBegD <- .tmpDataEnv$splineBegD
splineEndD <- .tmpDataEnv$splineEndD
fit <- .tmpDataEnv$fit
# ---- Stuff we just loaded.
#splineDays ...came from... df$batchDate2[eff.ind.inside]
#splineCoef ...came from... fit$coefficients[grepl("tmp",names(fit$coefficients))]
#splineBegD ...came from... bsplBegDt
#splineEndD ...came from... bsplEndDt
#fit ...came from... fit
# ---- Remap back to the present.
reMap2list <- reMap2(min.date,max.date,splineDays)
strt.dt <- reMap2list$strt.dt
end.dt <- reMap2list$end.dt
# ---- By design, can only have the two spline 59/60 60/60 paradigms coded above.
# ---- Identify dates for which we have splined information.
# ind.inside <- (strt.dt <= df$batchDate) & (df$batchDate <= end.dt)
# inside.dates <- c(strt.dt, end.dt)
# all.ind.inside[[trap]] <- inside.dates # save season dates for bootstrapping
min.date.p <- as.POSIXct(min.date,format="%Y-%m-%d",tz=time.zone)
max.date.p <- as.POSIXct(max.date,format="%Y-%m-%d",tz=time.zone)
ind.inside <- (max(min.date.p,as.POSIXct(strt.dt)) <= df$batchDate) & (df$batchDate <= min(max.date.p,as.POSIXct(end.dt)))
inside.dates <- c(max(min.date.p,as.POSIXct(strt.dt)), min(max.date.p,as.POSIXct(end.dt)))
all.ind.inside[[trap]] <- inside.dates # save season dates for bootstrapping
# ---- The fitting data frame
tmp.df <- df[ind & ind.inside,]
m.i <- sum(ind & ind.inside)
# ---- Find out which covariates this trap cares about.
thisB1 <- betas[betas$subsiteID == trap,] # Restrict to trap.
thisB2 <- thisB1[,names(thisB1)[sapply(thisB1,function(x) ifelse(is.numeric(x),sum(x),x)) != "0"]] # Restrict to non-zero covariates.
# ---- Get the Intercept.
covarI <- as.numeric(thisB2[names(thisB2) == "(Intercept)"])
# ---- Reduce down to a dataframe. Handle the contingency when the "dataframe" is of length one,
# ---- and thus drops down to a vector.
if(sum(!(names(thisB2) %in% c("subsiteID","(Intercept)","threshold","available","Stage"))) > 1){
thisB3 <- thisB2[,!(names(thisB2) %in% c("subsiteID","(Intercept)","threshold","available","Stage"))] # Restrict to covariates.
} else {
theName <- names(thisB2)[!(names(thisB2) %in% c("subsiteID","(Intercept)","threshold","available","Stage"))]
thisB3 <- data.frame(thisB2[,!(names(thisB2) %in% c("subsiteID","(Intercept)","threshold","available","Stage"))])
names(thisB3) <- theName
}
covarB <- thisB3
if(length(names(covarB)) > 0){
cat(paste0("\n\nEnhanced efficiency model for trap ",trap," seeks recorded data on covariates ",paste0(names(covarB),collapse=", "),".\n"))
} else {
cat(paste0("\n\nEnhanced efficiency model for trap ",trap," seeks no recorded data -- it's an intercept-only model.\n"))
}
# ---- Check to make sure we have the data we need to fit enhanced efficiency trials. Not necessary
# ---- if we have an intercept-only model.
if(length(covarB) > 0){
checkCovarValidity <- checkValidCovars(df,tmp.df,min.date,max.date,covarB,site,strt.dt,end.dt)
df <- checkCovarValidity$df
doEnhEff <- checkCovarValidity$doEnhEff
} else {
doEnhEff <- TRUE
}
} else{
# ---- If we are here, we have a new trap with no previous enhanced efficiency model. If there were efficiency trials
# ---- this year, we could at least try to fit an eff model the old way. So tell it to do just that.
doEnhEff <- FALSE
}
# ---- If doEnhEff == FALSE, I exit the loop for this trap. I'll run regular eff below.
if(doEnhEff == FALSE){
doOldEff[names(doOldEff) == trap] <- TRUE
} else {
# ---- We good to go.
# ---- I have identified variables I care about. Get them together, for all batchDates.
covarC <- c("discharge_cfs","waterDepth_cm","airTemp_F","turbidity_ntu","waterVel_fts","waterTemp_C","lightPenetration_ntu")#,"dissolvedOxygen_mgL","conductivity_mgL","barometer_inHg","precipLevel_qual")
covarE <- c("bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength")
# ---- See if we have any temporal spline tmp bits.
if(sum(grepl("tmp",names(splineCoef))) > 0){
# ---- Build the basis matrix. This does includes the intercept.
N <- nrow(bs(splineDays, df=length(splineCoef)))
tB <- c(covarI,splineCoef)
timeX <- cbind(rep(1,N),bs( splineDays, df=length(splineCoef)))
timeDF <- data.frame(timeX=timeX,batchDate2=splineDays)
names(timeDF)[names(timeDF) == "timeX.V1"] <- "Intercept"
c0 <- timeDF[!duplicated(timeDF$batchDate2),]
} else {
# ---- If we're here, the temporal spline was fit as a simple intercept. We still
# ---- need to be sure we grab the 1960 batchDate2 dates, so do that here.
tB <- c(covarI)
timeDF <- data.frame(Intercept=rep(1,length(splineDays)),batchDate2=splineDays)
c0 <- timeDF[!duplicated(timeDF$batchDate2),]
c0 <- c0[order(c0$batchDate2),]
}
# ---- Get the environmental covariate data.
if(sum(c("temp_c","flow_cfs") %in% names(covarB)) > 0){
c1 <- df[,c("batchDate",names(covarB)[names(covarB) %in% c("temp_c","flow_cfs")])]
c1 <- c1[,c("batchDate",names(covarB)[names(covarB) %in% c("temp_c","flow_cfs")])]
# ---- The times are off, probably because c1 originates from obs.eff.df, which has been used with dates recorded
# ---- in the CAMP mdb. These dates don't record time zones, so POSIX gets tricky.
c1$batchDate <- as.POSIXct(strptime(c1$batchDate,format="%Y-%m-%d",tz=time.zone),format="%Y-%m-%d",tz=time.zone)
} else {
c1 <- NULL
}
# ---- Get the CAMP environmental covariate data.
if(sum(c("waterDepth_cm","turbidity_ntu","waterVel_fts","waterTemp_C") %in% names(covarB)[names(covarB) %in% covarC]) > 0){
c2 <- df[,c("batchDate",names(covarB)[names(covarB) %in% covarC])]
} else {
c2 <- NULL
}
# ---- Get the percent-Q covariate data.
if(sum(c("percQ") %in% names(covarB)) > 0){
#do stuff
c3 <- df[,c("batchDate",names(covarB)[names(covarB) %in% "percQ"])]
} else {
c3 <- NULL
}
# ---- Get the e-trial data.
if(sum(c("bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength") %in% names(covarB)) > 0){
# ---- Variable batchDate doesn't include all dates, since releases average over days. Fill in the missing
# ---- dates. This creates a step function ish for meanNightProp, meanMoonProp, and meanForkLength.
if(nrow(tmp.df) > 0 & !all(tmp.df$thisIsFake == 1)){
# ---- If we're here, we had efficiency trials this year, and thus "bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength"
# ---- obtained from this year's efficiency trials.
c4 <- stepper(tmp.df[,c("batchDate","TrapPositionID","bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength")],
c("bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength"),
min.date,
max.date)
} else {
# ---- If we're here, we didn't have efficiency trials this year, so we're using a constant Season estimate,
# ---- from annual_records, for these variable(s) that we need.
c4 <- df[,c("batchDate",names(covarB)[names(covarB) %in% c("bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength")])]
}
# ---- Function stepper designed to run over non-unique TrapPositionIDs. Get rid of this variable.
c4$TrapPositionID <- NULL
# ---- I like that the stepper function renames variables with a "Step" suffix, but that is not useful here.
# ---- So, rename those variables to get rid of that suffix.
names(c4) <- sapply(names(c4),function(x) gsub("Step","",x))
c4 <- c4[,c("batchDate",names(covarB)[names(covarB) %in% c("bdMeanNightProp","bdMeanMoonProp","bdMeanForkLength")])]
} else {
c4 <- NULL
}
# ---- Map back to current time frame.
reMapped <- reMap(c0,"batchDate2",min.date,max.date,strt.dt,end.dt)
c0 <- reMapped$c0
allDates <- reMapped$allDates
# ---- We use allDates as the backbone for bringing in all the different pieces.
allDates <- merge(allDates,c0 ,by=c("batchDate"),all.x=TRUE)
if(!is.null(c1 )) allDates <- merge(allDates,c1 ,by=c("batchDate"),all.x=TRUE)
if(!is.null(c2 )) allDates <- merge(allDates,c2 ,by=c("batchDate"),all.x=TRUE)
if(!is.null(c3 )) allDates <- merge(allDates,c3 ,by=c("batchDate"),all.x=TRUE)
if(!is.null(c4 )) allDates <- merge(allDates,c4 ,by=c("batchDate"),all.x=TRUE)
# ---- We used the enhanced efficiency dates batchDate2 to drive this. If, in the result of
# ---- allDates here, batchDate2 is NA, this means the user put in a set of dates in the
# ---- Platform that span outside the real range of e-trials encapsulated by batchDate2.
# ---- Put in a boring intercept estimate for these, so we report something.
if(sum(is.na(allDates$batchDate2)) > 0){
allDates[is.na(allDates$batchDate2),]$Intercept <- 1
}
# ---- If we're outside the splineDays for this trap (strt.dt and end.dt), we just use the
# ---- intercept estimate. We don't know what happens outside the enh eff temporal range.
allDates$inSplineDays <- 0
allDates[(allDates$batchDate) < strt.dt | (allDates$batchDate > end.dt),names(allDates)[!(names(allDates) %in% c("batchDate","batchDate2","Intercept"))] ] <- 0
allDates[(allDates$batchDate) >= strt.dt & (allDates$batchDate <= end.dt),]$inSplineDays <- 1
# ---- MAKE THE EFF ESTIMATE. Don't forget that tB includes the intercept already.
X <- as.matrix(allDates[allDates$inSplineDays == 1,names(allDates)[!(names(allDates) %in% c("batchDate","batchDate2","inSplineDays"))]],ncol=(1 + length(splineCoef) + length(covarB)))
B <- as.matrix(unlist(c(tB,covarB[colnames(X)[colnames(X) %in% names(covarB)]])),ncol=1) # <--- Make sure B in same order as cols in X.
# ---- Save X, and the dates at which we predict, for bootstrapping.
all.X[[trap]] <- X
all.dts[[trap]] <- df$batchDate[ind.inside] # Can't use sort(unique(splineDays)) here because it has Feb 29th, by construction.
fits[[trap]] <- fit #B
pred <- 1 / (1 + exp(-1*X %*% B))
#plot(allDates$batchDate,allDates$pred)
# ---- Add in predicted values for enh eff spline days.
df[ind.inside,]$efficiency <- pred
# ---- Use the mean of spline estimates for all dates outside efficiency trial season.
mean.p <- mean(pred, na.rm=T)
df$efficiency[!ind.inside] <- mean.p
# ---- We added in fake days before and after the start of e-trials, so as to query for env covars
# ---- correctly. Reduce df back to what it should be, based on min.date and max.date.
orig.dates <- data.frame(batchDate=seq(as.POSIXct(min.date,format="%Y-%m-%d",tz=time.zone),
as.POSIXct(max.date,format="%Y-%m-%d",tz=time.zone),by="1 DSTday"))
df <- df[df$batchDate %in% orig.dates$batchDate,]
# ---- We are done with the covariates, so get rid of them so that subsequent code doesn't get hung
# ---- up on them.
df <- df[,c("TrapPositionID","batchDate","nReleased","nCaught","efficiency")]
# ---- Find any fake efficiency trials used and removed them.
# do me do me do me?
# ---- Save the fit for bootstrapping.
#fits[[trap]] <- fit
# ---- Save the raw efficiency data.
obs.data[[trap]] <- tmp.df
eff.type[[trap]] <- 5
# ---- Uncomment the following line if using imputed value for all days. Otherwise, comment it out,
# ---- and imputed.eff will tell which are observed. With the following uncommented, you can find
# ---- efficiency trials in grand.df with !is.na(grand.df$nReleased).
ind <- rep(F, nrow(df))
df$imputed.eff <- factor( !ind, levels=c(T,F), labels=c("Yes", "No"))
df$enhanced.eff <- rep("Yes",nrow(df))
df$trapPositionID <- trap
# ---- Sometimes, covariates don't play nice, and we end up with absurdly low efficiencies. Apply
# ---- an indicator in this case, so that any efficiencies below the 20th percentile AND below 0.005
# ---- (a practical efficiency minimum -- 5 out of 1000 fish) are replaced with 20th p. I don't think
# ---- fivePThreshold can be NA...but I don't want to have to dig into this and restart the BL.
fivePThreshold <- quantile(df$efficiency,0.20) # <--- It looks 0.05 is still too small...try 0.20.
if(!is.na(fivePThreshold)){
if(any( df$efficiency < fivePThreshold & rep(fivePThreshold < 0.005,nrow(df)) )){
df[df$efficiency < fivePThreshold & rep(fivePThreshold < 0.005,nrow(df)),]$efficiency <- fivePThreshold # <- If below threshold, replace.
}
rm(fivePThreshold)
}
# ---- If we have a fake efficiency trial here, get rid of it now. Flip its data to NA, like any
# ---- other non-eff trial day.
if( any(obs.eff.df[obs.eff.df$TrapPositionID == trap,]$thisIsFake == 1 & !is.na(obs.eff.df[obs.eff.df$TrapPositionID == trap,]$thisIsFake)) ){
# df[df$batchDate %in% c(F.assign.batch.date(data.frame(EndTime=obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$trapPositionID == trap,]$ReleaseDate))$batchDate),]$nReleased <- NA
# df[df$batchDate %in% c(F.assign.batch.date(data.frame(EndTime=obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$trapPositionID == trap,]$ReleaseDate))$batchDate),]$nCaught <- NA
df[df$batchDate %in% c(obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$TrapPositionID == trap & !is.na(obs.eff.df$thisIsFake),]$batchDate),]$nReleased <- NA
df[df$batchDate %in% c(obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$TrapPositionID == trap & !is.na(obs.eff.df$thisIsFake),]$batchDate),]$nCaught <- NA
}
#plot(df$batchDate,df$efficiency)
ans <- rbind(ans, df)
#plot(ans[ans$TrapPositionID == "42010",]$batchDate,ans[ans$TrapPositionID == "42010",]$efficiency)
}
# par(mfrow=c(5,1))
# for(trap in traps){
# x <- ans[ans$TrapPositionID == trap,]$batchDate
# y <- ans[ans$TrapPositionID == trap,]$efficiency
# plot(x,y,type="b")
# }
# par(mfrow=c(1,1))
}
}
if( enhmodel == FALSE | sum(doOldEff) > 0 ){
# ---- If sum(doOldEff) > 0, then we have at least one trap for which we lack the requisite data from attempt to do enh eff models.
# ---- Redefine vector traps to only include those for which the old way is necessary. Of course,
# ---- only do this redefining if we really hoped for Enh Eff.
if( enhmodel == TRUE ){
oldTraps <- traps
traps <- traps[doOldEff]
}
# ---- If number of trials at a trap less than this number,
# ---- assume constant and use ROM+1 estimator
eff.min.spline.samp.size <- get("eff.min.spline.samp.size", pos=.GlobalEnv)
# ---- Estimate a model for efficiency for each trap in obs.eff.df.
for( trap in traps ){
if( enhmodel == TRUE ){
cat(paste0("I'm in the old efficiency code sequence for trap ",trap,".\n"))
# ---- No efficiency trials at this trap. BUT! If we have an enhanced efficiency model at
# ---- this trap, but lack a covariate's data, we can at least do it the old way, on the 1960 paradigm
# ---- batchDate2, and at least bust out the temporal spline (with no covariates). We use the old
# ---- data to refit the spline -- we do NOT use the temporal spline fit with the covariates.
# ---- Because we now imputed mean coefficient values in the enh eff traps above, when covariate data
# ---- are missing, we should rarely end up here; i.e., enhmodel == TRUE and sum(doOldEff) > 0
# ---- should be rare.
if(file.exists(paste0(here,"/",site,"_",trap,"_fit.RData"))){
# ---- Get the current trap enh eff fit back in memory.
load(paste0(here,"/",site,"_",trap,"_fit.RData"),envir=.tmpDataEnv)
load(paste0(here,"/",site,"_",trap,"_splineDays.RData"),envir=.tmpDataEnv)
# load(paste0(here,"/",site,"_",trap,"_splineCoef.RData"),envir=.tmpDataEnv)
splineDays <- .tmpDataEnv$splineDays
# splineCoef <- .tmpDataEnv$splineCoef
fit <- .tmpDataEnv$fit
tmp.df <- fit$data
# ---- Could have the same month-day represent more than one e-trial, over more than one year.
# ---- Collapse these down to help with a merge below.
nReleased <- aggregate(tmp.df$nReleased,list(batchDate2=tmp.df$batchDate2),sum)
names(nReleased)[names(nReleased) == "x"] <- "nReleased"
nCaught <- aggregate(tmp.df$nCaught,list(batchDate2=tmp.df$batchDate2),sum)
names(nCaught)[names(nCaught) == "x"] <- "nCaught"
tmp.df <- merge(nReleased,nCaught,by=c("batchDate2"))
# ---- Pluck off batchDate2.
bd2 <- tmp.df$batchDate2
bd2 <- bd2[!is.na(bd2)]
# ---- Remap back to the current time frame we care about.
reMap2list <- reMap2(min.date,max.date,splineDays)
strt.tmp.dt <- reMap2list$strt.dt
end.tmp.dt <- reMap2list$end.dt
# ---- Build up a crosswalk between batchDate and batchDate2.
date.seq.bd <- data.frame(batchDate=seq(strt.tmp.dt,end.tmp.dt,by="1 DSTday"))
date.seq.bd2 <- data.frame(batchDate2=seq(min(tmp.df$batchDate2),max(tmp.df$batchDate2),by="1 DSTday"))
# ---- Make sure we agree on 2/29. Default batchDate2 seq always has this, because it's 1960.
if(any(as.POSIXlt(date.seq.bd2$batchDate2)$mon == 1 & as.POSIXlt(date.seq.bd2$batchDate2)$mday == 29)){
date.seq.bd2 <- date.seq.bd2[!(as.POSIXlt(date.seq.bd2$batchDate2)$mon == 1 & as.POSIXlt(date.seq.bd2$batchDate2)$mday == 29),]
date.seq.bd2 <- data.frame(batchDate2=date.seq.bd2)
}
# ---- Make the batchDate crosswalk.
bdxwalk <- data.frame(date.seq.bd,date.seq.bd2)
# ---- And now, disguise the data frame like the old efficiency code expects it.
tmp.df <- merge(bdxwalk,tmp.df,by=c("batchDate2"),all.x=TRUE)
tmp.df$TrapPositionID <- trap
tmp.df$efficiency <- tmp.df$nCaught / tmp.df$nReleased
tmp.df <- tmp.df[,c("TrapPositionID","batchDate","nReleased","nCaught","efficiency")]
# ---- We need the dates from the original obs.eff.df for this trap, so it matches the other traps'
# ---- temporal sequence.
obs.eff.df.df <- obs.eff.df[obs.eff.df$TrapPositionID == trap,]
df <- merge(obs.eff.df.df[,!(names(obs.eff.df.df) %in% c("TrapPositionID","nReleased","nCaught","efficiency"))],
tmp.df,
by=c("batchDate"),
all.x=TRUE)
# ---- In this part of the code, we don't care about covariates.
df <- df[,c("TrapPositionID","batchDate","nReleased","nCaught","efficiency")]
df$TrapPositionID <- trap
} else {
# ---- We have a trap that is new this year -- no enhanced efficiency model exists.
df <- obs.eff.df[ is.na(obs.eff.df$TrapPositionID) | (obs.eff.df$TrapPositionID == trap), ]
}
} else {
# ---- We just want old-school efficiency estimation. Do it the old way.
if(iChangedYourChoice == 1){
# ---- Get the subsite name for the trap on-hand. Inefficiency to query for each
# ---- trap like this, but it is fast, and there shouldn't ever be too many
# ---- to query. This is a copy of code down below (outside of this if-branch).
db <- get( "db.file", envir=.GlobalEnv )
ch <- odbcConnectAccess(db)
newSubsites <- sqlQuery(ch, "SELECT DISTINCT subSiteName, subSiteID FROM SubSite;")
F.sql.error.check(newSubsites)
close(ch)
newSubsites <- newSubsites[newSubsites$subSiteID %in% catch.subsites,]
if(is.factor(newSubsites$subSiteName)){
# Note: as.character drops missing factor levels, like droplevels
# And: don't need this inside an if(is.factor()), cause as.character works same on characters and factors, but I'll leave it
newSubsites$subSiteName <- as.character(newSubsites$subSiteName)
}
pos <- 1
envir <- as.environment(pos)
progbar <- get( "progbar", pos=envir)#.GlobalEnv )
setWinProgressBar(progbar,
getWinProgressBar(progbar),
label=paste0("Trap-Efficiency Model for Trap ",trap,": ",newSubsites[newSubsites$subSiteID == trap,]$subSiteName," failed; it didn't exist during previous enhanced-efficiency model estimation. Defaulting to Mark-Recapture efficiency."))
}
df <- obs.eff.df[ is.na(obs.eff.df$TrapPositionID) | (obs.eff.df$TrapPositionID == trap), ]
}
cat("Trent -- You are saving a copy of the efficiency data in .GlobalEnv.\n")
assign("effDf", df, pos = .GlobalEnv)
ind <- !is.na(df$efficiency)
# ---- Find the "season", which is between first and last trials
strt.dt <- suppressWarnings(min( df$batchDate[ind], na.rm=T )) # Earliest date with an efficiency trial
cat("\nTrent: you set start date of efficiency model to 2018-03-14 in eff_model.r Set it back.\n\n")
strt.dt <- as.POSIXct("2018-03-14", tz = "America/Los_Angeles")
end.dt <- suppressWarnings(max( df$batchDate[ind], na.rm=T )) # Latest date with efficiency trial
ind.inside <- (strt.dt <= df$batchDate) & (df$batchDate <= end.dt)
inside.dates <- c(strt.dt, end.dt)
all.ind.inside[[trap]] <- inside.dates # save season dates for bootstrapping
# ---- Dataframe df possibly too big here, since we buffed it out up top temporally, so as to induce
# ---- better fitting splines. Knock it back down.
min.date.p <- as.POSIXct(min.date,format="%Y-%m-%d",tz=time.zone)
max.date.p <- as.POSIXct(max.date,format="%Y-%m-%d",tz=time.zone)
# ---- The fitting data frame
tmp.df <- df[ind & ind.inside,]
m.i <- sum(ind & ind.inside)
if( m.i == 0 ){
cat( paste("NO EFFICIENCY TRIALS FOR TRAP", trap, ".\n") )
cat( paste("Catches at this trap will not be included in production estimates.\n"))
fits[[trap]] <- NA
all.X[[trap]] <- NA
df$efficiency <- NA
obs.data[[trap]] <- NA
eff.type[[trap]] <- 1
} else if( (m.i < eff.min.spline.samp.size) | (sum(tmp.df$nCaught) == 0) ){
# ---- Take simple average of efficiency trials: constant over season.
cat("Fewer than 10 trials found or no recaptures. 'ROM+1' estimator used.\n")
# ---- This is MOR, or mean of ratios.
#obs.mean <- mean(obs)
#cat(paste("MOR efficiency= ", obs.mean, "\n"))
# ---- This is ROM = Ratio of means, with bias correction.
obs.mean <- (sum(tmp.df$nCaught)+1) / (sum(tmp.df$nReleased)+1)
cat(paste("'ROM+1' efficiency= ", obs.mean, "\n"))
# ---- If want to use ROM for missing efficiencies only, uncomment the next line.
#df$efficiency[!ind] <- obs.mean
# ---- If, however, you want to use ROM for all days, missing or not, uncomment the next line.
# ---- Fit a model here so we have dispersion statistic, beta, and covar matrix for use in
# ---- bootstrapping later.
fits[[trap]] <- glm(nCaught / nReleased ~ 1, family=binomial, data=tmp.df, weights=tmp.df$nReleased )
df$efficiency <- obs.mean
obs.data[[trap]] <- tmp.df
eff.type[[trap]] <- 2
# ---- Make a design matrix for ease in calculating predictions. Used in bootstrapping.
# ---- Very simple design matrix in this case, since we're only fitting an intercept.
if( length(coef(fits[[trap]])) == 1 ){
#pred <- matrix( coef(fit), sum(ind.inside), 1 )
X <- matrix( 1, sum(ind.inside), 1)
}
# ---- Save X, and the dates at which we predict, for bootstrapping.
all.X[[trap]] <- X
all.dts[[trap]] <- df$batchDate[ind.inside]
} else {
# ---- There are enough observations to estimate B-spline model.
# ---- Fit glm model, increasing degress of freedom, until minimize AIC or something goes wrong.
cat(paste("\n\n++++++Spline model fitting (no covariates) for trap:", trap, "\n Trials are:\n"))
print(tmp.df)
# ---- At least one efficiency trial "inside" for this trap.
# ---- Fit a null model.
fit <- glm( nCaught / nReleased ~ 1, family=binomial, data=tmp.df, weights=tmp.df$nReleased )
fit.AIC <- AIC(fit)
cat(paste("df= ", 1, ", conv= ", fit$converged, " bound= ", fit$boundary, " AIC= ", round(fit.AIC, 4), "\n"))
if( !fit$converged | fit$boundary ){
# ---- Something went wrong with the constant model.
# ---- I don't think this can actually happen because m.i > 10 and sum(nCaught) > 0,
# ---- but I'm adding this clause just in case (maybe something is missing).
# ---- In this case, use ROM+1 estimator.
cat("Constant (intercept-only) logistic model for efficiency failed. Using 'ROM+1' estimator. ")
obs.mean <- (sum(tmp.df$nCaught)+1) / (sum(tmp.df$nReleased)+1)
cat(paste("'ROM+1' efficiency= ", obs.mean, "\n"))
df$efficiency <- obs.mean
fits[[trap]] <- data.frame(nCaught=df$nCaught[ind], nReleased=df$nReleased[ind])
eff.type[[trap]] <- 3
} else {
# ---- Fit increasingly complex models.
# Note, we skip the quadratic:
# df = 3 means cubic model (no internal knots)
# df = 4 means cubic spline w/ 1 internal knot at median
# df = 5 means cubic spline w/ 2 internal knots at 0.33 and 0.66 of range
# etc.
# Subtract 3 from df to get number of internal knots.
cur.df <- 3
repeat{
cur.bspl <- splines::bs( df$batchDate[ind.inside], df=cur.df )
tmp.bs <- cur.bspl[!is.na(df$efficiency[ind.inside]),]
cur.fit <- glm( nCaught / nReleased ~ tmp.bs, family=binomial, data=tmp.df, weights=tmp.df$nReleased )
cur.AIC <- AIC(cur.fit)
cat(paste("df= ", cur.df, ", conv= ", cur.fit$converged, " bound= ", cur.fit$boundary, " AIC= ", round(cur.AIC, 4), "\n"))
cat("\nTrent: you changed AIC stopping rule in eff_model.r. Change back to fit.AIC - 2.\n\n")
if( !cur.fit$converged | cur.fit$boundary | cur.df > max.df.spline | cur.AIC > (fit.AIC + 4) ){
break
} else {
fit <- cur.fit
fit.AIC <- cur.AIC
bspl <- cur.bspl
cur.df <- cur.df + 1
}
}
cat("\nFinal Efficiency model for trap: ", trap, "\n")
print(summary(fit, disp=sum(residuals(fit, type="pearson")^2)/fit$df.residual))
# ---- Make a design matrix for ease in calculating predictions.
if( length(coef(fit)) <= 1 ){
pred <- matrix( coef(fit), sum(ind.inside), 1 )
X <- matrix( 1, sum(ind.inside), 1)
} else {
X <- cbind( 1, bspl )
pred <- X %*% coef(fit)
}
# ---- Save X, and the dates at which we predict, for bootstrapping.
all.X[[trap]] <- X
all.dts[[trap]] <- df$batchDate[ind.inside]
# ---- Standard logistic prediction equation.
# ---- "Pred" is all efficiencies for dates between min and max of trials.
pred <- 1 / (1 + exp(-pred))
# ---- If you want to use observed efficiency on days when efficiency trials were run, uncomment.
#miss.eff.inside <- ind.inside & !ind # missing efficiencies inside first and last trials, sized same as df
#miss.eff <- miss.eff.inside[ind.inside] # missing efficiencies inside first and last trials, sized same as pred
#df$efficiency[miss.eff.inside] <- pred[miss.eff]
# ---- If, however, you want to use the modeled efficiency for all days, even when a trial was done, use these.
df$efficiency[ind.inside] <- pred
# ---- Use the mean of spline estimates for all dates outside efficiency trial season.
mean.p <- mean(pred, na.rm=T)
df$efficiency[!ind.inside] <- mean.p
# ---- Reduce down to the set we know we need. We upped temporally for splining way above.
df <- df[df$batchDate >= min.date.p & df$batchDate <= max.date.p,]
# ---- Save the fit for bootstrapping.
fits[[trap]] <- fit
}
# ---- Save the raw efficiency data.
obs.data[[trap]] <- tmp.df
eff.type[[trap]] <- 4
}
# ---- Uncomment the following line if using imputed value for all days. Otherwise, comment it out,
# ---- and imputed.eff will tell which are observed. With the following uncommented, you can find
# ---- efficiency trials in grand.df with !is.na(grand.df$nReleased).
ind <- rep(F, nrow(df))
df$imputed.eff <- factor( !ind, levels=c(T,F), labels=c("Yes", "No"))
df$trapPositionID <- trap
df$enhanced.eff <- rep("No",nrow(df))
df <- df[,c("TrapPositionID","batchDate","nReleased","nCaught","efficiency","imputed.eff","enhanced.eff","trapPositionID")]
# ---- If we have a fake efficiency trial here, get rid of it now. Flip its data to NA, like any
# ---- other non-eff trial day. I think I need this here as well as above.
if(any(obs.eff.df[obs.eff.df$TrapPositionID == trap,]$thisIsFake == 1 & !is.na(obs.eff.df[obs.eff.df$TrapPositionID == trap,]$thisIsFake))){
df[df$batchDate %in% c(obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$TrapPositionID == trap & !is.na(obs.eff.df$thisIsFake),]$batchDate),]$nReleased <- NA
df[df$batchDate %in% c(obs.eff.df[obs.eff.df$thisIsFake == 1 & obs.eff.df$TrapPositionID == trap & !is.na(obs.eff.df$thisIsFake),]$batchDate),]$nCaught <- NA
}
ans <- rbind(ans, df)
}
}
# ---- With enhanced efficiency models, could have traps in ans with NA entries. This happens because
# ---- we didn't have the covariates this year to fit a trap's enh eff model, and there were no efficiency
# ---- trials at that trap. But, we have catch at that trap. Make sure we have the trap location's pretty
# ---- name by directly throwing unique values in ans against the originating CAMP database.
db <- get( "db.file", envir=.GlobalEnv )
ch <- odbcConnectAccess(db)
newSubsites <- sqlQuery(ch, "SELECT DISTINCT subSiteName, subSiteID FROM SubSite;")
F.sql.error.check(newSubsites)
close(ch)
newSubsites <- newSubsites[newSubsites$subSiteID %in% catch.subsites,]
if(is.factor(newSubsites$subSiteName)){
newSubsites$subSiteName <- as.character(newSubsites$subSiteName)
}
attr(ans,"subsites") <- newSubsites
attr(ans,"site.name") <- site.name
attr(ans,"eff.type") <- eff.type
# ---- Make a plot if called for.
if( !is.na(plot.file) ) {
out.fn <- F.plot.eff.model( ans, plot.file )
} else {
out.fn <- NULL
}
cat("Observed efficiency data used in efficiency models.\n")
print(obs.data)
cat("\n")
ans <- list(eff=ans, fits=fits, X=all.X, ind.inside=all.ind.inside, X.dates=all.dts, obs.data=obs.data, eff.type=eff.type, doOldEff=doOldEff)
attr(ans, "out.fn.list") <- out.fn
ans
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.