# This script creates a list object with all the core estimation subroutines
# all take the same standard input object
####################################
# HANDLING OF LIST OBJECTS SHOULD BE CLEANED UP FOR CONSISTENCY
# model.fit element is created in each of the *.est functions,
# but added to the output list without making it a sub-list.
# should be: ..., model.fit = model.fit, ...
# currently is: ..., model.fit, ...
# Feb 2021 GP ---------------------
# changing return rate model from what it should be to what all the other ones are,
# because implementing the fix means changing it throughout all the models and all the app pieces
#### Generic Linear Model #### NOTE: THIS IS NOT A *GENERALIZED LINEAR MODEL (GLM)* -> GLM is used only for No Age with Covariates model
# used in the following
# * simple sibling regression
# * simple log power sibling regression
# Should work for complex sibreg as well? -> need to discuss
lm.fit <- function(formula.use, data.use){
  # Generic linear-model wrapper: fits lm() and repackages the pieces that the
  # downstream forecasting code uses into one flat list.
  # formula.use: model formula (a character string works too; lm() coerces it)
  # data.use:    data frame; column 1 is the response, remaining cols are predictors
  # NOTE(review): this function masks stats::lm.fit(); unqualified calls in this
  # file resolve to this wrapper.
  fitted.model <- lm(formula = formula.use, data = data.use)
  model.summary <- summary(fitted.model)
  list(coefficients       = fitted.model$coefficients,
       coefficients.table = model.summary$coefficients,
       residuals          = fitted.model$residuals,
       fit.obj            = fitted.model,
       obs.values         = data.use[, 1],
       fitted.values.raw  = fitted.model$fitted.values,
       data               = data.use[, 2:dim(data.use)[2]],
       sigma              = model.summary$sigma,
       df                 = model.summary$df,
       fstatistic         = model.summary$fstatistic,
       r.squared          = model.summary$r.squared,
       adj.r.squared      = model.summary$adj.r.squared)
}#END lm.fit
#### Generic Naive Fit ####
naive.fit <- function(data.use, avg.yrs = 3, method = "classic"){
  # Naive ("running average") model fit.
  # data.use: named vector of abundances, names are run years. Assumes the data
  #           has already been checked for missing years by naive.datacheck().
  # avg.yrs:  running-average window (1 = same as last year, 3 = avg of last 3 years)
  # method:   "classic" = fitted values are the running averages, residuals from there
  #           "lm" / "fcpack" = conceptual placeholders (warn, return NA fits)
  # Returns a flat list (coefficients / obs.values / fitted.values.raw / data /
  # residuals / run.yrs) matching the structure of the other *.fit functions.
  yrs.in <- as.numeric(names(data.use))
  # Past-looking running average. FIX: the argument is "sides"; the original
  # "side=" worked only via partial argument matching.
  rng.avg.vals <- stats::filter(data.use, filter = rep(1/avg.yrs, avg.yrs), sides = 1)
  # the avg over years (y-avg.yrs+1)..y becomes the fitted value FOR year y+1
  names(rng.avg.vals) <- yrs.in + 1
  # only keep years that have a running avg available from the year before
  yrs.out <- yrs.in[yrs.in %in% (yrs.in + avg.yrs)]
  filter.coeff <- avg.yrs
  names(filter.coeff) <- "filter.coeff"
  if(method == "classic"){
    fits.out <- list(coefficients = filter.coeff,
                     obs.values = data.use[as.character(yrs.out)],
                     fitted.values.raw = rng.avg.vals[as.character(yrs.out)],
                     data = data.use,
                     residuals = data.use[as.character(yrs.out)] - rng.avg.vals[as.character(yrs.out)],
                     run.yrs = yrs.out)
  } else if(method == "lm"){
    fits.out <- list(obs.values = NA, fitted.values.raw = NA, data = data.use, residuals = NA,
                     run.yrs = yrs.out)
    warning("naive.fit with method = lm not implemented yet")
  } else if(method == "fcpack"){
    fits.out <- list(obs.values = NA, fitted.values.raw = NA, data = data.use, residuals = NA,
                     run.yrs = yrs.out)
    warning("naive.fit with method = fcpack not implemented yet")
  } else {
    # FIX: previously an unknown method fell through and crashed later with
    # "object 'fits.out' not found"; fail with a clear message instead
    stop("naive.fit: unknown method '", method, "'")
  }
  return(fits.out)
} # end naive.fit
# NAIVE FIT
# NOTE: This was built completely separately from old code n1, n3, n5 (withage)
# extensive cross-checking required !!!!!!!
naive.datacheck <- function(model.data, tracing = FALSE){
  # Data check for the naive model -- placeholder implementation.
  # Open questions (to test/confirm):
  #   NA values a problem?     -> probably not, needs testing
  #   Missing years a problem? -> YES, need to discuss and implement a fix (insert NA values?)
  #   Zero values a problem?   -> probably not, needs testing
  if(tracing){ print("Starting naive.datacheck() - Placeholder only for now") }
  # placeholder: just report the data range
  range(model.data)
} # end naive.datacheck
naive.est <- function(model.data, avg.yrs, tracing = FALSE){
  # Run one instance of the naive estimation.
  # model.data: named abundance vector, format as documented in naive.fit().
  # avg.yrs:    running-average window passed through to naive.fit().
  # Alternative methods ("lm", "fcpack") are deliberately not exposed here yet;
  # they need more testing and discussion first.
  if(tracing){ print("Starting naive.est()") }
  model.fit <- naive.fit(data.use = model.data, avg.yrs = avg.yrs, method = "classic")
  header <- list(model.type = "Naive",
                 formula = paste("y = avg(y in", avg.yrs, "previous years)"),
                 var.names = "abd",
                 est.fn = "classic")
  c(header, model.fit, list(fitted.values = model.fit$fitted.values.raw))
} # end naive.est
naive.pt.fc <- function(fit.obj = NULL, data, settings = NULL){
  # Point forecast for the naive model: just average the years fed in by sub.fcdata().
  # fit.obj / settings are unused; kept for interface consistency with the other pt.fc fns.
  # data = vector of N years, as pre-filtered by the sub.fcdata() subroutine.
  # NOTE: NA inputs are dropped (na.rm = TRUE), so the pt fc is NA/NaN only when
  # ALL inputs are NA (the old comment claiming "any NA -> NA" was wrong).
  # How to get prediction intervals for naive? See https://github.com/avelez-espino/forecastR_phase4/issues/125
  # Shouldn't this use predict() and fitobj? See https://github.com/MichaelFolkes/forecastR_package/issues/5
  if(length(data) == 0){
    # FIX: previously crashed with "object 'pt.fc.out' not found" on empty input
    warning("naive.pt.fc: no data values supplied")
    return(rep(NA_real_, 3))
  }
  if(length(data) > 1){
    pt.fc.out <- c(mean(data, na.rm = TRUE),
                   unlist(quantile(data, probs = c(0.1, 0.9), na.rm = TRUE)))
  } else {
    # single value: point fc is that value, with an ad-hoc +/-50% interval
    pt.fc.out <- mean(data, na.rm = TRUE)
    pt.fc.out <- c(pt.fc.out, pt.fc.out * c(0.5, 1.5))
  }
  return(pt.fc.out)
} # end naive.pt.fc
# Merge object
naive.list <- list(estimator = naive.est, datacheck= naive.datacheck, pt.fc =naive.pt.fc )
#### MECHANISTIC (Return RATE) ####
rate.datacheck <- function(data.use, pred.label = NULL, tracing = FALSE){
  # Data check for the return-rate model: confirm a predictor column exists
  # and that Run_Year has no gaps.
  # Open questions: NA values probably OK (needs testing); missing years maybe;
  # zero values in the denominator are definitely a problem!
  if(tracing){ print("Starting rate.datacheck()") }
  if(is.null(pred.label)){
    # no predictor named: require at least one "Pred_" column
    pred.check <- sum(grepl("Pred_", names(data.use))) > 0
  } else {
    pred.check <- pred.label %in% names(data.use)
  }
  all.yrs <- min(data.use$Run_Year):max(data.use$Run_Year)
  yrs.check <- sum(!(all.yrs %in% data.use$Run_Year)) == 0
  list(var.check = pred.check,
       yrs.check = yrs.check,
       Predictor = paste("User-selected predictor variable in data set:", pred.check),
       Years = paste("Complete years:", yrs.check))
}#END rate.datacheck
rate.est <- function(data.use, avg="wtmean", pred.label = NULL, last.n = NULL){
  # Mechanistic return-rate model: forecast = predictor * average return rate.
  # data.use:   data frame, col 1 = run year, col 2 = abundance, remaining cols = Pred_*
  # avg:        type of average for the rate ("wtmean", "mean", "median", "geomean")
  # pred.label: column label of the predictor variable. If NULL, the first
  #             "Pred_" column is used (put this in the documentation).
  # last.n:     number of most recent years used for the rate calc; NULL = all years
  data.orig <- data.use # keep the full data for fitted values / residuals below
  if(is.null(pred.label)){
    pred.label <- names(data.use)[min(grep("Pred_", names(data.use)))]
  }
  last.year <- max(data.use[[1]])
  # year filter as per https://github.com/MichaelFolkes/forecastR_package/issues/15
  if(!is.null(last.n)){ data.use <- data.use[data.use[[1]] > (last.year - last.n), ] }
  data.use$rate <- data.use[[2]] / data.use[[pred.label]]
  if(avg == "wtmean"){
    data.use <- na.omit(data.use)
    rate.use <- sum(data.use[[2]]) / sum(data.use[[pred.label]])
  }
  if(avg == "mean"){ rate.use <- mean(data.use$rate, na.rm=TRUE) }
  if(avg == "median"){ rate.use <- median(data.use$rate, na.rm=TRUE) }
  # see https://github.com/MichaelFolkes/forecastR_package/issues/11
  if(avg == "geomean"){
    # keep strictly positive rates; drops NA rows too (same as dplyr::filter did)
    data.use <- data.use[!is.na(data.use$rate) & data.use$rate > 0, , drop = FALSE]
    # BUG FIX: na.rm is not an argument of log() -- it was previously passed
    # there, which errors with "unused argument"; it belongs to mean()
    rate.use <- exp(mean(log(data.use$rate), na.rm=TRUE))
  }
  # 10th/90th percentile rates -> used for the prediction interval in rate.pt.fc()
  if(dim(data.use)[1] > 1){
    lower.rate.use <- quantile(data.use$rate, prob=0.1)
    upper.rate.use <- quantile(data.use$rate, prob=0.9)
  }
  if(dim(data.use)[1] == 1){
    # single year: ad-hoc +/-50% interval
    lower.rate.use <- rate.use * 0.5
    upper.rate.use <- rate.use * 1.5
  }
  fits <- data.orig[[pred.label]] * rate.use
  model.fit <- list(coefficients = rate.use,
                    lower.coeff = lower.rate.use,
                    upper.coeff = upper.rate.use,
                    obs.values = data.orig[[2]],
                    fitted.values = fits,
                    data = data.orig,
                    data.used = data.use,
                    residuals = data.orig[[2]] - fits)
  results <- c(list(model.type = "ReturnRate",
                    formula = paste0(names(data.orig)[2], "* return rate based on last", last.n, "yrs of", pred.label),
                    var.names = pred.label,
                    est.fn = paste0(avg, " of (rate[last", last.n, "yrs])"),
                    fitted.values = fits,
                    obs.values = data.orig[[2]],
                    residuals = data.orig[[2]] - fits,
                    run.yrs = data.orig[[1]],
                    num.obs.used = sum(!is.na(data.use$rate))),
               model.fit)
  return(results)
}#END rate.est
rate.pt.fc <- function(fit.obj=NULL, data, settings=NULL){
  # Point forecast for the return-rate model: predictor value * estimated rate.
  # fit.obj:  object created from fitModel(); supplies the rate coefficient plus
  #           its lower/upper bounds (those are computed in rate.est()).
  # data:     value of the predictor variable.
  # settings: unused; kept for interface consistency with the other pt.fc functions.
  # Prediction-interval discussion: https://github.com/MichaelFolkes/forecastR_package/issues/12
  pt.fc.out <- c(data * fit.obj$coefficients,
                 data * fit.obj$lower.coeff,
                 data * fit.obj$upper.coeff)
  names(pt.fc.out) <- c("Point", "Lower", "Upper")
  pt.fc.out
} #END rate.pt.fc
# Merge object
rate.list <- list(estimator = rate.est, datacheck= rate.datacheck, pt.fc =rate.pt.fc )
#### SIMPLE SIBLING REGRESSION ####
# try to use (or build on) the following functions from old code:
# datafile_extract_age_class() -> incorporated into prepData() with out.labels="v2"
# prepare_data_and_model_formulas()
# sibling_regression_model_fits()
sibreg.simple.datacheck <- function(model.data, tracing = FALSE){
  # Data check for simple sibling regression -- placeholder implementation.
  # Still to do: a formal check that the run-year columns of ageclass_y and
  # ageclass_x line up (true for the sample dataset, not yet verified in general).
  # NA / missing-year / zero values: probably not a problem, needs testing.
  if(tracing){ print("Starting sibreg.simple.datacheck() - Placeholder only for now") }
  # placeholder: just report the data range
  range(model.data)
} # end sibreg.simple.datacheck
sibreg.simple.est <- function(model.data, settings=NULL, tracing=FALSE){
  # One instance of the simple sibling regression.
  # model.data columns (col 1 is always the response, for consistency with the
  # covariate models):
  #   col 1 = Age X by run year
  #   col 2 = Age X-1 by run year, run year offset by 1
  #           (e.g. if col 1 is Age6_2008, col 2 has to be Age5_2007)
  # Column names can be any valid R names; outputs (e.g. coefficients) are
  # labelled accordingly.
  # settings does nothing here, but is kept because the generic calls to the
  # alternative SibReg models share one signature and the Kalman version needs it.
  # NOTE: the regression is forced through the origin ("y ~ -1 + x").
  if(tracing){ print("Starting sibreg.simple.est()") }
  sibreg.formula <- paste(names(model.data)[1], " ~ -1 + ",names(model.data)[2])
  model.fit <- lm.fit(formula.use = sibreg.formula, data.use = model.data)
  header <- list(model.type = "SibRegSimple",
                 formula = sibreg.formula,
                 var.names = names(model.data)[2],
                 est.fn = "lm()")
  c(header, model.fit, list(fitted.values = model.fit$fitted.values.raw))
} # end sibreg.simple.est
sibreg.pt.fc <- function(fit.obj, data, settings=NULL){
  # Point forecast for simple sibling regression (80% prediction interval).
  # fit.obj = object created from fitModel()
  # data    = data frame with one element of the list created by sub.fcdata()
  # Temporary patch: reaching into fit.obj$fit.obj should not be necessary;
  # list-object handling between sibreg.simple.est() and this fn needs cleanup.
  # FIX: call the predict() generic (which dispatches to predict.lm) rather
  # than calling the S3 method directly.
  pt.fc <- predict(fit.obj$fit.obj, newdata = data, interval = "prediction", level = 0.8)
  return(pt.fc)
} # end sibreg.pt.fc
# Merge object
sibreg.simple.list <- list(estimator = sibreg.simple.est, datacheck= sibreg.simple.datacheck , pt.fc = sibreg.pt.fc)
#### COMPLEX SIBLING REGRESSION ####
# IMPORTANT NOTE: Noage Covar model (below) uses some of the same structure to set up alt model forms and select a best model
# Any changes here need to be implemented there as well!
sibreg.complex.datacheck <- function(model.data, tracing=FALSE){
  # Data check for complex sibling regression -- placeholder implementation.
  # Still to do: a formal check that the run-year columns of ageclass_y and
  # ageclass_x line up (true for the sample dataset, not yet verified in general).
  # NA / missing-year / zero values: probably not a problem, needs testing.
  # FIX: the tracing message previously named the wrong function
  # ("sibreg.simple.datacheck" -- a copy/paste leftover).
  if(tracing){ print("Starting sibreg.complex.datacheck() - Placeholder only for now") }
  # placeholder: just report the data range
  tmp.out <- range(model.data)
  return(tmp.out)
}#END sibreg.complex.datacheck
# OLD ATTEMPT
#sibreg.complex.est <- function(model.data,tracing=FALSE){
# do the estimation (1 instance)
# model.data needs to have the following columns:
# Column 1 = Age X by run year
# Column 2 = Age X-1 by run year, with run year offset by 1
# Using the convention that first col always has the reponse var, in order to set up consistency with covariate models
# For example if Col 1 is Age6_2008, then Col 2 has to be Age5_2007
# column names can be any valid R column name. Outputs (e.g. coefficients) will be labelled accordingly
# NOTE: This fits a linear regression through the origin
# (i.e. intercept set to 0 by specifying "y ~ -1 + x")
#if(tracing){print("Starting sibreg.simple.est()")}
#
#sibreg.formula <- paste(names(model.data)[1], " ~ -1 + ",names(model.data)[2])
#
#model.fit <- lm.fit(formula.use = sibreg.formula ,data.use=model.data)
#return(c(list(model.type = "SibRegSimple",formula=sibreg.formula,var.names = names(model.data)[2], est.fn = "lm()"),
# model.fit,list(fitted.values = model.fit$fitted.values.raw) ))
#}#END sibreg.complex.est
# NEW VERSION WITH BUILT-IN MODEL SELECTION
sibreg.complex.est <- function(X, settings = list(tol.AIC = 0.75,
                                                  tol.r.sq = 0.02,
                                                  base.eq ="Age_4 ~ -1 + Age_3",
                                                  incl.base.eq = TRUE),
                               tracing=FALSE){
  # Complex sibling regression with built-in model selection over covariates.
  # X            data frame with Run_Year, Brood_Year, the age columns named in
  #              base.eq (e.g. Age_4, Age_3), and covariates named "Cov_LABEL"
  # tol.AIC      AIC tolerance: shortlist any model with >= x prob that it
  #              "minimizes information loss" relative to the best-AIC model
  # tol.r.sq     Rsq tolerance: shortlist any model with adj.r.sq within tol of the max
  # base.eq      simple sibling regression equation (e.g. "Age_4 ~ -1 + Age_3")
  # incl.base.eq if TRUE, also include the plain sibreg in the model ranking
  #              (i.e. when running this function outside the app)
  # IMPORTANT: the Noage Covar model shares this selection structure; changes
  # here need to be mirrored there.
  #
  # MODEL SELECTION STEP: build candidate covariate combinations
  covars.list <- names(X)[grepl("Cov_",names(X))]
  covars.combos <- covars.list # individual covars
  if(length(covars.list)>=2){
    covars.combos <- c(covars.combos,
                       combn(covars.list,2) %>% apply(MARGIN = 2, paste,collapse = " + "), # pairs
                       combn(covars.list,2) %>% apply(MARGIN = 2, paste,collapse = " * ")  # pairs with interaction
    )
  }
  if(length(covars.list)>=3){
    covars.combos <- c(covars.combos,
                       combn(covars.list,3) %>% apply(MARGIN = 2, paste,collapse = " + "), # triples
                       paste("( ", combn(covars.list,3) %>% apply(MARGIN = 2, paste,collapse = " + ")," )^2") # triples with pairwise interaction
    )
  }
  # BUG FIX: this previously used settings$eq.base (which is NULL, so the
  # sibling age term was silently dropped from every covariate candidate);
  # the list element is settings$base.eq
  if(settings$incl.base.eq){ eq.list <- c(settings$base.eq, paste(settings$base.eq, covars.combos, sep=" + ")) }
  if(!settings$incl.base.eq){ eq.list <- paste(settings$base.eq, covars.combos, sep=" + ") }
  # covariate part of each candidate ("" = base equation alone), kept aligned
  # with eq.list so indexing by the selected row is valid
  covars.part <- covars.combos
  if(settings$incl.base.eq){ covars.part <- c("", covars.combos) }
  diy.out <- data.frame(ID = 1:length(eq.list), equ = eq.list, numCoeff = NA, adj.r.sq = NA, AIC = NA)
  for(i in 1:length(eq.list)){
    fit.tmp <- lm(eq.list[i], X)
    diy.out$adj.r.sq[i] <- round(summary(fit.tmp)$adj.r.squared,3)
    diy.out$AIC[i] <- round(AIC(fit.tmp),3)
    diy.out$numCoeff[i] <- length(fit.tmp$coefficients)
  }
  diy.out <- diy.out %>%
    mutate(diffAIC = min(AIC)-AIC) %>%
    mutate(probAIC = exp(diffAIC/2)) %>%
    mutate(rankAIC = rank(AIC,ties.method="min"),rankRsq = rank(-adj.r.sq,ties.method="min")) %>%
    mutate(shortAIC = probAIC >= settings$tol.AIC, shortRsq = adj.r.sq >= (max(adj.r.sq) - settings$tol.r.sq) ) %>%
    mutate(shortBoth = shortAIC & shortRsq)
  # Selection rules (shortBoth = on the shortlist for both AIC and R2):
  # 1) exactly one shortBoth=TRUE -> pick it
  # 2) several shortBoth=TRUE -> pick the highest adj R2 among them (not min # coeff!)
  # 3) none -> among shortAIC=TRUE, pick the highest adj R2
  diy.out$selected <- FALSE
  n.shortboth <- sum(diy.out$shortBoth)
  if(n.shortboth == 1){ diy.out$selected[diy.out$shortBoth] <- TRUE }
  if(n.shortboth > 1){ diy.out$selected[diy.out$shortBoth & diy.out$rankRsq == min(diy.out$rankRsq[diy.out$shortBoth]) ] <- TRUE }
  if(n.shortboth == 0){ diy.out$selected[diy.out$shortAIC & diy.out$rankRsq == min(diy.out$rankRsq[diy.out$shortAIC]) ] <- TRUE }
  # FITTING STEP (refit the selected model via the generic lm.fit wrapper,
  # dropping the two year columns from the model data)
  eq.use <- diy.out$equ[diy.out$selected]
  model.fit <- lm.fit(formula.use = eq.use, data.use = X %>% select(-Run_Year, -Brood_Year))
  # OUTPUT
  return(c(list(model.type = "SibRegComplex",
                formula = eq.use,
                # BUG FIX: previously indexed covars.combos[diy.out$selected],
                # which is misaligned with eq.list when incl.base.eq prepends
                # an extra candidate; covars.part is aligned by construction
                var.names = paste(paste(names(X)[grepl("Age_",names(X))],collapse = ","), covars.part[diy.out$selected],sep = ","),
                est.fn = "lm()"),
           model.fit,
           list(fitted.values = model.fit$fitted.values.raw),
           list(model.selection = diy.out))
  )
} # end sibreg.complex.est
sibreg.complex.pt.fc <- function(fit.obj, data, settings = NULL){
  # Point forecast for complex sibling regression (80% prediction interval).
  # fit.obj = object created from fitModel()
  # data    = data frame with one element of the list created by sub.fcdata()
  # Temporary patch: reaching into fit.obj$fit.obj should not be necessary;
  # list-object handling between sibreg.complex.est() and this fn needs cleanup.
  # FIX: call the predict() generic (which dispatches to predict.lm) rather
  # than calling the S3 method directly.
  pt.fc <- predict(fit.obj$fit.obj, newdata = data, interval = "prediction", level = 0.8)
  return(pt.fc)
}#END sibreg.complex.pt.fc
# Merge object
# Bundle the complex sibling-regression subroutines into the standard 3-element
# interface (estimator / datacheck / pt.fc) used by the generic model-fitting code.
sibreg.complex.list <- list(estimator = sibreg.complex.est,
datacheck= sibreg.complex.datacheck ,
pt.fc = sibreg.complex.pt.fc)
#--------------------------------------------------------
#### KALMAN FILTER SIBLING REGRESSION ####
calc_multivariatePI <- function(a.vec, var.vec, n=1000){
  # Monte-Carlo prediction interval for a set of normal estimates:
  # draw n samples from N(a_i, var_i) for each i, average across the i's within
  # each draw, and report the 10/50/90 percentiles of those cross-means.
  # NOTE(review): stochastic -- results vary unless the caller seeds the RNG.
  params <- data.frame(a=a.vec, var=var.vec, n)
  draws <- apply(params, 1, function(p){
    rnorm(p['n'], mean = p['a'], sd = sqrt(p['var']))
  })
  # draws is an (n x length(a.vec)) matrix: one column per estimate
  pooled.means <- rowMeans(draws)
  pi.est <- quantile(pooled.means, probs = c(0.1, 0.5, 0.9))
  list(params.original=params, pi.estimates=pi.est)
}#END calc_multivariatePI
sibreg.kalman.datacheck <- function(model.data,tracing=FALSE){
  # Data check for the Kalman-filter sibling regression -- placeholder.
  # NA / missing-year / zero values: all still need to be thought through and tested.
  if(tracing){print("Starting sibreg.kalman.datacheck() - Placholder only for now")}
  # CONSISTENCY FIX: every other *.datacheck placeholder returns the data range;
  # this one previously returned NULL (the value of the if() above).
  tmp.out <- range(model.data)
  return(tmp.out)
} # end sibreg.Kalman.datacheck
sibreg.kalman.est <- function(model.data,settings=list(int.avg=5),tracing=FALSE){
  # Kalman-filter sibling regression (one instance), using functions provided by
  # Carrie Holt and adapting her example code; notation kept consistent with it.
  # model.data:       col 1 = response age class, col 2 = predictor age class
  # settings$int.avg: number of years to average for the "recent intercept"
  # NOTE: initial values for ln.sig.e and ln.sig.w were changed from log(1) to 10,
  # and var.a from 1 to 10, based on trial and error.
  #
  # Robust default handling for int.avg.
  # BUG FIX: the old check `if(is.na(settings$int.avg))` crashed with
  # "argument is of length zero" when settings was a non-NULL list without an
  # int.avg element, because is.na(NULL) is logical(0).
  if(is.null(settings)){ settings <- list() }
  if(is.null(settings$int.avg) || is.na(settings$int.avg)){ settings$int.avg <- 5 } # GUI may pass NA -> same default
  if(tracing){print("Starting sibreg.kalman.est()")}
  x <- model.data[,2]
  y <- model.data[,1]
  # Set up initial parameters (Ts=1, the remaining initials can be adjusted);
  # fit the simple regression once and reuse both coefficients (was fitted twice)
  lm.init <- lm(y~x)$coef
  initial <- list(lm.init[2], 10, 10, lm.init[1], 10, 1)
  names(initial) <- c("b", "ln.sig.e", "ln.sig.w", "mean.a", "var.a", "Ts")
  output <- kf.rw(initial=initial,x=x,y=y)
  # average the smoothed intercept over the last int.avg years
  num.a <- length(output$smoothe.mean.a)
  a.vec <- output$smoothe.mean.a[(num.a-settings$int.avg+1):num.a]
  var.vec <- output$smoothe.var.a[(num.a-settings$int.avg+1):num.a]
  # Monte-Carlo 10/90 interval for the averaged intercept
  a.pi <- calc_multivariatePI(a.vec,var.vec,n=1000)
  a.lower <- a.pi$pi.estimates[1]
  a.upper <- a.pi$pi.estimates[3]
  a.out <- mean(a.vec)
  coef.kf <- c( a.out , output$b, a.lower,a.upper)
  names(coef.kf) <- c("a.kf","b","a.lower","a.upper")
  fitted.kf <- output$smoothe.mean.a + output$b * x
  model.fit <- list( coefficients= coef.kf ,coefficients.table = coef.kf ,
                     residuals= y - fitted.kf , # obs - fitted, consistent with the other models
                     fit.obj = list(coefficients = coef.kf, output),
                     obs.values = model.data[,1] ,fitted.values.raw = fitted.kf,
                     data = model.data[,2:dim(model.data)[2]],
                     sigma = NA, df = NA , fstatistic = NA,
                     r.squared = NA , adj.r.squared = NA
  )
  return(c(list(model.type = "SibRegKalman",formula=NA,var.names = names(model.data)[2], est.fn = "kf.rw()"),
           model.fit,list(fitted.values = fitted.kf)))
} # end sibreg.Kalman.est
sibreg.kalman.pt.fc <- function(fit.obj, data, settings=NULL){
  # Point forecast for the Kalman sibling regression:
  #   point = a.kf + b * data, bounds from the a.lower / a.upper intercepts.
  # fit.obj = object created from fitModel()
  # data    = data frame with one element of the list created by sub.fcdata()
  # unlist() is needed (reason unclear) -- without it the storage matrix in
  # sub.pt.fc() gets messed up.
  # TODO: proper prediction intervals from the a and b mean and var.
  coefs <- fit.obj$coefficients
  pt.fc <- c(unlist(coefs["a.kf"]    + coefs["b"] * data),
             unlist(coefs["a.lower"] + coefs["b"] * data),
             unlist(coefs["a.upper"] + coefs["b"] * data))
  return(pt.fc)
} # end sibreg.pt.fc
sibreg.kalman.list <- list(estimator = sibreg.kalman.est, datacheck= sibreg.kalman.datacheck, pt.fc = sibreg.kalman.pt.fc)
#### SIMPLE LOG POWER SIBLING REGRESSION ####
# try to use (or build on) the following functions from old code:
# datafile_extract_age_class() THIS FUNCTION IS ALSO IN SIMPLE SIB REG -> CHECK IF SAME
# prepare_data_and_model_formulas() THIS FUNCTION IS ALSO IN SIMPLE SIB REG -> CHECK IF SAME
# fit function
logpower.simple.datacheck <- function(model.data, tracing=FALSE){
  # Data check for the log-power sibling regression -- placeholder implementation.
  # NA / missing-year values: probably not a problem, needs testing.
  # Zero values ARE a problem (log of 0) -> need WG agreement on a replacement
  # value (1 fish? 0.1 fish? NA?) and its pull on the fit.
  if(tracing){print("Starting logpower.simple.datacheck() - Placholder only for now")}
  # placeholder: just report the data range
  range(model.data)
} # end logpower.simple.datacheck
logpower.simple.est <- function(model.data, settings = NULL, tracing=FALSE){
  # One instance of the log-power sibling regression: lm() on log-transformed
  # response and predictor.
  # model.data columns (col 1 is always the response, for consistency with the
  # covariate models):
  #   col 1 = Age X by run year
  #   col 2 = Age X-1 by run year, run year offset by 1
  #           (e.g. if col 1 is Age6_2008, col 2 has to be Age5_2007)
  # Column names can be any valid R names; outputs are labelled accordingly.
  # settings does nothing here, but is kept because the generic calls to the
  # alternative SibReg models share one signature and the Kalman version needs it.
  # NOTE: the regression is NOT forced through the origin ("y ~ 1 + x"; the +1
  # is just for notational consistency with the simple sib reg model).
  # Returned fitted.values are back-transformed to the raw scale via exp().
  if(tracing){print("Starting logpower.simple.est()")}
  logpower.formula <- paste("log(",names(model.data)[1], ") ~ 1 + log(",names(model.data)[2],")") # +1
  model.fit <- lm.fit(formula.use = logpower.formula, data.use = model.data)
  header <- list(model.type = "SibRegLogPower",
                 formula = logpower.formula,
                 var.names = names(model.data)[2],
                 est.fn = "lm()")
  c(header, model.fit, list(fitted.values = exp(model.fit$fitted.values.raw)))
} # end logpower.simple.est
logpower.pt.fc <- function(fit.obj, data, settings = NULL){
  # Point forecast for the log-power sibling regression: predict on the log
  # scale (80% prediction interval), then back-transform with the Sprugel (1983)
  # bias correction.
  # fit.obj = object created from fitModel()
  # data    = data frame with one element of the list created by sub.fcdata()
  # Temporary patch: reaching into fit.obj$fit.obj should not be necessary;
  # list-object handling between logpower.simple.est() and this fn needs cleanup.
  # FIX: call the predict() generic rather than the S3 method predict.lm() directly.
  pt.fc.raw <- predict(fit.obj$fit.obj, newdata = data, interval = "prediction", level = 0.8)
  # Bias correction as per Sprugel (1983): sigma2.mle / 2, with
  #   sigma2.mle = sigma.ols^2 * (n-2)/n
  # (adapted from the earlier ForecastR code, which used nrow(model.frame(model)) for n)
  n <- length(fit.obj$fitted.values)
  bias.correction <- (summary(fit.obj$fit.obj)$sigma^2 * ((n-2)/n)) / 2
  pt.fc <- exp(pt.fc.raw + bias.correction) # back from the log scale, bias-corrected
  return(pt.fc)
} # end logpower.pt.fc
# Merge object
logpower.simple.list <- list(estimator = logpower.simple.est, datacheck= logpower.simple.datacheck,pt.fc =logpower.pt.fc )
#### ARIMA ####
arima.datacheck <- function(model.data, tracing=FALSE){
  # Data check for the ARIMA model -- placeholder implementation.
  # NA / zero values: probably not a problem, needs testing.
  # Missing years ARE a problem: auto.arima() assumes equally spaced values
  # with no missing intervals.
  if(tracing){print("Starting arima.datacheck() - Placholder only for now")}
  # placeholder: just report the data range
  range(model.data)
} # end arima.datacheck
arima.est <- function(model.data, settings=list(BoxCox=FALSE), tracing=FALSE){
  # One instance of the ARIMA fit (univariate time series) via forecast::auto.arima().
  # settings$BoxCox: if TRUE, estimate a Box-Cox lambda first (Guerrero method).
  if(tracing){print("Starting arima.est()")}
  lambda.use <- NULL
  if(settings$BoxCox){ lambda.use <- forecast::BoxCox.lambda(model.data, method="guerrero") }
  model.fit <- forecast::auto.arima(model.data, allowmean=TRUE, allowdrift=FALSE, lambda=lambda.use)
  header <- list(model.type = "TimeSeriesArima", formula = NA, var.names = NA, est.fn = "auto.arima()")
  c(header,
    model.fit,                 # NOTE: flattens the Arima object into the list (known inconsistency, see file preamble)
    list(fit.obj = model.fit), # the exact fit object is kept as well -- the fc step needs it
    list(obs.values = model.data, fitted.values = as.numeric(fitted(model.fit))))
} # end arima.est
arima.pt.fc <- function(fit.obj, data, settings=list(BoxCox=FALSE)){
  # Point forecast (h=1) plus 80% interval for the ARIMA model.
  # fit.obj = object created from fitModel(); the refit object lives in
  #           fit.obj$fit.obj (temporary patch -- list handling between
  #           arima.est() and this fn should be cleaned up).
  # data    = only used to re-estimate the Box-Cox lambda when settings$BoxCox=TRUE
  #           (otherwise not needed: the series is captured in the auto.arima output).
  # Interval background:
  # https://stats.stackexchange.com/questions/155305/how-does-r-calculate-prediction-intervals-in-the-forecast-package
  # predict.Arima does not use a lambda argument, so the forecast is done via
  # forecast::forecast() (same approach as the exp smoothing fc via ets()),
  # which handles the Box-Cox back-transform and bias adjustment internally.
  lambda.use <- NULL
  if(settings$BoxCox){ lambda.use <- forecast::BoxCox.lambda(data, method="guerrero") }
  fc.out <- forecast::forecast(fit.obj$fit.obj, h=1, lambda=lambda.use, biasadj=TRUE, level=80)
  as.numeric(c(fc.out$mean, fc.out$lower, fc.out$upper))
} # end arima.pt.fc
# Merge object
arima.list <- list(estimator = arima.est, datacheck= arima.datacheck,pt.fc =arima.pt.fc )
#### EXP SMOOTHING ####
expsmooth.datacheck <- function(model.data, tracing=FALSE){
  # Data check for the exponential smoothing model -- placeholder implementation.
  # NA / zero values: probably not a problem, needs testing.
  # Missing years ARE a problem: ets() assumes equally spaced values with no
  # missing intervals.
  if(tracing){print("Starting expsmooth.datacheck() - Placholder only for now")}
  # placeholder: just report the data range
  range(model.data)
} # end expsmooth.datacheck
expsmooth.est <- function(model.data, settings=list(BoxCox=FALSE), tracing=FALSE){
  # One instance of the exponential smoothing fit (univariate time series)
  # via forecast::ets() with automatic model selection ("ZZZ").
  # settings$BoxCox: if TRUE, estimate a Box-Cox lambda first (Guerrero method).
  if(tracing){print("Starting expsmooth.est()")}
  lambda.use <- NULL
  if(settings$BoxCox){ lambda.use <- forecast::BoxCox.lambda(model.data, method="guerrero") }
  model.fit <- forecast::ets(model.data, model="ZZZ", lambda=lambda.use)
  header <- list(model.type = "TimeSeriesExpSmooth", formula = NA, var.names = NA, est.fn = "ets()")
  c(header,
    model.fit,                 # NOTE: flattens the ets object into the list (known inconsistency, see file preamble)
    list(fit.obj = model.fit), # the exact fit object is kept as well -- forecast() needs it
    list(obs.values = model.data, fitted.values = as.numeric(fitted(model.fit))))
} # end expsmooth.est
expsmooth.pt.fc <- function(fit.obj, data, settings=list(BoxCox=FALSE)){
  # Point forecast (h=1) plus 80% interval for the exponential smoothing model.
  # fit.obj = object created from fitModel(); the refit object lives in
  #           fit.obj$fit.obj (temporary patch -- list handling between
  #           expsmooth.est() and this fn should be cleaned up).
  # data    = only used to re-estimate the Box-Cox lambda when settings$BoxCox=TRUE
  #           (otherwise not needed: the series is captured in the ets() output).
  # Interval background:
  # https://stats.stackexchange.com/questions/155305/how-does-r-calculate-prediction-intervals-in-the-forecast-package
  # forecast() handles the Box-Cox back-transform and bias adjustment internally.
  lambda.use <- NULL
  if(settings$BoxCox){ lambda.use <- forecast::BoxCox.lambda(data, method="guerrero") }
  fc.out <- forecast::forecast(fit.obj$fit.obj, h=1, lambda=lambda.use, biasadj=TRUE, level=80)
  as.numeric(c(fc.out$mean, fc.out$lower, fc.out$upper))
} # end expsmooth.pt.fc
# Bundle the exponential smoothing subroutines into the standard 3-element
# interface (estimator / datacheck / pt.fc) consumed by estimation.functions.
expsmooth.list <- list(estimator = expsmooth.est,
                       datacheck = expsmooth.datacheck,
                       pt.fc     = expsmooth.pt.fc)
#----------------------------------------------------------------------------------
#### NoAge Covar Model (no age classes, but using covariates to predict total return) ####
# IMPORTANT NOTE: Complex Sibling regression model (above) uses some of the same structure to set up alt model forms and select a best model
# Any changes here need to be implemented there as well!
noage.covar.datacheck <- function(model.data,tracing=FALSE){
  # Verify the inputs required by the no-age covariate model are present.
  # model.data  list with $data (must contain a Total element) and
  #             $covariates (data frame whose first column is the year index).
  # Returns presence flags for both pieces plus the covariate names.
  # Open questions: NA values / missing years / zero values are probably fine,
  # but this still needs testing.
  if(tracing){print("Starting noage.covar.datacheck()")}
  chk.out <- list(
    total.check = !is.null(model.data$data$Total),
    covar.check = !is.null(model.data$covariates),
    covars      = names(model.data$covariates)[-1]  # drop the leading year column
  )
  return(chk.out)
}#END noage.covar.datacheck
noage.covar.est <- function(X, settings = list(glm.family = "poisson",
                                               tol.AIC = 0.75,
                                               tol.r.sq = 0.02,
                                               base.eq ="Total ~"),
                            tracing=FALSE){
  # Fit the "no age classes, with covariates" model: build all candidate
  # covariate combinations, score each with glm(), shortlist by AIC probability
  # and pseudo-R-squared, select one model, then refit it for output.
  # X                    data frame with Run_Year, Brood_Year, Total, and covariates named "Cov_LABEL"
  # settings$glm.family  family passed to glm() (e.g. "poisson")
  # settings$tol.AIC     AIC tolerance: shortlist any model with >= x prob that it
  #                      "minimizes information loss" relative to the best-AIC model
  # settings$tol.r.sq    Rsq tolerance: shortlist any model with adj.r.sq within
  #                      tol.r.sq of the maximum
  # settings$base.eq     base of the model formula (e.g. "Total ~")
  # MODEL SELECTION STEP
  covars.list <- names(X)[grepl("Cov_",names(X))]
  covars.combos <- covars.list # individual covars
  if(length(covars.list)>=2){
    covars.combos <- c(covars.combos,
                       combn(covars.list,2) %>% apply(MARGIN = 2, paste,collapse = " + "), # pairs
                       combn(covars.list,2) %>% apply(MARGIN = 2, paste,collapse = " * ")  # pairs with interaction
    )
  }
  if(length(covars.list)>=3){
    covars.combos <- c(covars.combos,
                       combn(covars.list,3) %>% apply(MARGIN = 2, paste,collapse = " + "), # triples
                       paste("( ", combn(covars.list,3) %>% apply(MARGIN = 2, paste,collapse = " + ")," )^2") # triples with pairwise interaction
    )
  }
  eq.list <- paste(settings$base.eq,covars.combos,sep=" ") # only include variations of the covariate model
  diy.out <- data.frame(ID = seq_along(eq.list), equ = eq.list, numCoeff = NA, adj.r.sq = NA, AIC = NA)
  for(i in seq_along(eq.list)){
    fit.tmp <- glm(eq.list[i], data = X, family = settings$glm.family)
    # McFadden-style pseudo R-squared (1 - deviance/null.deviance)
    # Source: https://www.statology.org/glm-r-squared/
    # issue discussion: https://github.com/SalmonForecastR/ForecastR-Package/issues/4
    diy.out$adj.r.sq[i] <- round( with(summary(fit.tmp), 1 - deviance/null.deviance),3)
    diy.out$AIC[i] <- round(summary(fit.tmp)$aic,3)
    diy.out$numCoeff[i] <- length(fit.tmp$coefficients)
  }
  diy.out <- diy.out %>%
    mutate(diffAIC = min(AIC)-AIC) %>%
    mutate(probAIC = exp(diffAIC/2)) %>%
    mutate(rankAIC = rank(AIC,ties.method="min"),rankRsq = rank(-adj.r.sq,ties.method="min")) %>%
    mutate(shortAIC = probAIC >= settings$tol.AIC, shortRsq = adj.r.sq >= (max(adj.r.sq) - settings$tol.r.sq) ) %>%
    mutate(shortBoth = shortAIC & shortRsq)
  # Selection rules (shortBoth = TRUE means on the shortlist for both AIC and R2):
  # 1) exactly one TRUE in shortBoth -> pick it
  # 2) several TRUE in shortBoth -> pick the one with highest R2 (not min number of coeff!)
  # 3) none TRUE in shortBoth -> among shortAIC = TRUE pick the one with highest R2
  diy.out$selected <- FALSE
  n.shortboth <- sum(diy.out$shortBoth)
  if(n.shortboth == 1){ diy.out$selected[diy.out$shortBoth] <- TRUE}
  if(n.shortboth > 1){ diy.out$selected[diy.out$shortBoth & diy.out$rankRsq == min(diy.out$rankRsq[diy.out$shortBoth]) ] <- TRUE }
  if(n.shortboth == 0){diy.out$selected[diy.out$shortAIC & diy.out$rankRsq == min(diy.out$rankRsq[diy.out$shortAIC]) ] <- TRUE }
  # FITTING STEP (Using selected model, get glm obj, fitted vals etc)
  eq.use <- diy.out$equ[diy.out$selected]
  model.fit <- glm(eq.use, data = X, family = settings$glm.family)
  # OUTPUT
  return(c(list(model.type = "NoAgeCovar",
                formula = eq.use,
                var.names = paste("Total", covars.combos[diy.out$selected],sep = ","),
                # BUG FIX: was settings$lm.family -- no such element exists, so the
                # family name was silently dropped (list $ returns NULL). The
                # settings element is named glm.family.
                est.fn = paste("glm() with family=",settings$glm.family)),
           #model.fit,
           list(glm.obj = model.fit), # kept as a named sub-list to help with predict.glm() error
           list(fitted.values = fitted(model.fit),obs.values = X$Total,residuals = X$Total - fitted(model.fit) ),
           list(model.selection = diy.out)))
} # end noage.covar.est
noage.covar.pt.fc <- function(fit.obj, data, settings = NULL){
  # One-step point forecast with an 80% interval for the no-age covariate model.
  # fit.obj   object created from fitModel(); the glm object is pulled from
  #           fit.obj$glm.obj -- a temporary patch, the list handling between
  #           noage.covar.est() and this fn should be fixed so this is not needed.
  # data      data frame with one element of the list created by sub.fcdata()
  # settings  unused
  # Intervals via ciTools::add_ci(), as per
  # https://cran.r-project.org/web/packages/ciTools/vignettes/ciTools-glm-vignette.html
  # (predict.glm() with type="response" was the earlier approach but gives no interval)
  ci.df <- add_ci(data, fit = fit.obj$glm.obj, names = c("lwr", "upr"), alpha = 0.2)
  ci.df <- dplyr::rename(ci.df, fit = pred)
  pt.fc <- unlist(select(ci.df, fit, lwr, upr))
  return(pt.fc)
} #END noage.covar.pt.fc
# Bundle the no-age covariate subroutines into the standard 3-element
# interface (estimator / datacheck / pt.fc) consumed by estimation.functions.
noage.covar.list <- list(estimator = noage.covar.est,
                         datacheck = noage.covar.datacheck,
                         pt.fc     = noage.covar.pt.fc)
#### MERGING ALL THE MODELS ####
# Master lookup: model label -> list(estimator, datacheck, pt.fc).
# The "Pooled" variants reuse the corresponding simple-model functions;
# the caller feeds them the pooled variable instead.
estimation.functions <- list(
  Naive                = naive.list,
  ReturnRate           = rate.list,
  SibRegSimple         = sibreg.simple.list,
  SibRegPooledSimple   = sibreg.simple.list,   # same fns as simple sibreg, pooled var fed in
  SibRegKalman         = sibreg.kalman.list,
  SibRegLogPower       = logpower.simple.list,
  SibRegPooledLogPower = logpower.simple.list, # same fns as logpower sibreg, pooled var fed in
  TimeSeriesArima      = arima.list,
  TimeSeriesExpSmooth  = expsmooth.list,
  SibRegComplex        = sibreg.complex.list,
  NoAgeCovar           = noage.covar.list
)
# NOTE(review): the two lines below are stray text scraped from a documentation
# website; they are not valid R and caused a parse error, so they are commented out.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.