ForecastR-Package: Forecast stuff

Documented in fitModel

#' @title General Model Fitting Function
#'
#' @param model A character vector of length one. The name of a model, it has to
#'   match one of the model names in the "estimation.functions" list objects.
#'   which is generated from the script R/Module_Sub_EstimationFunctions.R.
#' @param data Same as the data element from the list object generated by
#'   prepData()
#' @param data.sibreg Same as the sibling regression input element *sibreg.in* from the
#'  list object generated by prepData()
#' @param settings A list. Model-specific list of settings, where applicable.
#' @param tracing A Boolean. Default is FALSE.
#'
#' @description This function applies the estimation step for the various
#'   models. It sets up the data inputs (e.g. loop through age classes, ) and
#'   calls the estimation subroutine then stores the output in a list object.
#'   Model-specific sub-routines live in the file
#'   "Module_Sub_EstimationFunctions.R".
#'
#' @details FOR NOW: JUST MAKING THIS WORK WITH THE BASIC SIBLING REGRESSION AND
#'   KALMAN FILTER SIBLING REGRESSION
#'
#' @return A list.
#' @export
#'
#' @examples
fitModel <- function(model= c("Naive", "ReturnRate", "Mechanistic",
						"SibRegSimple","SibRegKalman","SibRegLogPower","SibRegPooledSimple","SibRegPooledLogPower", "SibRegComplex",
						"TimeSeriesArima","TimeSeriesExpSmooth", "NoAgeCovar"),
						data = NULL, data.sibreg = NULL, settings = NULL,tracing=FALSE){
# Check inputs
model <- match.arg(model)

# if(!(model %in% c("Naive","SibRegSimple","SibRegKalman","SibRegLogPower","TimeSeriesArima","TimeSeriesExpSmooth"))){
# 					warning("Specified model not part of current inventory")
# 					stop()
# 					}

# any other check needed here?


#  create empty list to store outputs
out.list <- list()


# NAIVE FIT VARIATIONS
# Note this section is largely identical to the sibreg variations.
# The only differences are:
# - here the naive fit is applied to all age classes, not just the youngest
# - the year range for the running avg HAS TO BE specified as an input setting.

if(model %in%  c("Naive")){
# Note: This uses "v2" of the output labels from prepData(). The code below does
# not automatically detect the column labels (i.e. "Run_Year" is hardwired)


if(tracing){print("starting naive variations -------------------")}

#replacing stop() with return(NULL) will stop the function, not put it into debug mode
#print("flag1")
if(is.null(settings) | !("avg.yrs" %in% names(settings))){print("flag2");warning("avg.yrs for naive model not specified"); stop()}


if(tracing){print("starting data reorg for naive fits")}

age.classes <- names(data)

ages <- as.numeric(gsub("\\D", "", age.classes)) # as per https://stat.ethz.ch/pipermail/r-help/2011-February/267946.html
age.prefix <- gsub(ages[1],"",age.classes[1])
#print(age.prefix)

# for now this handles the "noage" version, need to test to ensure robustness
# also: should be able to combine the 2 versions into 1 generic, but for now just make it work
if(any(is.na(ages))){
	data.in <- data[["Total"]][,2] # FIX to dynamic col label
	names(data.in) <- data[["Total"]][,1] # FIX to dynamic col label
	out.list[["Total"]] <- estimation.functions[[model]]$estimator(model.data = data.in,avg.yrs=settings$avg.yrs)
} # end if no age classes


if(!any(is.na(ages))){  # if have age classes, loop through them
	for(age.do in ages){
		if(tracing){print(paste("starting age",age.do))}
		data.in <- data[[paste(age.prefix,age.do,sep="")]][,3] # FIX to dynamic col label
		names(data.in) <- data[[paste(age.prefix,age.do,sep="")]][,1] # FIX to dynamic col label
		out.list[[paste(age.prefix,age.do,sep="")]] <- estimation.functions[[model]]$estimator(model.data = data.in,avg.yrs=settings$avg.yrs)
}} # end looping through age classes if have them



} # end if naive variation


if(model %in%  c("ReturnRate")){


	if(tracing){print("starting return rate model")}

	#check if there are settings
	if(is.null(settings)) {warning("required settings avg, pred.label, and last.n for rate model not specified. Using default values")
		avg.use <- "wtmean"
		pred.label.use <- NULL
		last.n.use <- NULL
		}

	if(!is.null(settings)) {
		avg.use <- settings$avg
		pred.label.use <- settings$pred.label
		last.n.use <- settings$last.n
	}


  # too many variations, so just forcing all or nothing settings
	#if(!is.null(settings)){
	#	  if(!("avg" %in% names(settings))){warning("avg for rate model not specified. Using default = wtmean")}
	#		if(!("pred.label" %in% names(settings))){warning("pred.label for rate model not specified.Using default = first Pred_ column from left")}
	#	  if(!("last.n" %in% names(settings)) ){	warning("last.n for rate model not specified.Using default = all years")}
	#			# Need to decide how to handle last.n vs missing years, (as per https://github.com/MichaelFolkes/forecastR_package/issues/15)
	#    	# then build in check and warning accordingly
	#			} # end checking settings


	if(tracing){print("starting data reorg for rate fits")}

	age.classes <- names(data)

	ages <- as.numeric(gsub("\\D", "", age.classes)) # as per https://stat.ethz.ch/pipermail/r-help/2011-February/267946.html
	age.prefix <- gsub(ages[1],"",age.classes[1])
	#print(age.prefix)

	# for now this handles the "noage" version, need to test to ensure robustness
	# also: should be able to combine the 2 versions into 1 generic, but for now just make it work
	if(any(is.na(ages))){
		data.in <- data[["Total"]][,c("Run_Year","Total",pred.label.use)]

		# check the data
		dt.chk <- rate.datacheck(data.use = data.in, pred.label = pred.label.use, tracing=FALSE)
		if(!dt.chk$var.check){warning("selected predictor variable not in data set"); stop()}
		if(!dt.chk$yrs.check & !is.null(last.n.use)){warning("last.n specified, but missing years in data"); stop()}

		out.list[["Total"]] <- estimation.functions[[model]]$estimator(data.use = data.in,avg=avg.use, pred.label = pred.label.use, last.n  = last.n.use)
	} # end if no age classes


	if(!any(is.na(ages))){  # if have age classes, loop through them
		for(age.do in ages){
			if(tracing){print(paste("starting age",age.do))}

			data.in <- data[[paste(age.prefix,age.do,sep="")]][,c("Run_Year",paste(gsub(" ","_",age.prefix),age.do,sep=""),pred.label.use)]
			if(tracing){print(data.in)}
   		# check the data
   		dt.chk <- rate.datacheck(data.use = data.in, pred.label = pred.label.use, tracing=FALSE)
   		if(!dt.chk$var.check){warning("selected predictor variable not in data set"); stop()}
   		if(!dt.chk$yrs.check & !is.null(last.n.use)){warning("last.n specified, but missing years in data"); stop()}

   		out.list[[paste(age.prefix,age.do,sep="")]] <- estimation.functions[[model]]$estimator(data.use = data.in,avg=avg.use, pred.label = pred.label.use, last.n  = last.n.use)


		}} # end looping through age classes if have them



}#END Rate


#SIBLING REGRESSION VARIATIONS

if(model %in%  c("SibRegSimple","SibRegKalman","SibRegLogPower","SibRegPooledSimple","SibRegPooledLogPower",
								 "SibRegComplex")){
# if doing any of the sibling regressions, fit the specified model to all age classes
# except the youngest. For the youngest age class, use N5 -> eventually convert this into a user choice
# Note: to be consistent with old code, the sibreg variations use
# "v2" of the output labels from prepData(). The code below does not
# automatically detect the column labels (i.e. "Run_Year" is hardwired)


if(tracing){print("starting sibling regression variations -------------------")}


if(tracing){print("starting data reorg for sibling regressions")}

age.classes <- names(data)
ages <- as.numeric(gsub("\\D", "", age.classes)) # as per https://stat.ethz.ch/pipermail/r-help/2011-February/267946.html
age.prefix <- gsub(ages[1],"",age.classes[1])
#print(age.prefix)

if(any(is.na(ages))){ warning("Input data does not have age classes. Cannot do sibling regressions!");stop()}


# do the youngest age class (only 1 pass through, but keeps the variable assignment contained)
for(age.do in ages[1]){
if(tracing){print(paste("starting age",age.do))}


data.in <- data[[paste(age.prefix,age.do,sep="")]][,3] # FIX to dynamic col label
names(data.in) <- data[[paste(age.prefix,age.do,sep="")]][,1] # FIX to dynamic col label

out.list[[paste(age.prefix,age.do,sep="")]] <- estimation.functions[["Naive"]]$estimator(model.data = data.in,avg.yrs=5)
}

# do the "not youngest" age classes

for(age.do in ages[-1]){
if(tracing){print(paste("starting age",age.do,"--------------------"))}


# old version, before prepSibRegData()
#if(!grepl("Pool",model )){
#if(FALSE){
# match up the run years
#age.y <- data[[paste(age.prefix,age.do,sep="")]] # response age class
#age.x <- data[[paste(age.prefix,age.do-1,sep="")]] # predictor age class
# need this to handle inputs where have different starting brood years for age classes
# (e.g. after trimming data by run year using slide in GUI)
#yrs.use.y <- age.y[,"Run_Year"] # run years available for the older age class
#yrs.use.x <- yrs.use.y-1    # run years available for th eyounger age class
# only keep the ones that are available
#yrs.use.x <- yrs.use.x[yrs.use.x %in% age.x[,"Run_Year"]] # check which of the "required" years are available for the younger age
#yrs.use.y <- yrs.use.y[yrs.use.y %in% (yrs.use.x+1)]
# debug testing
#print("yrs.x")
#print(yrs.use.x)
#print("yrs.y")
#print(yrs.use.y)
#data.in <- data.frame(y  = age.y[age.y[,"Run_Year"] %in% (yrs.use.y),paste("Age_",age.do,sep="")],
#				x = age.x[age.x[,"Run_Year"] %in% yrs.use.x,paste("Age_",age.do-1,sep="")])
#names(data.in) <- paste("Age",c(age.do,age.do-1),sep="")
#} # end excluded old version
#} # end if NOT pooled old version
# print(data.sibreg)


if(! grepl("Complex",model)){
# the complex model with covariates needs a different source for the input

# NOT POOLED
# this generates a string like  "Age_4"
if(!grepl("Pool",model )){	pred.col <- paste0("Age_",age.do-1) }

# POOLED
	# this generates a string like  "Pooled_4_3" with numbers depending on age.do and max.pool
if(grepl("Pool",model )){
  num.pool <-  min(age.do- min(ages), settings$max.pool)   #adjust how many age classes are pooled if have fewer than max.pool sibling ages
  if(num.pool >1) {pool.label<-"Pooled_"} else {pool.label<- "Age_"}	# if have only 1, switch "Pooled" to "Age"
   pred.col <- paste0(pool.label,paste(age.do - c(1:num.pool),collapse = "_"))
			 }

# extract the identified predictor col from the new sibreg data frame
# need to exclude NA, because brood year coverage differ by age class
if(tracing){print("predictor label");print(pred.col)}
data.sub <- data.frame(br.yr = data.sibreg$Brood_Year,
						y  = data.sibreg[, paste0("Age_",age.do)],
						x = data.sibreg[, pred.col ]) %>%
					drop_na()

#print("source: data.sibreg")
#print(head(data.sibreg))

#print("sibreg input (not complex)")
#print(head(data.sub))

data.in <- data.sub %>% select(y,x)
yrs.use.y <- data.sub[,"br.yr"] + age.do # for consistency with old version, needed downstream
# Changed from brood year to run year, need to verify

} # end data prep if sibreg, but not complex


# set up the data for the complex sibreg
if(grepl("Complex",model)){

# add the previous age to the df for the forecasted age df which also has the covariates
# then standardize the covariates to mean = 0, sd =1

age.fc.label	<- paste(age.prefix,age.do,sep="")
age.predictor.label <- paste(age.prefix,age.do-1,sep="")
pred.col <- gsub(" ","_",age.predictor.label)

complex.in.df <- data[[age.fc.label]] %>% left_join(data[[age.predictor.label]] %>%
									select(Brood_Year,all_of(pred.col)), by= "Brood_Year")

complex.in.df[,grepl("Cov_",names(complex.in.df))] <- transformDF(X = complex.in.df[,grepl("Cov_",names(complex.in.df)),drop=FALSE],
																												type = "standardize", logfirst = FALSE)
#print(test.df)
#stop()


data.in <- complex.in.df
yrs.use.y <- data.in[,"Run_Year"] # for consistency with old version, needed downstream
# Changed from brood year to run year, need to verify

settings$base.eq = paste(gsub(" ","_",age.fc.label), "~ -1 +", pred.col)

#print(settings)

} # end data prep if complex sibreg



if(tracing){print("sibreg data.in");print(data.in)}


# plan is that all the sibreg models are identical before and after this step
# only the estimator code differs (and the output will have some model-specific list elements)
out.list[[paste(age.prefix,age.do,sep="")]] <- c(estimation.functions[[model]]$estimator(data.in,settings=settings) ,
													list(run.yrs = yrs.use.y,sib.pred.used = pred.col))






} # end looping through age classes


} # end if simple or pooled sibling regression variation


if(tolower(model) == "complexsibreg"){
  #browser()
age.classes <- names(data)


}#END if(tolower(model) == "complexsibreg"){



#TIME SERIES VARIATIONS
# Note this section is largely identical to the other variations.
# The only differences are:
# - TBI


if(model %in%  c("TimeSeriesArima","TimeSeriesExpSmooth")){
# Note: This uses "v2" of the output labels from prepData(). The code below does not
# automatically detect the column labels (i.e. "Run_Year" is hardwired)


if(tracing){print("starting time series variations -------------------")}


if(tracing){print("starting data reorg for time series fits")}

age.classes <- names(data)
ages <- as.numeric(gsub("\\D", "", age.classes)) # as per https://stat.ethz.ch/pipermail/r-help/2011-February/267946.html
age.prefix <- gsub(ages[1],"",age.classes[1])
#print(age.prefix)


# for now this handles the "noage" version, need to test to ensure robustness
# also: should be able to combine the 2 versions into 1 generic, but for now just make it work
if(any(is.na(ages))){
	data.in <- data[["Total"]][,2] # FIX to dynamic col label
	names(data.in) <- data[["Total"]][,1] # FIX to dynamic col label
	out.list[["Total"]] <- c(estimation.functions[[model]]$estimator(model.data = data.in,settings = settings),
													list(run.yrs = data[["Total"]][,1]))
} # end if no age classes


if(!any(is.na(ages))){  # if have age classes, loop through them
	for(age.do in ages){
		if(tracing){print(paste("starting age",age.do))}

		data.in <- data[[paste(age.prefix,age.do,sep="")]][,3] # FIX to dynamic col label (col 3 has the values)
		# note: auto.arima assumes all the values have the same interval and no missing intervals.
		# => should handle this in the  data check function for ARIMA. For now just made a note there

		out.list[[paste(age.prefix,age.do,sep="")]] <- c(estimation.functions[[model]]$estimator(model.data = data.in,settings = settings),
													list(run.yrs = data[[paste(age.prefix,age.do,sep="")]][,1]))
}} # end looping through age classes if have them



} # end if time series variation




# NO Age Data with Covariate model
# Note this section is largely identical to the other variations.
# The only differences are:
# - TBI


if(model %in%  c("NoAgeCovar")){
	# Note: This uses "v2" of the output labels from prepData(). The code below does not
	# automatically detect the column labels (i.e. "Run_Year" is hardwired)


	if(tracing){print("starting noage with covar  -------------------")}


	if(tracing){print("starting data reorg for noage with covar")}

	age.classes <- names(data)
	ages <- as.numeric(gsub("\\D", "", age.classes)) # as per https://stat.ethz.ch/pipermail/r-help/2011-February/267946.html
	age.prefix <- gsub(ages[1],"",age.classes[1])
	#print(age.prefix)


#	if(any(is.na(ages))){
		data.in <- data[["Total"]]
		out.list[["Total"]] <- c(estimation.functions[[model]]$estimator(data.in,settings = settings),
														 list(run.yrs = data[["Total"]][,1]))

		print(names(out.list[["Total"]] ))
		print(out.list[["Total"]]$fitted.values )
		print(out.list[["Total"]]$obs.values )
		print(out.list[["Total"]]$fitted.values - out.list[["Total"]]$obs.values)

#	} # end if no age classes



	# SET IT UP TO WORK  WITH AGE CLASS DATA AS WELL?



} # end if no age with covar







#calculate performance measure summary



pm.out <- resids.pm(out.list,type="fitted")
out.list <- c(out.list,list(fitted.pm=pm.out),list(settings = settings))

return(out.list)

}# end fit.FC()