#' Title: ESAModel
#' Author: Economics and Strategic Analysis Team
#' Date created: 22.09.2021
#' Date modified: 22.09.2021
#' Changelog:
#' - 22.09.21: file created.
#' Description: R6 class which estimates slot-level fixed-effects regression
#' models (Poisson or OLS, via fixest) on an ESAEDAggregated dataset and
#' exposes methods for summarising results, coefficient plots, bed occupancy
#' effects and regression tables.
#'
ESAModel <- R6Class(
classname='ESAModel',
public=list(
initialize=function(ESAEDAggregated,
model.type=c('poisfe', 'olsfe'),
args=NULL,
bedOccupancy=NULL,
slotVars=NULL,
nonSlotVars=NULL,
fixedEffects=NULL,
clusterSE=NULL,
printSummary=FALSE,
withAME=FALSE,
vcovFn=NULL,
vcovArgs=NULL,
panelID=NULL,
dependent=NULL){
if(!is(ESAEDAggregated, 'ESAEDAggregated')){
stop('ESAEDAggregated must be an ESAEDAggregated object.')
}
# check vcovFn is a function and vcovArgs is a list if fixedEffects is null
if (is.null(fixedEffects)){
if (!is.null(vcovFn)){
if (!is(vcovFn,'function')){
stop('vcovFn must be a function if not null')
}
}
if (!is.null(vcovArgs)){
if (!is(vcovArgs,'list')){
stop('vcovArgs must be a list if not null')
}
}
}
private$esaEDObj <- ESAEDAggregated
private$bedOccupancyVars <- bedOccupancy
private$modelType <- model.type
private$fixedEffects <- fixedEffects
private$dependent <- dependent
# build the model formulas (one per slot)
formuli <- private$getFormulas(obj=ESAEDAggregated,
bedOccupancy=bedOccupancy,
slotVars=slotVars,
nonSlotVars=nonSlotVars,
fixedEffects = fixedEffects)
# run model
private$runModels(obj=ESAEDAggregated,
formuli=formuli,
model.type=model.type,
args=args,
printSummary=printSummary,
withAME=withAME,
fixedEffects=fixedEffects,
vcovFn=vcovFn,
vcovArgs=vcovArgs,cl=clusterSE,
dynPanelID=panelID)
},
#' @description This method has not been implemented.
runScenario=function(scenario){
if(!is(scenario,'ESAModelScenario')){
stop('Scenario must be ESAModelScenario object.')
}
stop('Not supported')
},
#' @description Returns slot-averaged results (method currently disabled).
# results.slot.avg=function(sig=0.1,min.slot.sig=3){
# return(private$model.results.avg(sig=sig,min.slot.sig=min.slot.sig))
# },
#' @description Get descriptive statistics (mean and standard deviation) for
#' each of the regression datasets.
#' @param variables character vector - variables for which descriptives are desired
#' @return data.table
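#' @examples
#' # Illustrative sketch only: 'esaModel' and the variable names below are
#' # hypothetical placeholders, not objects defined in this file.
#' \dontrun{
#' esaModel$getRegressionDescriptives(variables=c('bedocc','arrivals'))
#' }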
getRegressionDescriptives=function(variables=NULL){
# loop through all the regression datasets
results <- lapply(names(private$modelRegression.datasets),function(nm){
dt <- copy(private$modelRegression.datasets[[nm]])
# extract numeric columns
colsEval <- colnames(dt)[unlist(lapply(dt,is.numeric))]
# if user has supplied specific variables to focus on, select these (use string match)
if (!is.null(variables)){
colsEval <- colsEval[grepl(paste(variables,collapse='|',sep=''),colsEval)]
}
# calculate averages
dtMeans <- dt[,lapply(.SD,mean,na.rm=TRUE),.SDcols=colsEval]
dtMeans[, metric:="mean"]
dtMeans <- dcast(melt(dtMeans,id.vars='metric'),variable~metric)
# calculate standard deviations
dtStdev <- dt[,lapply(.SD,sd,na.rm=TRUE),.SDcols=colsEval]
dtStdev[, metric:="stddev"]
dtStdev <- dcast(melt(dtStdev,id.vars="metric"),variable~metric)
# merge the two together
dtCombined <- merge(x=dtMeans,y=dtStdev,by="variable",all=TRUE)
dtCombined[,(nm):=paste0(round(mean,3), " (",round(stddev,3),")")]
setnames(dtCombined,old=c("mean","stddev"),new=paste0(nm,"_",c("mean","stddev")))
dtCombined[,variable:=gsub(paste0(nm,"_"),"",variable)]
dtCombined[,variable:=gsub("_"," ",variable)]
# set the key for merges of all
setkeyv(dtCombined,"variable")
return(dtCombined)
})
# merge the results
dtResult <- Reduce(function(...) merge(...,all=TRUE),results)
return(dtResult)
},
#' @description Get a table of results with a derived magnitude measure
#' @param sig decimal - significance level (e.g. 0.1 for 10%, 0.05 for 5% etc)
#' @param min.slot.sig integer - the minimum number of slots required for a result to be reported as 'consistent'
#' @param useSampleAvgs boolean - whether to use the regression sample's statistics, or those of the whole sample
#' @param from string - whether to start at the min/mean/max for the magnitude
#' @param to string - whether to finish at the min/mean/max for the magnitude
#' @param useSD numeric - default=NULL; if supplied, express the magnitude as this many standard deviations instead of the from/to range
#' @param poisScaleCoeff data.table - optional scaling table with 'varname' and 'scalefactor' columns, applied to Poisson coefficients
#' @param hideCalc boolean - whether or not to hide calculation columns
#' @return data.table
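#' @examples
#' # Sketch of typical calls; 'esaModel' is a hypothetical instance of this class.
#' \dontrun{
#' # magnitude from the slot maximum to the slot mean, 5% significance
#' esaModel$getResults(sig=0.05, min.slot.sig=3, from='max', to='mean')
#' # magnitude expressed as a one standard deviation change instead
#' esaModel$getResults(sig=0.05, useSD=1)
#' }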
getResults=function(sig=0.1, min.slot.sig=3, useSampleAvgs=TRUE,from='max',to='mean',useSD=NULL,poisScaleCoeff=NULL,hideCalc=TRUE){
if (private$modelType!='poisfe'&!is.null(poisScaleCoeff)){
warning('argument poisScaleCoeff is ignored due to invalid model type')
poisScaleCoeff <- NULL
}
resAll <- private$model.results.avg(sig=sig,min.slot.sig=min.slot.sig)
if (is.null(resAll)){
warning('There were no results obtained from this model.')
return(NULL)
}
if ((!from %in% c('max','mean','min'))| (!to %in% c('max','mean','min'))){
warning('from or to arguments invalid. defaulting to from=max,to=mean')
from <- 'max'
to <- 'mean'
}
if (!is.null(useSD)){
# check that argument is numeric
if (!is.numeric(useSD)) stop('useSD argument must be numeric')
message('Using standard deviations for the magnitude calculation instead of the from/to range.')
}
# get sample averages
sampleAvgs <- private$model.sample.averages(useSampleAvgs)
slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
# create a character which is the suffix to magnitude derived columns
magnitudeCalcSuff <- ifelse(test=is.null(useSD),
yes=paste0('_',from,'_to_',to),
no=paste0('_',useSD,'_stddev'))
# handle SD and non-SD differently:
# for non-SD, subtract 'from' metric from the 'to' metric
# for SD, multiply number of standard deviations with std dev
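# worked illustration (hypothetical numbers): with useSD=2 and a slot standard
# deviation of 1.5 the magnitude is 2*1.5=3; with from='max', to='mean' and a
# slot max of 10 and mean of 6 the magnitude is 6-10=-4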
# loop through all of the slots, and derive the above...
for (slt in slots){
# as it makes no sense to scale factors for this, set all factors to 1
if (!is.null(useSD)){
sampleAvgs[,(paste0(slt,magnitudeCalcSuff)):=fcase(
# set factors to 1
is_factor==TRUE,as.numeric(1),
# is not factor and in SD mode
(is_factor!=TRUE|is.na(is_factor)),as.numeric((sampleAvgs[[paste0(slt,'_sd')]]*useSD))
)]
} else {
sampleAvgs[,(paste0(slt,magnitudeCalcSuff)):=fcase(
# set factors to 1
is_factor==TRUE,as.numeric(1),
# is not factor and in non-SD mode
(is_factor!=TRUE|is.na(is_factor)),as.numeric((sampleAvgs[[paste0(slt,'_',to)]]-sampleAvgs[[paste0(slt,'_',from)]]))
)]
}
}
# merge sample averages with main results
resAll <- merge.data.table(x=resAll,y=sampleAvgs,by='varname',all.x=TRUE)
# if poisson model with no scaling table supplied, or ols, multiply the
# derived magnitude by the slot coefficient
outCols <- paste0(slots,paste0(magnitudeCalcSuff,'_coeff_scaled'))
if ((private$modelType=='poisfe'&is.null(poisScaleCoeff))|private$modelType=='olsfe'){
resAll[,(outCols):=lapply(slots,function(x) resAll[[paste0(x,magnitudeCalcSuff)]]*resAll[[x]])]
} else if(private$modelType=='poisfe'&!is.null(poisScaleCoeff)){
if (!is.data.table(poisScaleCoeff)){
stop('scale table is not data.table. require varname and scalefactor col')
}
if (('varname'%in%colnames(poisScaleCoeff))&('scalefactor'%in%colnames(poisScaleCoeff))){
# merge scale table
resAll <- merge(x=resAll,y=poisScaleCoeff,by='varname',all.x=TRUE)
# fill na with 1
resAll[,scalefactor:=fifelse(is.na(scalefactor),1,scalefactor)]
# multiply slot coefficient by the scale factor
resAll[,(outCols):=lapply(slots,function(x) resAll[[x]]*resAll[['scalefactor']])]
} else {
stop('scale table requires varname and scalefactor col')
}
}
# calculate poisson multiplicative - exponentiate scaled coefficients (and slot var)
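# e.g. (illustrative numbers only) a scaled coefficient of 0.05 gives
# exp(0.05) ~= 1.051, an incidence rate ratio implying roughly a 5.1% increase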
if (private$modelType=='poisfe'){
poisIrrCols <- (paste0(c(slots,outCols),'_pois_irr'))
resAll[,(poisIrrCols):=lapply(.SD,exp),.SDcols=c(slots,outCols)]
resAll[,(paste0(c(slots,outCols),'_pois_irr_perc')):=lapply(.SD,function(x) (x-1)*100),
.SDcols=(poisIrrCols)]
}
# check if need to hide certain columns from the results table
if (hideCalc){
resAll[,c("count_na","has_pos","has_neg","should_retain","slot_avg",
"slot_min","slot_max","is_factor"):=NULL]
}
return(resAll)
},
#' @description Create coefficient plots
#' @param exclusions character vector - variables to exclude from plotting
#' @return list of ggplot objects
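#' @examples
#' # Sketch only; 'esaModel' and the exclusion patterns are hypothetical.
#' \dontrun{
#' plots <- esaModel$getCoefficientPlots(exclusions=c('month','holiday'))
#' }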
getCoefficientPlots=function(exclusions=NULL){
# create coefficient plots for all of the models
# get models
models <- unique(private$modelResults.all$model_key)
plots <- lapply(models, function(x){
df <- private$modelResults.all[model_key==x]
# remove rows relating to bed occupancy
if (!is.null(private$bedOccupancyVars)){
df <- df[!grepl(pattern=paste(private$bedOccupancyVars,collapse='|'),factor),]
}
# remove slot/acuity/queue prefix
df[, factor := gsub('_',' ', gsub(paste0(x,'_'),'',factor))]
# remove any exclusions if there are any
if (!is.null(exclusions)){
df <- df[!grepl(pattern=paste(exclusions,collapse='|',sep=''),x=factor)]
}
# create coefficient plots
plot <- ggplot(df, aes(x=factor, y=estimate)) +
geom_hline(yintercept = 0, colour=gray(1/2), lty= 2) +
geom_point(aes(x=factor, y=estimate)) +
geom_linerange(aes(x=factor, ymin=lower_ci_90, ymax=upper_ci_90), lwd=1) +
geom_linerange(aes(x=factor, ymin=lower_ci_95, ymax=upper_ci_95), lwd=1/2) +
ggtitle(paste0('Coefficient plot for ',x)) + coord_flip() + ylab('coefficient') + xlab('variable')
print(plot)
return(plot)
})
names(plots) <- models
return(plots)
},
#' @description Create various bed occupancy plots for the bed occupancy
#' variables for the models
#' @param sig decimal - significance level (0.1=10%, 0.05=5%)
#' @param min.slot.sig integer - minimum number of slots where significance occurs for the result to be 'consistent'
#' @param useSampleAvgs boolean - whether to use regression sample, or overall sample for deriving descriptive statistics
#' @param groupSize integer - how many percentage increments to group together (e.g. groupSize=5 would have 96,97,98,99,100)
#' @param reverse boolean - whether to reverse the ordering of the chart
#' @return list of data.table and ggplot objects.
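#' @examples
#' # Sketch only; 'esaModel' is a hypothetical instance of this class.
#' \dontrun{
#' occPlots <- esaModel$getBedOccupancyPlots(sig=0.1, groupSize=5, reverse=FALSE)
#' }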
getBedOccupancyPlots=function(sig=0.1, min.slot.sig=3, useSampleAvgs=TRUE, groupSize=5, reverse=FALSE){
# get sample averages
sampAvgs <- private$model.sample.averages(useSampleAvgs)
# get results
results <- private$model.results.avg(sig=sig, min.slot.sig=min.slot.sig)
if (is.null(results)){
warning('There were no results obtained from this model')
return(NULL)
}
bedOccVars <- private$bedOccupancyVars
if (is.null(bedOccVars)){
warning('There are no bed occupancy variables set.')
return(NULL)
}
# get the sample average for the dependent variable of the model in
# each of the slots
modelName <- private$getModelSuff(private$esaEDObj)
slots <- private$esaEDObj$getSlots()
slotsNames <- paste0(unlist(lapply(slots,function(x) x$slotID)))
slotsNamesMeans <- paste0(slotsNames,'_mean')
depAvgs <- as.list(sampAvgs[varname==modelName,][,..slotsNamesMeans])
# mean of all the slot means for dep
meanDepAvgs <- mean(unlist(depAvgs),na.rm=TRUE)
# columns to retain from results for this calculation
resultsCols <- c('varname', slotsNames)
results <- results[,..resultsCols]
# empty list to store all plots in
allPlots <- list()
# create charts for each different bed occupancy variable
for (occ in bedOccVars){
# filter for results containing bed occupancy variable in varname
df <- results[grepl(occ,varname)]
# calculate the percentage difference with the slot sample average for
# each slot
df[, (paste0(slotsNames,'_perc_eff')) := lapply(slotsNames, function(x) (df[[x]]+depAvgs[[paste0(x,'_mean')]])/depAvgs[[paste0(x,'_mean')]])]
# calculate average effect across slots, as well as min and max
withCallingHandlers({
suppressWarnings(
df[, `:=` (
occ_avg_perc=rowMeans(.SD, na.rm=TRUE),
occ_min_perc=apply(.SD,1,FUN=min,na.rm=TRUE),
occ_max_perc=apply(.SD,1,FUN=max,na.rm=TRUE)
), .SDcols=paste0(slotsNames,'_perc_eff')]
)
})
# calculations for min/max chart, get effect by subtracting 1
df[, (c('occ_avg','occ_min','occ_max')) := lapply(.SD, function(x) ifelse(x>=1,(x-1)*100,(1-x)*100)),
.SDcols=c('occ_avg_perc','occ_min_perc','occ_max_perc')]
# extract the numeric part of varname for ordering (factor levels are stored
# as e.g. bedOcc1, bedOcc2, so the first run of digits found is used)
df[, 'kTempName' := as.numeric(sub('\\D*(\\d+).*', '\\1', varname))]
# order table
setorderv(df,c('kTempName'),order=ifelse(reverse==TRUE,-1,1))
# set a row number for index
df[, 'kIndex' := .I]
#### plot overall bed occupancy chart
mainChart <- ggplot(df) +
theme_classic() +
theme(axis.text.x=element_text(angle=90, vjust=0),
panel.grid.major.y = element_line(colour='grey'),
panel.grid.minor.y=element_line(colour='lightgrey'),
plot.title=element_text(hjust=0.5)) +
labs(y='Percentage change in average crowding metric',
x='Bed occupancy intervals') +
# add title for plot
ggtitle(label=paste0(modelName,' ', occ, ' Bed Occupancy Chart'),
subtitle = 'The blue line represents the average effect of bed occupancy on crowding across the slots. The error bars represent the maximum and minimum effects on crowding from the individual slots.') +
# create line for chart
geom_path(aes(x=kIndex,
y=occ_avg,
group=1),
size=2,
colour='steelblue') +
# create error bars, which are just the min&max across slots
geom_errorbar(aes(x=kIndex,
ymin=occ_min,
ymax=occ_max),
colour='grey',
size=1,
width=.5,
position=position_dodge(.9)) +
scale_x_discrete(limits=df$varname, labels=df$varname) +
# black circle on each y point
geom_point(aes(x=kIndex,y=occ_avg), size=2.1, colour='black')
# add plot to return list
allPlots[[paste0(occ,'_main')]] <- mainChart
# add data powering plot to return list
allPlots[[paste0(occ,'_main_data')]] <- df
###### derive waterfall bed occupancy charts
# calculate buckets to group into
indexVec <- 1:nrow(df)
# number of rows must be divisible by number of groups
if (nrow(df)%%groupSize != 0){
warning(paste0('Cannot reconcile groups for waterfall. Levels: ',nrow(df)))
} else{
groupingsIdx <- c()
currentGroup <- 1
for (i in indexVec){
groupingsIdx <- c(groupingsIdx,currentGroup)
if (i%%groupSize==0){
currentGroup <- currentGroup+1
}
}
# create temporary table of groupings
groupingsDF <- data.table(index=indexVec,group=groupingsIdx)
# join this to df
df <- merge.data.table(x=df,y=groupingsDF,by.x='kIndex',by.y='index',all.x=TRUE)
# create grouping name per group
df[, 'kGroupName' := paste0(min(kTempName),'-',max(kTempName)),by='group']
# group by group and find mean of average bed occupancy
waterfallDF <- df[, .(average=mean(occ_avg_perc,na.rm=TRUE)), by=c('kGroupName')]
# if there are any infinite or na values, skip from the bed occ loop
if (sapply(waterfallDF, function(x) sum(is.na(x)|is.infinite(x)))[['average']]>0){
warning('Could not plot waterfall due to NAs/Infs')
next
}
# multiply percentage by average of number of people in each slot
waterfallDF[, 'effect' := average*meanDepAvgs]
# add start and end point to results [for the start and end bars]
x.all <- rbindlist(
list(
list('baseline',NA,meanDepAvgs),
waterfallDF,
list('max', NA, waterfallDF[nrow(waterfallDF),]$effect)
)
)
# add xmin xmax for bars
x.all[, 'xmin' := fcase(.I <= nrow(x.all),.I,default=NA)]
x.all[, 'xmax' := xmin+1]
# add ymin and ymax
x.all[, 'ymin' := effect]
x.all[, 'ymax' := fcase(!is.na(shift(effect)),shift(effect, n=1),default=0)]
x.all[nrow(x.all), 'ymin':=0]
x.all[, 'bar_cat' := fcase(.I==nrow(x.all), 'end', .I==1,'start', default='value')]
# difference in values
x.all[, 'difference' := effect-shift(effect,n=1)]
x.all[, 'kGroupName' := as.factor(kGroupName)]
# calculate effect of moving from baseline to max, and in % terms
effectStart <- x.all[1,]$effect
effectEnd <- x.all[nrow(x.all),]$effect
effectDiff <- round(effectEnd-effectStart,0)
effectPercDiff <- round(((effectEnd-effectStart)/effectStart)*100,0)
effectDesc <- paste0(effectDiff, ifelse(effectDiff<0,' fewer',' more'), ' patients (',effectPercDiff,'%)')
# width of bars
w <- 0.5
# create the waterfall plot
plot <- ggplot(x.all) +
theme_classic() +
theme(legend.position='none',
panel.grid.major.y=element_line(colour='grey'),
panel.grid.minor.y = element_line(colour='lightgrey'),
axis.text.x=element_text(angle=0),
plot.title=element_text(hjust=0.5)) +
labs(y='Average number of patients',
x='Bed Occupancy Intervals',
title=paste0(modelName,' ', occ, ' Waterfall Bed Occupancy Chart')) +
# plot the individual bars for each bed occupancy level
geom_rect(aes(xmin=xmin-w/2,xmax=xmin+w/2,ymin=ymin,ymax=ymax, fill=bar_cat)) +
# create a line between each of the bars
geom_segment(data=x.all[1:(nrow(x.all)-1),], aes(x=xmin-w/2,xend=xmax+w/2,y=ymin,yend=ymin),size=0.7) +
# display total number of people in crowding based on bed occupancy level (vs baseline)
geom_text(aes(x=xmin,y=effect+1,label=round(effect,1))) +
# show difference in effect between each step
geom_text(aes(x=xmin-w,y=effect+0.5,label=round(difference,1)),size=3) +
scale_x_discrete(limits=x.all$kGroupName, labels=x.all$kGroupName) +
# create fill for the bars
scale_fill_manual(values=(c('end'='#005EB8','value'='steelblue', 'start'='grey'))) +
# add some padding to graph
scale_y_continuous(expand=expansion(mult=c(0,0.1))) +
# create dotted lines from the starting effect (the sample mean of y) to the max [i.e. when bed occupancy is 100%]
geom_hline(yintercept = effectStart, linetype='dotted',colour='#7C2855') +
geom_hline(yintercept = effectEnd, linetype='dotted',colour='#7C2855') +
annotate('text', x=nrow(x.all), y=effectEnd+3,label=effectDesc, colour='#7C2855')
# add plot to return list
allPlots[[paste0(occ,'_waterfall')]] <- plot
# add plot data to return list
allPlots[[paste0(occ,'_waterfall_data')]] <- x.all
}
}
return(allPlots)
},
#' @description Retrieve the datasets which were used in model estimation.
#' Note that these are the complete cases across all variables included in
#' the model.
#' @return list of data.table objects.
getRegressionDatasets=function(){
return(private$modelRegression.datasets)
},
#' @description Retrieve the sample averages of variables in the model.
#' @param bo boolean - whether to restrict to the regression (complete-case) sample.
#' @return list of data.table objects.
getSampleAvgs=function(bo=TRUE){
return(private$model.sample.averages(bo))
},
#' @description Return a regression table based on the fixest implementation.
#' This combines all of the different models (e.g. across slots).
#' @param exclusions - character vector of variables to exclude from the table
#' @return data.frame
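#' @examples
#' # Sketch only; 'esaModel' and the exclusion pattern are hypothetical.
#' \dontrun{
#' esaModel$getRegressionTable(exclusions=c('month'))
#' }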
getRegressionTable=function(exclusions=NULL){
u <- private$modelObjects
if (length(u)==0|is.null(u)) return(NULL)
# get all variables across all models
allVars <- unlist(lapply(u, function(x) rownames(x$coeftable)),use.names = FALSE)
# get all the slot names
slots <- unlist(lapply(private$esaEDObj$getSlots(),function(x) x$slotID))
model.suff <- private$getModelSuff(private$esaEDObj)
# remove slot & model identifier from all variables (so they align in table)
repl <- paste(paste0(slots,'_',model.suff,'_'),collapse='|',sep='')
toRepl <- allVars[grepl(repl,allVars)]
subRepl <- gsub(repl,'',toRepl)
# create dictionary for etable
names(subRepl) <- toRepl
if (!length(subRepl)>0){
subRepl <- NULL
}
# create exclusions string if there are any
exclude <- ifelse(is.null(exclusions),'!',paste(paste0('^',exclusions),collapse='|',sep=''))
# some vectors of model statistics
extralineList <- list()
omitColinear <- unlist(lapply(u,function(x) paste(x$collin.var,collapse=',',sep='')))
numOmitColinear <- unlist(lapply(u,function(x) length(x$collin.var)))
feSize <- unlist(lapply(u,function(x) x$fixef_sizes))
aic <- unlist(lapply(u,function(x) AIC(x)))
bic <- unlist(lapply(u,function(x) BIC(x)))
if (!is.null(omitColinear)) extralineList[['^_Omitted due to collinearity']] <- omitColinear
if (!is.null(numOmitColinear)) extralineList[['^_Number omitted due to collinear']] <- numOmitColinear
if (!is.null(feSize)) extralineList[['^_Fixed effect size']] <- feSize
if (!is.null(aic)) extralineList[['^_AIC']] <- aic
if (!is.null(bic)) extralineList[['^_BIC']] <- bic
# return a data.frame via fixest's etable method, with some additional stats
return(etable(u,drop=exclude,
extralines=extralineList,
dict = subRepl))
},
#' @description Returns a table of derived average marginal effects. Note
#' that this requires that the user has specified to derive them on initialization.
#' @return data.table or NULL.
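#' @examples
#' # Sketch only; only populated when withAME=TRUE was passed on initialisation.
#' \dontrun{
#' ames <- esaModel$getAMEs()
#' }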
getAMEs = function(){
return(private$modelAMEs)
},
#' @description Return a list of fixest model objects.
#' @return list of fixest model objects.
getModelObjs=function(){
return(private$modelObjects)
}
),
private=list(
esaEDObj=NULL,
modelResults.all=NULL,
modelResultsWide.all=NULL,
modelRegression.datasets=NULL,
bedOccupancyVars=NULL,
modelType=NULL,
modelObjects=list(),
modelAMEs=NULL,
fixedEffects=NULL,
dependent=NULL,
modelIdentifiers=NULL,
#' @description Check whether the user-supplied dependent variable exists across all slots.
doesCustomDepExist=function(obj){
# check that the dependent private attribute is not null
if (is.null(private$dependent)) stop("The private attribute dependent cannot be null")
# retrieve the model suffix e.g. acuity x queue
modelSuff <- private$getModelSuff(obj)
slots <- lapply(obj$getSlots(), function(x) x$slotID)
# check whether the dependent variable exists across all slots
proposedDeps <- paste0(slots,"_",modelSuff,"_",private$dependent)
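# e.g. (hypothetical names) slots 'slot1'/'slot2', suffix 'majors' and dependent
# 'admissions' give proposedDeps of 'slot1_majors_admissions', 'slot2_majors_admissions'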
doProposedExist <- Reduce("&", lapply(proposedDeps, function(x) x%in%colnames(obj$data)))
response <- list(
doProposedExist = doProposedExist,
depVarPrefix = paste0(slots,"_",modelSuff)
)
return(response)
},
#' @description Return a list of formulas for each of the slot models
#' @param obj ESAEDAggregated
#' @param bedOccupancy character vector of bed occupancy variables
#' @param slotVars character vector of slot-based variables
#' @param nonSlotVars character vector of non-slot-based variables
#' @param fixedEffects character vector of fixed effects
#' @return list of formula objects
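#' @examples
#' # Illustration of the shape of a returned formula (all names hypothetical):
#' # with slotID='slot1', model suffix 'majors', slotVars='arrivals',
#' # bedOccupancy='bedocc_95' and fixedEffects='site' the formula is roughly
#' # slot1_majors ~ slot1_majors_arrivals + bedocc_95 | site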
getFormulas=function(obj, bedOccupancy=NULL, slotVars=NULL, nonSlotVars=NULL,fixedEffects=NULL){
# method to get formulas for the various slot models. returns a labelled
# list of formula objects.
queue <- obj$getQueue()
acuity <- obj$getAcuity()
slots <- obj$getSlots()
# model suffix (ie acuity x queue)
model.suff <- private$getModelSuff(obj)
otherAcuities <- acuity$other.acuities()
formuli <- list()
# iterate through the slots
for (slot in slots){
if (is.null(private$dependent)){
dep <- paste0(slot$slotID,'_',model.suff)
# get other acuities
if (!is.null(otherAcuities)){
otherAccsPrefix <- paste0(slot$slotID, ifelse(is.null(queue),'', paste0('_', queue$name)), '_')
indep.otheraccs <- paste0(otherAccsPrefix, otherAcuities)
} else {
indep.otheraccs <- NULL
}
indep.slots <- unlist(lapply(slotVars, function(x) paste0(dep,'_',x)))
indep.all <- c(indep.otheraccs,indep.slots,bedOccupancy,nonSlotVars)
} else {
dep <- paste0(slot$slotID, "_", model.suff, "_", private$dependent)
dummy <- paste0(slot$slotID,'_',model.suff)
indep.slots <- unlist(lapply(slotVars,function(x) paste0(dummy,'_',x)))
indep.all <- c(indep.slots, bedOccupancy, nonSlotVars)
}
# check all variables are in the data
#if(!searchWithin(search=c(dep,indep.all), colnames(obj$data),quietly=FALSE)){
# stop('Could not identify a variable in the data.')
#}
form <- as.formula(paste0(dep,'~',paste(indep.all, collapse='+',sep=''), ifelse(is.null(fixedEffects),'',paste0('|', paste(fixedEffects, collapse = '+')))))
formuli[[dep]] <- form
}
return(formuli)
},
runModels=function(obj,formuli,model.type,args,printSummary,withAME,fixedEffects,vcovFn,vcovArgs,cl,dynPanelID){
# create empty results dataframe to store results of all the models...
results.cols <- c('factor','estimate','std_err','t_value','p_value',
'lower_ci_90','upper_ci_90','lower_ci_95','upper_ci_95',
'model_key')
reg.df <- data.table::copy(obj$data)
reg.df[, paste0(obj$getProviderSite()) := as.factor(reg.df[[obj$getProviderSite()]])]
# warning that the package has not been tested with 1 site
if (length(unique(reg.df[[obj$getProviderSite()]])) == 1){
warning("The model has not been verified in the situation of one fixed effect (site).")
}
# force clustering on final_date if there is only one site
if (length(unique(reg.df[[obj$getProviderSite()]]))==1 & is.null(cl) & is.null(fixedEffects)){
cl <- 'final_date'
}
# check whether the proposed dependent variables exist; if not create them
# Note this works on the basis that the dependent is not a slot-specific var
if (!is.null(private$dependent)){
customDepHandle <- private$doesCustomDepExist(obj = obj)
if (!customDepHandle[["doProposedExist"]]){
# check whether the dependent variable exists in the data
if (!private$dependent %in% colnames(reg.df)) stop(paste0("Could not find ", private$dependent, " in the data."))
# create the required variables
depVarPrefix <- customDepHandle[["depVarPrefix"]]
newDepVar <- paste0(depVarPrefix, "_", private$dependent)
reg.df[, (newDepVar) := lapply(1:length(newDepVar), function(i) reg.df[[private$dependent]])]
}
}
# the names of all formuli are used as the key for each model
private$modelIdentifiers <- names(formuli)
# run all the models within the list of formuli
model.results <- lapply(names(formuli), function(x){
model <- tryCatch({
message(paste0('running model... ',x))
clusVar <- fixedEffects
if (model.type=='poisfe'){
model <- do.call(fepois,args=append(list(fml=formuli[[x]],data=reg.df,cluster=clusVar),
args),envir=environment())
} else if (model.type=='olsfe'){
model <- do.call(feols,args=append(list(fml=formuli[[x]],data=reg.df,cluster=clusVar),
args),envir=environment())
} else if (grepl('^dyn',model.type)){
# for both poisson and ols dynamic panel model
message('Estimates may be biased in dynamic panels with small T relative to N (Nickell bias).')
message('If the lagged dependent variable is correlated with the other regressors, their estimates will also be biased.')
if (is.null(dynPanelID)){
stop("Must specify panel dimensions as char vector c('id','time')")
}
# only a single lag of the dependent is supported at the moment
depVar <- private$depVarFromFormula(formuli[[x]])
if (is.null(depVar)){
# check is not null, otherwise stop
stop('Could not identify a dependent variable')
}
listArgs <- append(
list(fml=update(copy(formuli[[x]]),paste0('~+l(',depVar,')+.')),
data=reg.df,
cluster=clusVar,
panel.id=dynPanelID),
args
)
# run either poisson or ols fe (ldv)
if (model.type=='dynolsfe'){
model <- do.call(feols,args=listArgs,envir=environment())
} else if (model.type=='dynpoisfe'){
model <- do.call(fepois,args=listArgs,envir=environment())
} else {
stop('Invalid dynamic panel model detected')
}
} else {
stop('Invalid model detected.')
}
},
error=function(cond){
message(paste0('an error occurred with this model: ',x))
message(cond)
return(NULL)
},
warning=function(cond){
message(paste0('a warning occurred with this model: ',x))
message(cond)
return(NULL)
})
if (!is.null(model)){
res <- NULL
if (!is.null(fixedEffects)|(is.null(fixedEffects)&is.null(vcovFn))){
# if fixed effects in use, then use summary from fixest package
if (printSummary){
print(summary(model))
print(model$collin.var)
}
res <- cbind(coeftable(model), confint(model,level=0.9))
res <- cbind(res, confint(model,level=0.95))
} else if (is.null(fixedEffects)&is.null(cl)&!is.null(vcovFn)){
# pass thru a variance co-variance function
vcovNew <- do.call(vcovFn,args=append(list(x=model),vcovArgs),envir=environment())
vcovRes <- lmtest::coeftest(model,vcov=vcovNew)
if (printSummary){
print(vcovRes)
print(model$collin.var)
}
res <- cbind(vcovRes, confint(vcovRes,level=0.9))
res <- cbind(res, confint(vcovRes,level=0.95))
}
# create data.table of results
res <- setDT(as.data.frame(res), keep.rownames=TRUE)[]
res[, model_key := paste0(x)]
# rename columns to match the results table
setnames(res, old=colnames(res), new=results.cols)
# store model objects
private$modelObjects[[x]] <- model
return(res)
}
return(NULL)
})
# get the equivalent datasets used in the regression model. Checking
# for complete cases across all the variables included in the formula
# is how this is achieved.
model.complete.df <- lapply(names(formuli), function(x){
#model.df <- data.table::copy(obj$data)
model.df <- data.table::copy(reg.df)
model.vars <- all.vars(formuli[[x]])
cols.retain <- unique(c(obj$getProviderSite(),'final_date',model.vars))
model.df <- model.df[,..cols.retain]
model.df[,is_complete_case:=FALSE]
model.df[complete.cases(model.df[,..model.vars]),is_complete_case:=TRUE]
return(model.df)
})
names(model.complete.df) <- names(formuli)
private$modelRegression.datasets <- model.complete.df
# Derive the average marginal effects, if specified by the user. Use
# the complete cases dataset derived above to calculate
listAMES <- list()
if (withAME){
# check whether marginaleffects package is installed
if (!requireNamespace("marginaleffects", quietly=TRUE)){
stop("The package marginaleffects is required to derive.")
}
# loop through models
for (nm in names(private$modelObjects)){
# get model object
mdl <- private$modelObjects[[nm]]
# check is not null
if (!is.null(mdl)){
# calculate average marginal effects, using the regression sample for that model
# if poisson model, use link type, otherwise if OLS then use response type
mfx <- marginaleffects(model=mdl,
newdata=model.complete.df[[nm]][is_complete_case==TRUE],
type=ifelse(grepl('pois',model.type),'link','response'))
if (printSummary){
print(summary(mfx))
}
# add to list of average marginal effects
listAMES[[nm]] <- setDT(tidy(mfx))[, model:=nm]
}
}
}
# union all the average marginal effects
private$modelAMEs <- rbindlist(listAMES,use.names=TRUE)
# combine the model results
# drop those outputs which are null before unioning the model results.
model.results <- model.results[!sapply(model.results, is.null)]
results <- rbindlist(model.results, use.names=TRUE)
private$modelResults.all <- results
},
model.results.avg=function(sig=0.1, min.slot.sig=3,poisScaleCoeff=NULL){
# the minimum number of slots required to be significant must not exceed
# the number of slots
if (min.slot.sig > length(private$esaEDObj$getSlots())){
stop('Minimum slots to define as significant must not exceed the number of slots.')
}
df <- data.table::copy(private$modelResults.all)
# check if all model results table is null, or empty, and return null if so
if (is.null(df)|(!is.null(df)&(nrow(df)==0|length(colnames(df))==0))){
warning('No results found')
return(NULL)
}
# remove slot specific prefixes for factor names
model.suff <- private$getModelSuff(private$esaEDObj)
model.slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
df[, varname := gsub(paste(paste0(model.slots,'_'),collapse='|',sep=''), '',factor)]
df[, varname := gsub(paste0(model.suff,'_'),'',varname)]
# reshape wide
df[, model_grouper := gsub(paste0('_',model.suff),'',model_key)]
# get unique list of models identified
modelsAvail <- unique(df$model_grouper)
wide <- dcast(df, formula=varname~model_grouper, fill=NA,
value.var=c('estimate', 'p_value',"lower_ci_90","upper_ci_90",
"lower_ci_95","upper_ci_95"))
# if some slots did not obtain any results, create empty NA column.
modelsShouldAvail <- gsub(paste0("_", model.suff), "", private$modelIdentifiers)
modelsUnavail <- modelsShouldAvail[!modelsShouldAvail %in% modelsAvail]
if (length(modelsUnavail) > 0){
colsModelsUnavail <- c(paste0("estimate_",modelsUnavail),paste0("p_value_",modelsUnavail))
wide[, (colsModelsUnavail) := NA]
}
pval.cols <- paste0('p_value_', modelsShouldAvail)# unique(df[['model_grouper']]))
pval.cols.out <- paste0(pval.cols,'_flag')
# create flags indicating whether the p value is below the defined significance level
wide[, (pval.cols.out) := lapply(.SD, function(x) fcase(x<sig,1,default=NA)), .SDcols=pval.cols]
if (length(model.slots) != length(modelsShouldAvail)) stop("unequal lengths")
# multiply each slot estimate by whether estimate is significant or not
wide[, (model.slots) := lapply(modelsShouldAvail, function(x) eval(parse(text=paste0("estimate_",x))) * eval(parse(text=paste0("p_value_", x, "_flag"))))]
# count NAs per row so the minimum-number-of-significant-slots condition can be checked
wide[, count_na := Reduce('+', lapply(.SD, is.na)), .SDcols=model.slots]
# count how many positive/negative coefficients there are for consistency
wide[, has_pos := fifelse(Reduce('|', lapply(.SD, function(x) x>=0))==TRUE,TRUE,FALSE,FALSE),.SDcols=model.slots]
wide[, has_neg := fifelse(Reduce('|', lapply(.SD, function(x) x<0))==TRUE,TRUE,FALSE,FALSE),.SDcols=model.slots]
# flag whether the factor meets the consistency & significance conditions
wide[, should_retain := (!(has_pos==has_neg) & (length(model.slots)-count_na) >= min.slot.sig)]
# calculate row means, min and max
withCallingHandlers({
suppressWarnings(
wide[, `:=` (
slot_avg=fcase(should_retain,rowMeans(.SD, na.rm=TRUE), default=NA),
slot_min=fcase(should_retain,apply(.SD,1,FUN=min,na.rm=TRUE),default=NA),
slot_max=fcase(should_retain,apply(.SD,1,FUN=max,na.rm=TRUE),default=NA)
), .SDcols=model.slots]
)
},
warning=function(w){
return(NULL)
})
# remove some columns
wide[,(pval.cols.out):=NULL]
private$modelResultsWide.all <- wide
return(wide)
},
model.sample.averages=function(only.modelled=TRUE){
# calculate sample averages
sample.avgs <- lapply(names(private$modelRegression.datasets), function(x){
# filter for those records which were modelled.
df <- data.table::copy(private$modelRegression.datasets[[x]])
if (only.modelled){
df <- df[is_complete_case==TRUE]
}
# find columns which are numeric
cols.numeric <- names(which(unlist(lapply(df, is.numeric))))
site.code <- private$esaEDObj$getProviderSite()
# calculate per site code, the mean, min & max of all numeric columns
stats <- df[, as.list(unlist(lapply(.SD, function(y){
list(mean=mean(y,na.rm=TRUE),
min=min(y, na.rm=TRUE),
max=max(y, na.rm=TRUE),
sd=sd(y,na.rm=TRUE))
}))), by=site.code, .SDcols=cols.numeric]
# where there are no values to aggregate, min/max may return Inf/-Inf;
# replace these with NA
invisible(lapply(names(stats),function(.name) set(stats, which(is.infinite(stats[[.name]])),j=.name,value=NA)))
stats.cols.numeric <- names(which(unlist(lapply(stats, is.numeric))))
# calculate the average of all
stats <- stats[, unlist(lapply(.SD, mean, na.rm=TRUE)), .SDcols=stats.cols.numeric]
# create data.table of results
stats <- setDT(as.data.frame(stats), keep.rownames=TRUE)
colnames(stats) <- c('varname', 'value')
# remove the model prefix
model.suff <- private$getModelSuff(private$esaEDObj)
model.slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
stats[, varname := gsub(paste(paste0(model.slots,'_'),collapse='|',sep=''), '',varname)]
stats[, varname := gsub(paste0(model.suff,'_'),'',varname)]
# separate the .min, .max, .mean and .sd suffixes into another column
stats[, metric := .(
fcase(
grepl('.mean', varname, fixed=TRUE), 'mean',
grepl('.max', varname, fixed=TRUE), 'max',
grepl('.min', varname, fixed=TRUE), 'min',
grepl('.sd',varname,fixed=TRUE), 'sd',
default = 'unknown'
)
)]
# remove .mean, .max, .min from varname
stats[, varname := gsub('\\.mean|\\.max|\\.min|\\.sd', '', varname)]
# remove the model suffix, as well as the dependent variable if provided
nameCleanStr <- private$getModelSuff(private$esaEDObj)
if (!is.null(private$dependent)){
nameCleanStr <- paste0(nameCleanStr, "_", private$dependent)
}
slotsName <- gsub(pattern=nameCleanStr, "", x)
stats[, metric := paste0(slotsName,metric)]
# reshape wide so that min, max and mean are in separate columns
statsWide <- dcast(stats, formula=varname~metric, value.var=c('value'), fill=NA)
# calculate for factor variables
cols.factors <- names(which(unlist(lapply(df, is.factor))))
factorCounts <- lapply(cols.factors, function(y){
# calculate counts per factor, per site, then calculate mean, max and
# min of this count overall, per factor level
z <- df[, .(count=.N), by=c(y,site.code)][,.(mean=mean(count, na.rm=TRUE),
max=max(count,na.rm=TRUE),
min=min(count,na.rm=TRUE)),
by=y]
setnames(z, old=colnames(z), new=c('varname', paste0(slotsName,c('mean','max','min'))))
z[, varname := paste0(y,varname)]
return(z)
})
factorSummary <- rbindlist(factorCounts, use.names=TRUE)
factorSummary[,is_factor:=TRUE]
# bind factor summaries to statswide
statsWide <- rbindlist(list(statsWide,factorSummary),use.names=TRUE,fill=TRUE)
# set varname as key for later merges
setkeyv(statsWide, 'varname')
return(statsWide)
})
if (length(sample.avgs)>=2){
for (i in 2:length(sample.avgs)) sample.avgs[[i]][,is_factor:=NULL]
}
# merge each of the slot sample averages together
all.sample.avgs <- Reduce(merge,sample.avgs)
return(all.sample.avgs)
},
getModelSuff=function(obj){
suff <- paste0(ifelse(is.null(obj$getQueue()),obj$getAcuity()$name,
paste0(obj$getQueue()$name,'_',obj$getAcuity()$name)))
return(suff)
},
depVarFromFormula=function(form){
if (attr(terms(as.formula(form)),which='response')){
return(all.vars(form)[1])
}
return(NULL)
}
)
)
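# ---------------------------------------------------------------------------
# Illustrative usage (a sketch only: 'aggregated' and all argument values below
# are hypothetical and not defined in this file):
# model <- ESAModel$new(ESAEDAggregated=aggregated,
#                       model.type='poisfe',
#                       bedOccupancy=c('bedocc_90','bedocc_95'),
#                       slotVars=c('arrivals'),
#                       fixedEffects=c('site'),
#                       printSummary=TRUE)
# results <- model$getResults(sig=0.05, min.slot.sig=3)
# plots <- model$getCoefficientPlots()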