R/ESAModel.R

#' Title: ESAModel
#' Author: Economics and Strategic Analysis Team
#' Date created: 22.09.2021
#' Date modified: 22.09.2021
#' Changelog:
#' - 22.09.21: file created.
#' Description:
#'
ESAModel <- R6Class(
  classname='ESAModel',
  public=list(
    initialize=function(ESAEDAggregated,
                        model.type=c('poisfe', 'olsfe'),
                        args=NULL,
                        bedOccupancy=NULL,
                        slotVars=NULL,
                        nonSlotVars=NULL,
                        fixedEffects=NULL,
                        clusterSE=NULL,
                        printSummary=FALSE,
                        withAME=FALSE,
                        vcovFn=NULL,
                        vcovArgs=NULL,
                        panelID=NULL,
                        dependent=NULL){
      if(!is(ESAEDAggregated, 'ESAEDAggregated')){
        stop('ESAEDAggregated must be ESAEDAggregated Object.')
      }
      # check vcov is function and vcovargs is list if fixed effects is null
      if (is.null(fixedEffects)){
        if (!is.null(vcovArgs)){
          if (!is(vcovArgs,'list')){
            stop('vcovArgs must be list if not null')
          }
        }
      }
      private$esaEDObj <- ESAEDAggregated
      private$bedOccupancyVars <- bedOccupancy
      private$modelType <- model.type
      private$fixedEffects <- fixedEffects
      private$dependent <- dependent
      # get formuli
      formuli <- private$getFormulas(obj=ESAEDAggregated,
                                     bedOccupancy=bedOccupancy,
                                     slotVars=slotVars,
                                     nonSlotVars=nonSlotVars,
                                     fixedEffects = fixedEffects)
      # run model
      private$runModels(obj=ESAEDAggregated,
                        formuli=formuli,
                        model.type=model.type,
                        args=args,
                        printSummary=printSummary,
                        withAME=withAME,
                        fixedEffects=fixedEffects,
                        vcovFn=vcovFn,
                        vcovArgs=vcovArgs,cl=clusterSE,
                        dynPanelID=panelID)
    },
    #' @description This method has not been implemented.
    runScenario=function(scenario){
      if(!is(scenario,'ESAModelScenario')){
        stop('Scenario must be ESAModelScenario object.')
      }
      stop('Not supported')
    },
    #' @description Returns slot
    # results.slot.avg=function(sig=0.1,min.slot.sig=3){
    #   return(private$model.results.avg(sig=sig,min.slot.sig=min.slot.sig))
    # },
    #' @description Get descriptive sample of regression dataset, calculating
    #' the mean and the standard deviation
    #' @param variables character vector - variables for which descriptives desired
    #' @return data.table
    getRegressionDescriptives=function(variables=NULL){
      # loop through all the regression datasets
      results <- lapply(names(private$modelRegression.datasets),function(nm){
        dt <- copy(private$modelRegression.datasets[[nm]])
        # extract numeric columns
        colsEval <- colnames(dt)[unlist(lapply(dt,is.numeric))]
        # if user has supplied specific variables to focus on, select these (use string match)
        if (!is.null(variables)){
          colsEval <- colsEval[grepl(paste(variables,collapse='|',sep=''),colsEval)]
        }
        # calculate averages
        dtMeans <- dt[,lapply(.SD,mean,na.rm=TRUE),.SDcols=colsEval]
        dtMeans[, metric:="mean"]
        dtMeans <- dcast(melt(dtMeans,id.vars='metric'),variable~metric)
        # calculate standard deviations
        dtStdev <- dt[,lapply(.SD,sd,na.rm=TRUE),.SDcols=colsEval]
        dtStdev[, metric:="stddev"]
        dtStdev <- dcast(melt(dtStdev,id.vars="metric"),variable~metric)
        # merge the two together
        dtCombined <- merge(x=dtMeans,y=dtStdev,by="variable",all=TRUE)
        dtCombined[,(nm):=paste0(round(mean,3), " (",round(stddev,3),")")]
        setnames(dtCombined,old=c("mean","stddev"),new=paste0(nm,"_",c("mean","stddev")))
        dtCombined[,variable:=gsub(paste0(nm,"_"),"",variable)]
        dtCombined[,variable:=gsub("_"," ",variable)]
        # set the key for merges of all
        setkeyv(dtCombined,"variable")
        return(dtCombined)
      })
      # merge the results
      dtResult <- Reduce(function(...) merge(...,all=TRUE),results)
      return(dtResult)
    },
    #' @description Get a table of results with some form of derived magniture
    #' @param sig decimal - significance level (e.g. 0.1 for 10%, 0.05 for 5% etc)
    #' @param min.slot.sig integer - the minimum number of slots required for a result to be reported as 'consistent'
    #' @param useSampleAvgs boolean - whether to use regression sample's statistcs, or that of whole sample
    #' @param from string - whether to start at the min/mean/max for the magnitude
    #' @param to string - whether to finish at min/mean/max for the magnitude
    #' @param by integer - default=NULL, whether to use the standard deviation instead, and by how many
    #' @param poisScaleCoeff numeric - a scaling factor applied to Poisson coefficients
    #' @param hideCalc boolean - whether or not to hide calculation columns
    #' @return data.table
    getResults=function(sig=0.1, min.slot.sig=3, useSampleAvgs=TRUE,from='max',to='mean',useSD=NULL,poisScaleCoeff=NULL,hideCalc=TRUE){
      if (private$modelType!='poisfe'&!is.null(poisScaleCoeff)){
        warning('argument poisScaleCoeff is ignored due to invalid model type')
        poisScaleCoeff <- NULL
      }
      resAll <- private$model.results.avg(sig=sig,min.slot.sig=min.slot.sig)
      if (is.null(resAll)){
        warning('There were no results obtained from this model.')
        return(NULL)
      }
      if ((!from %in% c('max','mean','min'))| (!to %in% c('max','mean','min'))){
        warning('from or to arguments invalid. defaulting to from=max,to=mean')
        from <- 'max'
        to <- 'mean'
      }
      if (!is.null(useSD)){
        # check that argument is numeric
        if (!is.numeric(useSD)) stop('useSD argument must be numeric')
        message('Defaulting to using SD rather than from mean to max for example')
      }
      # get sample averages
      sampleAvgs <- private$model.sample.averages(useSampleAvgs)
      slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
      # create a character which is the suffix to magnitude derived columns
      magnitudeCalcSuff <- ifelse(test=is.null(useSD),
                                  yes=paste0('_',from,'_to_',to),
                                  no=paste0('_',useSD,'_stddev'))
      # handle SD and non-SD differently:
      # for non-SD, subtract 'from' metric from the 'to' metric
      # for SD, multiply number of standard deviations with std dev
      # loop through all of the slots, and derive the above...
      for (slt in slots){
        # as it makes no sense to scale factors for this, set all factors to 1
        if (!is.null(useSD)){
          sampleAvgs[,(paste0(slt,magnitudeCalcSuff)):=fcase(
            # set factors to 1
            is_factor==TRUE,as.numeric(1),
            # is not factor and in SD mode
            (is_factor!=TRUE|is.na(is_factor)),as.numeric((sampleAvgs[[paste0(slt,'_sd')]]*useSD))
          )]
        } else {
          sampleAvgs[,(paste0(slt,magnitudeCalcSuff)):=fcase(
            # set factors to 1
            is_factor==TRUE,as.numeric(1),
            # is not factor and in non-SD mode
            (is_factor!=TRUE|is.na(is_factor)),as.numeric((sampleAvgs[[paste0(slt,'_',to)]]-sampleAvgs[[paste0(slt,'_',from)]]))
          )]
        }
      }
      # merge sample averages with main results
      resAll <- merge.data.table(x=resAll,y=sampleAvgs,by='varname',all.x=TRUE)
      # if poisson model and no scaling factors presented, or ols, multiply
      # slot with scaling factor
      outCols <- paste0(slots,paste0(magnitudeCalcSuff,'_coeff_scaled'))
      if ((private$modelType=='poisfe'&is.null(poisScaleCoeff))|private$modelType=='olsfe'){
        resAll[,(outCols):=lapply(slots,function(x) resAll[[paste0(x,magnitudeCalcSuff)]]*resAll[[x]])]
      } else if(private$modelType=='poisfe'&!is.null(poisScaleCoeff)){
        if (!is.data.table(poisScaleCoeff)){
          stop('scale table is not data.table. require varname and scalefactor col')
        }
        if (('varname'%in%colnames(poisScaleCoeff))&('scalefactor'%in%colnames(poisScaleCoeff))){
          # merge scale table
          resAll <- merge(x=resAll,y=poisScaleCoeff,by='varname',all.x=TRUE)
          # fill na with 1
          resAll[,scalefactor:=fifelse(is.na(scalefactor),1,scalefactor)]
          # multiply sslot with scale factor
          resAll[,(outCols):=lapply(slots,function(x) resAll[[x]]*resAll[['scalefactor']])]
        } else {
          stop('scale table requires varname and scalefactor col')
        }
      }
      # calculate poisson multiplicative - exponentiate scaled coefficients (and slot var)
      if (private$modelType=='poisfe'){
        poisIrrCols <- (paste0(c(slots,outCols),'_pois_irr'))
        resAll[,(poisIrrCols):=lapply(.SD,exp),.SDcols=c(slots,outCols)]
        resAll[,(paste0(c(slots,outCols),'_pois_irr_perc')):=lapply(.SD,function(x) (x-1)*100),
               .SDcols=(poisIrrCols)]
      }
      # check if need to hide certain columns from the results table
      if (hideCalc){
        resAll[,c("count_na","has_pos","has_neg","should_retain","slot_avg",
                  "slot_min","slot_max","is_factor"):=NULL]
      }
      return(resAll)
    },
    #' @description Create coefficient plots
    #' @param exclusions character vector - variables to exclude from plotting
    #' @return list of ggplot objects
    getCoefficientPlots=function(exclusions=NULL){
      # create coefficient plots for all of the models
      # get models
      models <- unique(private$modelResults.all$model_key)
      plots <- lapply(models, function(x){
        df <- private$modelResults.all[model_key==x]
        # remove rows relating to bed occupancy
        if (!is.null(private$bedOccupancyVars)){
          df <- df[!grepl(pattern=paste(private$bedOccupancyVars,collapse='|'),factor),]
        }
        # remove slot/acuity/queue prefix
        df[, factor := gsub('_',' ', gsub(paste0(x,'_'),'',factor))]
        # remove any exlusions if there are any
        if (!is.null(exclusions)){
          df <- df[!grepl(pattern=paste(exclusions,collapse='|',sep=''),x=factor)]
        }
        # create coefficient plots
        plot <- ggplot(df, aes(x=df[['factor']], y=df[['estimate']])) +
                geom_hline(yintercept = 0, colour=gray(1/2), lty= 2) +
                geom_point(aes(x=df[['factor']], y=df[['estimate']])) +
                geom_linerange(aes(x=df[['factor']], ymin=df[['lower_ci_90']], ymax=df[['upper_ci_90']]), lwd=1) +
                geom_linerange(aes(x=df[['factor']], ymin=df[['lower_ci_95']], ymax=df[['upper_ci_95']]), lwd=1/2) +
                ggtitle(paste0('Coefficient plot for ',x)) + coord_flip() + ylab('coefficient') + xlab('variable')
        print(plot)
        return(plot)
      })
      names(plots) <- models
      return(plots)
    },
    #' @description Create various bed occupancy plots for the bed occupancy
    #' variables for the models
    #' @param sig decimal - significance level (0.1=10%, 0.05=5%)
    #' @param min.slot.sig integer - minimum number of slots where signifance occurs for the result to be 'consistent'
    #' @param useSampleAvgs boolean - whether to use regression sample, or overall sample for deriving descriptive statistics
    #' @param groupSize integer - how many percentage increments to group together (e.g. groupSize=5 would have 96,97,98,99,100)
    #' @param reverse boolean - whether to reverse the ordering of the chart
    #' @return list of data.table and ggplot objects.
    getBedOccupancyPlots=function(sig=0.1, min.slot.sig=3, useSampleAvgs=TRUE, groupSize=5, reverse=FALSE){
      # get sample averages
      sampAvgs <- private$model.sample.averages(useSampleAvgs)
      # get results
      results <- private$model.results.avg(sig=sig, min.slot.sig=min.slot.sig)
      if (is.null(results)){
        warning('There were no results obtained from this model')
        return(NULL)
      }
      bedOccVars <- private$bedOccupancyVars
      if (is.null(bedOccVars)){
        warning('There are no bed occupancy variables set.')
        return(NULL)
      }
      # get the sample average for the dependent variable of the model in
      # each of the 6 slots
      modelName <- private$getModelSuff(private$esaEDObj)
      slots <- private$esaEDObj$getSlots()
      slotsNames <- paste0(unlist(lapply(slots,function(x) x$slotID)))
      slotsNamesMeans <- paste0(slotsNames,'_mean')
      depAvgs <- as.list(sampAvgs[varname==modelName,][,..slotsNamesMeans])
      # mean of all the slot means for dep
      meanDepAvgs <- mean(unlist(depAvgs),na.rm=TRUE)
      # columns to retain from results for this calculation
      resultsCols <- c('varname', slotsNames)
      results <- results[,..resultsCols]
      # empty list to store all plots in
      allPlots <- list()
      # create charts for each different bed occupancy variable
      for (occ in bedOccVars){
        # filter for results containing bed occupancy variable in varname
        df <- results[grepl(occ,varname)]
        # calculate the percentage difference with the slot sample average for
        # each slot
        df[, (paste0(slotsNames,'_perc_eff')) := lapply(slotsNames, function(x) (df[[x]]+depAvgs[[paste0(x,'_mean')]])/depAvgs[[paste0(x,'_mean')]])]
        # calculate average effect across slots, as well as min and max
        withCallingHandlers({
          suppressWarnings(
            df[, `:=` (
              occ_avg_perc=rowMeans(.SD, na.rm=TRUE),
              occ_min_perc=apply(.SD,1,FUN=min,na.rm=TRUE),
              occ_max_perc=apply(.SD,1,FUN=max,na.rm=TRUE)
            ), .SDcols=paste0(slotsNames,'_perc_eff')]
          )
        })
        # calculations for min/max chart, get effect by subtracting 1
        df[, (c('occ_avg','occ_min','occ_max')) := lapply(.SD, function(x) ifelse(x>=1,(x-1)*100,(1-x)*100)),
           .SDcols=c('occ_avg_perc','occ_min_perc','occ_max_perc')]
        # remove bed occupancy text from varname [as R for factor will set bedOcc1,bedOcc2]
        # note this will extract the first numeric chars found and use for ordering
        df[, 'kTempName' := as.numeric(sub('\\D*(\\d+).*', '\\1', varname))]
        # order table
        setorderv(df,c('kTempName'),order=ifelse(reverse==TRUE,-1,1))
        # set a row number for index
        df[, 'kIndex' := .I]
        #### plot overall bed occupancy chart
        mainChart <- ggplot(df) +
          theme_classic() +
          theme(axis.text.x=element_text(angle=90, vjust=0),
                panel.grid.major.y = element_line(colour='grey'),
                panel.grid.minor.y=element_line(colour='lightgrey'),
                plot.title=element_text(hjust=0.5)) +
          labs(y='Percentage change in average crowding metric',
               x='Bed occupancy intervals') +
          # add title for plot
          ggtitle(label=paste0(modelName,' ', occ, ' Bed Occupancy Chart'),
                  subtitle = 'The blue line represents the average effect of bed occupancy on crowding across the slots. The error bars represent the maximum and minimum effects on crowding from the individual slots.') +
          # create line for chart
          geom_path(aes(x=kIndex,
                        y=occ_avg,
                        group=1),
                    size=2,
                    colour='steelblue') +
          # create error bars, which are just the min&max across slots
          geom_errorbar(aes(x=kIndex,
                            ymin=occ_min,
                            ymax=occ_max),
                        colour='grey',
                        size=1,
                        width=.5,
                        position=position_dodge(.9)) +
          scale_x_discrete(limits=df$varname, labels=df$varname) +
          # black circle on each y point
          geom_point(aes(x=kIndex,y=occ_avg), size=2.1, colour='black')
        # add plot to return list
        allPlots[[paste0(occ,'_main')]] <- mainChart
        # add data powering plot to return list
        allPlots[[paste0(occ,'_main_data')]] <- df

        ###### derive waterfall bed occupancy charts

        # calculate buckets to group into
        indexVec <- 1:nrow(df)
        # number of rows must be divisible by number of groups
        if (nrow(df)%%groupSize != 0){
          warning(paste0('Cannot reconcile groups for waterfall. Levels: ',nrow(df)))
        } else{
          groupingsIdx <- c()
          currentGroup <- 1
          for (i in indexVec){
            groupingsIdx <- c(groupingsIdx,currentGroup)
            if (i%%groupSize==0){
              currentGroup <- currentGroup+1
            }
          }
          # create temporary table of groupings
          groupingsDF <- data.table(index=indexVec,group=groupingsIdx)
          # join this to df
          df <- merge.data.table(x=df,y=groupingsDF,by.x='kIndex',by.y='index',all.x=TRUE)
          # create grouping name per group
          df[, 'kGroupName' := paste0(min(kTempName),'-',max(kTempName)),by='group']
          # group by group and find mean of average bed occupancy
          waterfallDF <- df[, .(average=mean(occ_avg_perc,na.rm=TRUE)), by=c('kGroupName')]
          # if there are any infinite or na values, skip from the bed occ loop
          if (sapply(waterfallDF, function(x) sum(is.na(x)|is.infinite(x)))[['average']]>0){
            warning('Could not plot waterfall due to NAs/Infs')
            next
          }
          # multiply percentage by average of number of people in each slot
          waterfallDF[, 'effect' := average*meanDepAvgs]
          # add start and end point to results [for the start and end bars]
          x.all <- rbindlist(
            list(
              list('baseline',NA,meanDepAvgs),
              waterfallDF,
              list('max', NA, waterfallDF[nrow(waterfallDF),]$effect)
              )
            )
          # add xmin xmax for bars
          x.all[, 'xmin' := fcase(.I <= nrow(x.all),.I,default=NA)]
          x.all[, 'xmax' := xmin+1]
          # add ymin and ymax
          x.all[, 'ymin' := effect]
          x.all[, 'ymax' := fcase(!is.na(shift(effect)),shift(effect, n=1),default=0)]
          x.all[nrow(x.all), 'ymin':=0]
          x.all[, 'bar_cat' := fcase(.I==nrow(x.all), 'end', .I==1,'start', default='value')]
          # difference in values
          x.all[, 'difference' := effect-shift(effect,n=1)]
          x.all[, 'kGroupName' := as.factor(kGroupName)]
          # calculate effect of moving from baseline to max, and in % terms
          effectStart <- x.all[1,]$effect
          effectEnd <- x.all[nrow(x.all),]$effect
          effectDiff <- round(effectEnd-effectStart,0)
          effectPercDiff <- round(((effectEnd-effectStart)/effectStart)*100,0)
          effectDesc <- paste0(effectDiff, ifelse(effectDiff<0,' fewer',' more'), ' patients (',effectPercDiff,'%)')
          # width of bars
          w <- 0.5
          # create the waterfall plot
          plot <- ggplot(x.all) +
            theme_classic() +
            theme(legend.position='none',
                  panel.grid.major.y=element_line(colour='grey'),
                  panel.grid.minor.y = element_line(colour='lightgrey'),
                  axis.text.x=element_text(angle=0),
                  plot.title=element_text(hjust=0.5)) +
            labs(y='Average number of patients',
                 x='Bed Occupancy Intervals',
                 title=paste0(modelName,' ', occ, ' Waterfall Bed Occupancy Chart')) +
            # plot the individual bars for each bed ocupancy level
            geom_rect(aes(xmin=xmin-w/2,xmax=xmin+w/2,ymin=ymin,ymax=ymax, fill=bar_cat)) +
            # create a line between each of the bars
            geom_segment(data=x.all[1:(nrow(x.all)-1),], aes(x=xmin-w/2,xend=xmax+w/2,y=ymin,yend=ymin),size=0.7) +
            # display total number of people in crowding based on bed occupancy level (vs baseline)
            geom_text(aes(x=xmin,y=effect+1,label=round(effect,1))) +
            # show difference in effect between each step
            geom_text(aes(x=xmin-w,y=effect+0.5,label=round(difference,1)),size=3) +
            scale_x_discrete(limits=x.all$kGroupName, labels=x.all$kGroupName) +
            # create fill for the bars
            scale_fill_manual(values=(c('end'='#005EB8','value'='steelblue', 'start'='grey'))) +
            # add some padding to graph
            scale_y_continuous(expand=expansion(mult=c(0,0.1))) +
            # create dotted line from starting effect (sample mean of y), to max [i.e when bed occ 100%]
            geom_hline(yintercept = effectStart, linetype='dotted',colour='#7C2855') +
            geom_hline(yintercept = effectEnd, linetype='dotted',colour='#7C2855') +
            annotate('text', x=nrow(x.all), y=effectEnd+3,label=effectDesc, colour='#7C2855')
          # add plot to return list
          allPlots[[paste0(occ,'_waterfall')]] <- plot
          # add plot data to returnlist
          allPlots[[paste0(occ,'_waterfall_data')]] <- x.all
        }
      }
      return(allPlots)
    },
    #' @description Retrieve the datasets which were used in model estimation.
    #' note this henceforth means that these are the complete cases of all variables
    #' included in the model.
    #' @return list of data.table objects.
    getRegressionDatasets=function(){
      return(private$modelRegression.datasets)
    },
    #' @description Retrive the sample averages of variables in the model.
    #' @param bo a boolean.
    #' @return list of data.table objects.
    getSampleAvgs=function(bo=TRUE){
      return(private$model.sample.averages(bo))
    },
    #' @description Return a regression table based of the fixest implementation.
    #' This combines all of the different models (e.g. across slots).
    #' @param exclusions - character vector of variables to exclude from the table
    #' @return data.frame
    getRegressionTable=function(exclusions=NULL){
      u <- private$modelObjects
      if (length(u)==0|is.null(u)) return(NULL)
      # get all variables across all models
      allVars <- unlist(lapply(u, function(x) rownames(x$coeftable)),use.names = FALSE)
      # get all the slot names
      slots <- unlist(lapply(private$esaEDObj$getSlots(),function(x) x$slotID))
      model.suff <- private$getModelSuff(private$esaEDObj)
      # remove slot & model identifier from all variables (so they align in table)
      repl <- paste(paste0(slots,'_',model.suff,'_'),collapse='|',sep='')
      toRepl <- allVars[grepl(repl,allVars)]
      subRepl <- gsub(repl,'',toRepl)
      # create dictionary for etable
      names(subRepl) <- toRepl
      if (!length(subRepl)>0){
        subRepl <- NULL
      }
      # create exclusions string if there are any
      exclude <- ifelse(is.null(exclusions),'!',paste(paste0('^',exclusions),collapse='|',sep=''))
      # some vectors of model statistics
      extralineList <- list()
      omitColinear <- unlist(lapply(u,function(x) paste(x$collin.var,collapse=',',sep='')))
      numOmitColinear <- unlist(lapply(u,function(x) length(x$collin.var)))
      feSize <- unlist(lapply(u,function(x) x$fixef_sizes))
      aic <- unlist(lapply(u,function(x) AIC(x)))
      bic <- unlist(lapply(u,function(x) BIC(x)))
      if (!is.null(omitColinear)) extralineList[['^_Omitted due to collinearity']] <- omitColinear
      if (!is.null(numOmitColinear)) extralineList[['^_Number omitted due to collinear']] <- numOmitColinear
      if (!is.null(feSize)) extralineList[['^_Fixed effect size']] <- feSize
      if (!is.null(aic)) extralineList[['^_AIC']] <- aic
      if (!is.null(bic)) extralineList[['^_BIC']] <- bic
      # return a data.frame via fixest's etable method, with some additional stats
      return(etable(u,drop=exclude,
                    extralines=extralineList,
                    dict = subRepl))
    },
    #' @description Returns a table of derived average marginal effects. Note
    #' that this requires that the user has specified to derive them on initialization.
    #' @return data.table or NULL.
    getAMEs = function(){
      return(private$modelAMEs)
    },
    #' @description Return a list of fixest model objects.
    #' @return list of fixest model objects.
    getModelObjs=function(){
      return(private$modelObjects)
    }
  ),
  private=list(
    esaEDObj=NULL,
    modelResults.all=NULL,
    modelResultsWide.all=NULL,
    modelRegression.datasets=NULL,
    bedOccupancyVars=NULL,
    modelType=NULL,
    modelObjects=list(),
    modelAMEs=NULL,
    fixedEffects=NULL,
    dependent=NULL,
    modelIdentifiers=NULL,
    #' @description check whether
    doesCustomDepExist=function(obj){
      # check that the dependent private attribute is not null
      if (is.null(private$dependent)) stop("The private attribute dependent cannot be null")
      # retrieve the model suffix e.g. acuity x queue
      modelSuff <- private$getModelSuff(obj)
      slots <- lapply(obj$getSlots(), function(x) x$slotID)
      # check whether the dependent variable exists across all slots
      proposedDeps <- paste0(slots,"_",modelSuff,"_",private$dependent)
      doProposedExist <- Reduce("&", lapply(proposedDeps, function(x) x%in%colnames(obj$data)))
      response <- list(
        doProposedExist = doProposedExist,
        depVarPrefix = paste0(slots,"_",modelSuff)
      )
      return(response)
    },
    #' @description Return a list of formulas for each of the slot models
    #' @param obj ESAEDAggregated
    #' @param bedOccupancy character vector of bed occupancy variables
    #' @param slotVars character vector of slot-based variables
    #' @param nonSlotVars character vector of non-slot-based variables
    #' @param fixedEffects character vector of fixed effects
    #' @return list of formula objects
    getFormulas=function(obj, bedOccupancy=NULL, slotVars=NULL, nonSlotVars=NULL,fixedEffects=NULL){
      # method to get formulas for the various slot models. return labeled
      # list containing a formula object.
      queue <- obj$getQueue()
      acuity <- obj$getAcuity()
      slots <- obj$getSlots()
      # model suffix (ie acuity x queue)
      model.suff <- private$getModelSuff(obj)
      otherAcuities <- acuity$other.acuities()
      formuli <- list()
      # iterate through the slots
      for (slot in slots){
        if (is.null(private$dependent)){
          dep <- paste0(slot$slotID,'_',model.suff)
          # get other acuities
          if (!is.null(otherAcuities)){
            otherAccsPrefix <- paste0(slot$slotID, ifelse(is.null(queue),'', paste0('_', queue$name)), '_')
            indep.otheraccs <- paste0(otherAccsPrefix, otherAcuities)
          } else {
            indep.otheraccs <- NULL
          }
          indep.slots <- unlist(lapply(slotVars, function(x) paste0(dep,'_',x)))
          indep.all <- c(indep.otheraccs,indep.slots,bedOccupancy,nonSlotVars)
        } else {
          dep <- paste0(slot$slotID, "_", model.suff, "_", private$dependent)
          dummy <- paste0(slot$slotID,'_',model.suff)
          indep.slots <- unlist(lapply(slotVars,function(x) paste0(dummy,'_',x)))
          indep.all <- c(indep.slots, bedOccupancy, nonSlotVars)
        }
        # check all variables are in the data
        #if(!searchWithin(search=c(dep,indep.all), colnames(obj$data),quietly=FALSE)){
        #  stop('Could not identify a variable in the data.')
        #}
        form <- as.formula(paste0(dep,'~',paste(indep.all, collapse='+',sep=''), ifelse(is.null(fixedEffects),'',paste0('|', paste(fixedEffects, collapse = '+')))))
        formuli[[dep]] <- form
      }
      return(formuli)
    },
    runModels=function(obj,formuli,model.type,args,printSummary,withAME,fixedEffects,vcovFn,vcovArgs,cl,dynPanelID){
      # create empty results dataframe to store results of all the models...
      results.cols <- c('factor','estimate','std_err','t_value','p_value',
                        'lower_ci_90','upper_ci_90','lower_ci_95','upper_ci_95',
                        'model_key')
      reg.df <- data.table::copy(obj$data)
      reg.df[, paste0(obj$getProviderSite()) := as.factor(reg.df[[obj$getProviderSite()]])]
      # warning that the package has not been tested with 1 site
      if (length(unique(reg.df[[obj$getProviderSite()]])) == 1){
        warning("The model has not been verified in the situation of one fixed effect (site).")
      }
      # force clustering on final_date if there is only one site
      if (length(unique(reg.df[[obj$getProviderSite()]]))==1 & is.null(cl) & is.null(fixedEffects)){
        cl <- 'final_date'
      }
      # check whether the proposed dependent variables exist; if not create them
      # Note this works on the basis that the dependent is not a slot-specific var
      if (!is.null(private$dependent)){
        customDepHandle <- private$doesCustomDepExist(obj = obj)
        if (!customDepHandle[["doProposedExist"]]){
          # check whether the dependent variable exists in the data
          if (!private$dependent %in% colnames(reg.df)) stop(paste0("Could not find ", private$dependent, " in the data."))
          # create the required variables
          depVarPrefix <- customDepHandle[["depVarPrefix"]]
          newDepVar <- paste0(depVarPrefix, "_", private$dependent)
          reg.df[, (newDepVar) := lapply(1:length(newDepVar), function(i) reg.df[[private$dependent]])]
        }
      }
      # the names of all formuli are used as the key for each model
      private$modelIdentifiers <- names(formuli)
      # run all the models within the list of formuli
      model.results <- lapply(names(formuli), function(x){
        model <- tryCatch({
          message(paste0('running model... ',x))
          clusVar <- fixedEffects
          if (model.type=='poisfe'){
            model <- do.call(fepois,args=append(list(fml=formuli[[x]],data=reg.df,cluster=clusVar),
                                                args),envir=environment())
          } else if (model.type=='olsfe'){
            model <- do.call(feols,args=append(list(fml=formuli[[x]],data=reg.df,cluster=clusVar),
                                               args),envir=environment())
          } else if (grepl('^dyn',model.type)){
            # for both poisson and ols dynamic panel model
            message('Results may be biased for large N and small to larger T (Nickell bias).')
            message('If lagged dependent variable correlates with independents, estimates for independents biased.')
            if (is.null(dynPanelID)){
              stop("Must specify panel dimensions as char vector c('id','time')")
            }
            # only support a 1 lag on the dependent at the moment
            depVar <- private$depVarFromFormula(formuli[[x]])
            if (is.null(depVar)){
              # check is not null, otherwise stop
              stop('Could not identify a dependent variable')
            }
            listArgs <- append(
              list(fml=update(copy(formuli[[x]]),paste0('~+l(',depVar,')+.')),
                   data=reg.df,
                   cluster=clusVar,
                   panel.id=dynPanelID),
              args
            )
            # run either poisson or ols fe (ldv)
            if (model.type=='dynolsfe'){
              model <- do.call(feols,args=listArgs,envir=environment())
            } else if (model.type=='dynpoisfe'){
              model <- do.call(fepois,args=listArgs,envir=enviroment())
            } else {
              stop('Invalid dynamic panel model detected')
            }
          } else {
            stop('Invalid model detected.')
          }
        },
        error=function(cond){
          message(paste0('an error occured with this model: ',x))
          message(cond)
          return(NULL)
        },
        warning=function(cond){
          message(paste0('a warning occured with this model: ',x))
          message(cond)
          return(NULL)
        })
        if (!is.null(model)){
          res <- NULL
          if (!is.null(fixedEffects)|(is.null(fixedEffects)&is.null(vcovFn))){
            # if fixed effects in use, then use summary from fixest package
            if (printSummary){
              print(summary(model))
              print(model$collin.var)
            }
            res <- cbind(coeftable(model), confint(model,level=0.9))
            res <- cbind(res, confint(model,level=0.95))
          } else if (is.null(fixedEffects)&is.null(cl)&!is.null(vcovFn)){
            # pass thru a variance co-variance function
            vcovNew <- do.call(vcovFn,args=append(list(x=model),vcovArgs),envir=environment())
            vcovRes <- lmtest::coeftest(model,vcov=vcovNew)
            if (printSummary){
              print(vcovRes)
              print(model$collin.var)
            }
            res <- cbind(vcovRes, confint(vcovRes,level=0.9))
            res <- cbind(res, confint(vcovRes,level=0.95))
          }
          # create data.table of results
          res <- setDT(as.data.frame(res), keep.rownames=TRUE)[]
          res[, model_key := paste0(x)]
          # rename columns to match the results table
          setnames(res, old=colnames(res), new=results.cols)
          # store model objects
          private$modelObjects[[x]] <- model
          return(res)
        }
        return(NULL)
      })
      # get the equivalent datasets used in the regression model. Checking
      # for complete cases across all the variables included in the formula
      # is how this is achieved.
      model.complete.df <- lapply(names(formuli), function(x){
        #model.df <- data.table::copy(obj$data)
        model.df <- data.table::copy(reg.df)
        model.vars <- all.vars(formuli[[x]])
        cols.retain <- unique(c(obj$getProviderSite(),'final_date',model.vars))
        model.df <- model.df[,..cols.retain]
        model.df[,is_complete_case:=FALSE]
        model.df[complete.cases(model.df[,..model.vars]),is_complete_case:=TRUE]
        return(model.df)
      })
      names(model.complete.df) <- names(formuli)
      private$modelRegression.datasets <- model.complete.df
      # Derive the average marginal effects, if specified by the user. Use
      # the complete cases dataset derived above to calculate
      listAMES <- list()
      if (withAME){
        # check whether marginaleffects package is installed
        if (!requireNamespace("marginaleffects", quietly=TRUE)){
          stop("The package marginaleffects is required to derive.")
        }
        # loop through models
        for (nm in names(private$modelObjects)){
          # get model object
          mdl <- private$modelObjects[[nm]]
          # check is not null
          if (!is.null(mdl)){
            # calculate average marginal effects, using the regression sample for that model
            # if poisson model, use link, otherwise if OLS then use reponse type
            mfx <- marginaleffects(model=mdl,
                                   newdata=model.complete.df[[nm]][is_complete_case==TRUE],
                                   type=ifelse(grepl('pois',model.type),'link','response'))
            if (printSummary){
              print(summary(mfx))
            }
            # add to list of average marginal effects
            listAMES[[nm]] <- setDT(tidy(mfx))[, model:=nm]
          }
        }
      }
      # union all the average marginal effects
      private$modelAMEs <- rbindlist(listAMES,use.names=TRUE)
      # combine the model results
      # drop those outputs which are null before unioning the model results.
      model.results <- model.results[!sapply(model.results, is.null)]
      results <- rbindlist(model.results, use.names=TRUE)
      private$modelResults.all <- results
    },
    model.results.avg=function(sig=0.1, min.slot.sig=3,poisScaleCoeff=NULL){
      # minimum of slots that should be significant in must be less or equal to
      # the number of slots
      if (min.slot.sig > length(private$esaEDObj$getSlots())){
        stop('Minimum slots to define as significant must not exceed the number of slots.')
      }
      df <- data.table::copy(private$modelResults.all)
      # check if all model results table is null, or empty, and return null if so
      if (is.null(df)|(!is.null(df)&(nrow(df)==0|length(colnames(df))==0))){
        warning('No results found')
        return(NULL)
      }
      # remove slot specific prefixes for factor names
      model.suff <- private$getModelSuff(private$esaEDObj)
      model.slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
      df[, varname := gsub(paste(paste0(model.slots,'_'),collapse='|',sep=''), '',factor)]
      df[, varname := gsub(paste0(model.suff,'_'),'',varname)]
      # reshape wide
      df[, model_grouper := gsub(paste0('_',model.suff),'',model_key)]
      # get unique list of models identified
      modelsAvail <- unique(df$model_grouper)
      wide <- dcast(df, formula=varname~model_grouper, fill=NA,
                    value.var=c('estimate', 'p_value',"lower_ci_90","upper_ci_90",
                                "lower_ci_95","upper_ci_95"))
      # if some slots did not obtain any results, create empty NA column.
      modelsShouldAvail <- gsub(paste0("_", model.suff), "", private$modelIdentifiers)
      modelsUnavail <- modelsShouldAvail[!modelsShouldAvail %in% modelsAvail]
      if (length(modelsUnavail) > 0){
        colsModelsUnavail <- c(paste0("estimate_",modelsUnavail),paste0("p_value_",modelsUnavail))
        wide[, (colsModelsUnavail) := NA]
      }
      pval.cols <- paste0('p_value_', modelsShouldAvail)# unique(df[['model_grouper']]))
      pval.cols.out <- paste0(pval.cols,'_flag')
      # create flags whether p value is less than significance level define
      wide[, (pval.cols.out) := lapply(.SD, function(x) fcase(x<sig,1,default=NA)), .SDcols=pval.cols]
      if (length(model.slots) != length(modelsShouldAvail)) stop("unequal lengths")
      # multiply each slot estimate by whether estimate is significant or not
      wide[, (model.slots) := lapply(modelsShouldAvail, function(x) eval(parse(text=paste0("estimate_",x))) * eval(parse(text=paste0("p_value_", x, "_flag"))))]
      # create flag if meets the minumum number of significant if if count NA <
      wide[, count_na := Reduce('+', lapply(.SD, is.na)), .SDcols=model.slots]
      # count how many positive/negative coefficients there are for consistency
      wide[, has_pos := fifelse(Reduce('|', lapply(.SD, function(x) x>=0))==TRUE,TRUE,FALSE,FALSE),.SDcols=model.slots]
      wide[, has_neg := fifelse(Reduce('|', lapply(.SD, function(x) x<0))==TRUE,TRUE,FALSE,FALSE),.SDcols=model.slots]
      # flag whether the factor meets consistency & significant condition
      wide[, should_retain := (!(has_pos==has_neg) & (length(model.slots)-count_na) >= min.slot.sig)]
      # calculate row means, min and max
      withCallingHandlers({
        suppressWarnings(
          wide[, `:=` (
            slot_avg=fcase(should_retain,rowMeans(.SD, na.rm=TRUE), default=NA),
            slot_min=fcase(should_retain,apply(.SD,1,FUN=min,na.rm=TRUE),default=NA),
            slot_max=fcase(should_retain,apply(.SD,1,FUN=max,na.rm=TRUE),default=NA)
          ), .SDcols=model.slots]
        )
      },
      warning=function(){
        return(NULL)
      })
      # remove some columns
      wide[,(pval.cols.out):=NULL]
      private$modelResultsWide.all <- wide
      return(wide)
    },
    model.sample.averages=function(only.modelled=TRUE){
      # calculate sample averages
      sample.avgs <- lapply(names(private$modelRegression.datasets), function(x){
        # filter for those records which were modelled.
        df <- data.table::copy(private$modelRegression.datasets[[x]])
        if (only.modelled){
          df <- df[is_complete_case==TRUE]
        }
        # find columns which are numeric
        cols.numeric <- names(which(unlist(lapply(df, is.numeric))))
        site.code <- private$esaEDObj$getProviderSite()
        # calculate per site code, the mean, min & max of all numeric columns
        stats <- df[, as.list(unlist(lapply(.SD, function(y){
          list(mean=mean(y,na.rm=TRUE),
               min=min(y, na.rm=TRUE),
               max=max(y, na.rm=TRUE),
               sd=sd(y,na.rm=TRUE))
        }))), by=site.code, .SDcols=cols.numeric]
        # where no levels to average/max; min/max may return Inf/-Inf, remove and
        # replace with NA
        invisible(lapply(names(stats),function(.name) set(stats, which(is.infinite(stats[[.name]])),j=.name,value=NA)))
        stats.cols.numeric <- names(which(unlist(lapply(stats, is.numeric))))
        # calculate the average of all
        stats <- stats[, unlist(lapply(.SD, mean, na.rm=TRUE)), .SDcols=stats.cols.numeric]
        # create data.table of results
        stats <- setDT(as.data.frame(stats), keep.rownames=TRUE)
        colnames(stats) <- c('varname', 'value')
        # remove the model prefix
        model.suff <- private$getModelSuff(private$esaEDObj)
        model.slots <- unlist(lapply(private$esaEDObj$getSlots(), function(x) x$slotID))
        stats[, varname := gsub(paste(paste0(model.slots,'_'),collapse='|',sep=''), '',varname)]
        stats[, varname := gsub(paste0(model.suff,'_'),'',varname)]
        # seperate the .min, .max, and .mean into another column
        stats[, metric := .(
          fcase(
            grepl('.mean', varname, fixed=TRUE), 'mean',
            grepl('.max', varname, fixed=TRUE), 'max',
            grepl('.min', varname, fixed=TRUE), 'min',
            grepl('.sd',varname,fixed=TRUE), 'sd',
            default = 'unknown'
          )
        )]
        # remove .mean, .max, .min from varname
        stats[, varname := gsub('\\.mean|\\.max|\\.min|\\.sd', '', varname)]
        # remove the model suffix, as well as the dependent variable if provided
        nameCleanStr <- private$getModelSuff(private$esaEDObj)
        if (!is.null(private$dependent)){
          nameCleanStr <- paste0(nameCleanStr, "_", private$dependent)
        }
        slotsName <- gsub(pattern=nameCleanStr, "", x)
        stats[, metric := paste0(slotsName,metric)]
        # reshape wide so that min, max and mean are in seperate columns
        statsWide <- dcast(stats, formula=varname~metric, value.var=c('value'), fill=NA)
        # calculate for factor variables
        cols.factors <- names(which(unlist(lapply(df, is.factor))))
        factorCounts <- lapply(cols.factors, function(y){
          # calculate counts per factor, per site, then calculate mean, max and
          # min of this count overall, per factor level
          z <- df[, .(count=.N), by=c(y,site.code)][,.(mean=mean(count, na.rm=TRUE),
                                                       max=max(count,na.rm=TRUE),
                                                       min=min(count,na.rm=TRUE)),
                                                    by=y]
          setnames(z, old=colnames(z), new=c('varname', paste0(slotsName,c('mean','max','min'))))
          z[, varname := paste0(y,varname)]
          return(z)
        })
        factorSummary <- rbindlist(factorCounts, use.names=TRUE)
        factorSummary[,is_factor:=TRUE]
        # bind factor summaries to statswide
        statsWide <- rbindlist(list(statsWide,factorSummary),use.names=TRUE,fill=TRUE)
        # set varname as key for later merges
        setkeyv(statsWide, 'varname')
        return(statsWide)
      })
      if (length(sample.avgs)>=2){
        for (i in 2:length(sample.avgs)) sample.avgs[[i]][,is_factor:=NULL]
      }
      # merge each of the slot sample averages together
      all.sample.avgs <- Reduce(merge,sample.avgs)
      return(all.sample.avgs)
    },
    getModelSuff=function(obj){
      suff <- paste0(ifelse(is.null(obj$getQueue()),obj$getAcuity()$name,
                    paste0(obj$getQueue()$name,'_',obj$getAcuity()$name)))
      return(suff)
    },
    depVarFromFormula=function(form){
      if (attr(terms(as.formula(form)),which='response')){
        return(all.vars(form)[1])
      }
      return(NULL)
    }
  )
)
nhsengland/ESA_ED_Crowding documentation built on Nov. 23, 2022, 4:49 p.m.