R/mod_GlimmaR_navigate.R

Defines functions calc_terms_importances offset_importances predict_offsets extract_offsets predict_tabulations band_var_with_feature_spec adjust_base_levels predict_on_tabulations prepare_glm_tabulations position feature_banding uncentered_vars uncentered_terms_new summarise_glm_vars return_banded_levels export_model get_base_risk GlimmaR_model_summary_row GlimmaR_model_summary mod_GlimmaR_navigate_server mod_GlimmaR_navigate_ui

#' navigateGlimmaR UI Function
#'
#' @description A shiny Module.
#'
#' @param id,input,output,session Internal parameters for {shiny}.
#'
#' @noRd 
#'
#' @importFrom shiny NS tagList 
#' @importFrom DT DTOutput
mod_GlimmaR_navigate_ui <- function(id){
  ns <- NS(id)
  tagList(
    fluidRow(
      column(
        width = 4,
        h3('GlimmaR model summary')
      ),
      column(width = 8,
             align = 'right',
             style = 'margin-top:16px; padding-right:16px; padding-bottom:0px',
             actionButton(
               inputId = ns('delete_model'),
               label = 'Delete',
               icon = icon("minus-circle"),
               class = 'btn-danger'
             ),
             actionButton(
               inputId = ns('make_active'),
               label = 'Make active',
               icon = icon("chevron-right")
             ),
             actionButton(
               inputId = ns('tabulate'),
               label = 'Tabulate',
               icon = icon("table")
             ),
             shinySaveButton(
               id = ns('save_model'),
               label = 'Save GLM',
               title = 'Save GLM model as .rds',
               filename = "",
               filetype = list(txt="rds"),
               icon = icon('upload'),
               style = 'color: #fff; background-color: #4bb03c; border-color: #3e6e37; text-align: left',
               viewtype = "detail"
             ),
             shinySaveButton(
               id = ns('save_GlimmaR_models'),
               label = 'Save all GlimmaR models',
               title = 'Save all GlimmaR models as .rds',
               filename = "",
               filetype = list(txt="rds"),
               icon = icon('upload'),
               style = 'color: #fff; background-color: #4bb03c; border-color: #3e6e37; text-align: left',
               viewtype = "detail"
             )
      )
    ),
    fluidRow(
      column(width = 12,
             align = 'right',
             DTOutput(ns('model_summary')
             )
      )
    ),
    fluidRow(
      column(
        width = 12,
        fluidRow(
          column(
            width = 5,
            h3('Importances')
          ),
          column(
            width = 7,
            align = 'right',
            div(
              style = 'margin-top:16px; margin-bottom:-16px',
              radioGroupButtons(
                inputId = ns('importance_type'),
                label = NULL,
                choices = c('Features','Terms'),
                selected = 'Features'
              )
            )
          )
        ),
        DTOutput(ns('importance_summary'))
      )
    )
  )
}
    
#' navigateGlimmaR Server Functions
#'
#' @importFrom DT styleEqual
#'
#' @noRd 
mod_GlimmaR_navigate_server <- function(id, d, response, weight, feature_spec, GlimmaR_models, GlimmaR_idx, tabulated_models){
  moduleServer( id, function(input, output, session){
    ns <- session$ns
    observeEvent(GlimmaR_models(), {
      output$model_summary <- renderDT({
        # model summary table
        if(length(GlimmaR_models())==0){
          model_summary <- NULL
        } else {
          model_summary <- GlimmaR_model_summary(GlimmaR_models())
        }
        if(is.null(model_summary)){
          model_summary <- data.table(name='No GLMs built')
        }
          model_summary |>
          DT::datatable(rownames= FALSE,
                        extensions = 'Buttons',
                        selection=list(mode="multiple", target="row"),
                        options = list(pageLength = nrow(model_summary),
                                       #initComplete = JS("function(settings, json) {$(this.api().table().header()).css({'font-size' : '12px'});}"),
                                       dom = 'Bfrt',
                                       scrollX = T,
                                       scrollY = 'calc(20vh)',
                                       searchHighlight=TRUE,
                                       buttons =
                                         list('colvis', 'copy', list(
                                           extend = 'collection',
                                           buttons = list(list(extend='csv',filename = ''),
                                                          list(extend='excel',filename = ''),
                                                          list(extend='pdf',filename= '')),
                                           text = 'Download')
                                         )
                        )
          ) |>
          formatStyle(columns = colnames(model_summary), lineHeight='0%', fontSize = '14px') |>
          formatStyle(columns = 'name', target='row', backgroundColor = styleEqual(GlimmaR_idx(), rgb(100/255,180/255,220/255)))
      })
    })
    observeEvent(c(GlimmaR_models(), GlimmaR_idx()), {
      output$importance_summary <- renderDT({
        # model summary table
        if(!is.null(GlimmaR_models()) & !is.null(GlimmaR_idx()) & length(GlimmaR_models())>0){
          if(input$importance_type=='Terms'){
            dt <- GlimmaR_models()[[GlimmaR_idx()]]$importances[[1]]
          } else {
            dt <- GlimmaR_models()[[GlimmaR_idx()]]$importances[[2]]
          }
        } else {
          dt <- data.table('V1'='No GLMs')
        }
        imp_table <- dt |>
          DT::datatable(rownames= FALSE,
                        extensions = 'Buttons',
                        selection='single',
                        options = list(pageLength = nrow(dt),
                                       #initComplete = JS("function(settings, json) {$(this.api().table().header()).css({'font-size' : '12px'});}"),
                                       dom = 'rt',
                                       scrollX = T,
                                       scrollY = 'calc(80vh - 380px)',
                                       searchHighlight=TRUE
                                       )
          ) |>
          formatStyle(columns = colnames(dt), lineHeight='0%', fontSize = '14px')
        if(length(GlimmaR_models())>0){
          imp_table <- imp_table |> formatRound('importance', 4)
        }
      })
    })
    observeEvent(input$delete_model, {
      rows_selected <- input$model_summary_rows_selected
      if(!is.null(rows_selected)){
        new_list <- GlimmaR_models()[-rows_selected]
        GlimmaR_models(new_list)
      } else {
        confirmSweetAlert(session = session,
                          type = 'error',
                          inputId = "temp",
                          title = 'No GLMs selected',
                          btn_labels = c('OK')
        )
      }
    })
    observeEvent(input$make_active, {
      rows_selected <- input$model_summary_rows_selected
      if(!is.null(rows_selected)){
        if(length(rows_selected)>1){
          rows_selected <- rows_selected[1]
        }
        GlimmaR_idx(names(GlimmaR_models())[rows_selected])
      } else {
        confirmSweetAlert(session = session,
                          type = 'error',
                          inputId = "temp",
                          title = 'No GLMs selected',
                          btn_labels = c('OK')
        )
      }
    })
    observeEvent(input$tabulate, {
      rows_selected <- input$model_summary_rows_selected
      if(!is.null(rows_selected)){
        if(length(rows_selected)>1){
          rows_selected <- rows_selected[1]
        }
        rows_selected
        if(length(GlimmaR_models())>0){
          # QUESTION - need to check for multiple selections
          model_name <- names(GlimmaR_models())[rows_selected]
          g <- GlimmaR_models()[[model_name]]
          if(!is.null(g)){
            base_risk <- get_base_risk(d(), feature_spec(), g$weight)
              tabulated_model <- NULL
              glm_term_summary <- summarise_glm_vars(g$glm)
              if(!is.null(glm_term_summary)){
                var_terms <- split(glm_term_summary, by ='vars', keep.by = FALSE)
              } else {
                var_terms <- NULL
              }
              tabulations <- prepare_glm_tabulations(d()[g$pred_rows], var_terms, feature_spec())
              tabulations <- predict_on_tabulations(tabulations, g$glm, var_terms)
              tabulations_adj <- adjust_base_levels(tabulations, feature_spec())
              predictions <- predict_tabulations(d()[g$pred_rows], tabulations_adj, feature_spec())
              predictions <- cbind(predictions, glm_fitted = g$predictions)
              # calculate the sd between glm prediction and tabulated prediction
              sd_error <- sd(link_function(predictions$tabulated_glm, g$link)-g$predictions)
              # save the tabulations into the GlimmaR model
              temp <- GlimmaR_models()
              temp[[model_name]][['tabulations']] <- tabulations_adj
              temp[[model_name]][['tabulated_predictions']] <- predictions
              temp[[model_name]][['tabulated_error']] <- sd_error
              GlimmaR_models(temp)
              # save into list
              temp <- tabulated_models()
              temp[[g$name]] <- tabulated_model
              tabulated_models(temp)
          }
        }
      }
    })
    observe({
      volumes <- c('working directory' = getwd(), 'home' = fs::path_home())
      shinyFileSave(input, 'save_model', roots=volumes, session=session)
      fileinfo <- parseSavePath(volumes, input$save_model)
      isolate({
        if (nrow(fileinfo) > 0) {
          # leave only part of glm_model object needed for prediction
          # otherwise file will be huge (it retains data used to fit model)
          stripped_model <- GlimmaR_models()[[GlimmaR_idx()]]$glm
          saveRDS(stripped_model, file = fileinfo$datapath, compress = TRUE)
          confirmSweetAlert(session = session,
                            type = 'success',
                            inputId = "temp",
                            title = 'GLM saved',
                            btn_labels = c('OK')
          )
        }
      })
    })
    observe({
      volumes <- c('working directory' = getwd(), 'home' = fs::path_home())
      shinyFileSave(input, 'save_GlimmaR_models', roots=volumes, session=session)
      fileinfo <- parseSavePath(volumes, input$save_GlimmaR_models)
      isolate({
        if (nrow(fileinfo) > 0) {
          # leave only part of glm_model object needed for prediction
          # otherwise file will be huge (it retains data used to fit model)
          if(length(GlimmaR_models())>0){
            saveRDS(GlimmaR_models(), file = fileinfo$datapath, compress = TRUE)
            confirmSweetAlert(session = session,
                              type = 'success',
                              inputId = "temp",
                              title = 'GlimmaR models saved',
                              btn_labels = c('OK')
            )
          } else {
            confirmSweetAlert(session = session,
                              type = 'error',
                              inputId = "temp",
                              title = 'No GLMs built',
                              btn_labels = c('OK')
            )
          }
        }
      })
    })
  })
}

GlimmaR_model_summary <- function(GlimmaR_models){
  # takes the key info from the BoostaR_models
  # and makes a summary data table
  rows <- lapply(GlimmaR_models, GlimmaR_model_summary_row)
  rbindlist(rows)
}
GlimmaR_model_summary_row <- function(GlimmaR_model){
  x <- data.table(name =  GlimmaR_model$name,
                  time = round(GlimmaR_model$time, 1),
                  data = GlimmaR_model$dataset_name,
                  train = GlimmaR_model$training_data,
                  response = GlimmaR_model$response,
                  weight = GlimmaR_model$weight,
                  obj = GlimmaR_model$objective,
                  terms = GlimmaR_model$num_terms,
                  dev = signif(GlimmaR_model$deviance, 6),
                  AIC = signif(GlimmaR_model$AIC, 6),
                  dispersion = signif(GlimmaR_model$dispersion, 6),
                  NAs = GlimmaR_model$count_NAs
  )
  return(x)
}

get_base_risk <- function(d, feature_spec, weight){
  base_risk <- d[1,]
  if(weight!='N'){
    base_risk[[weight]] <- 1
  }
  for (col in names(base_risk)){
    base_level <- feature_spec$base_level[feature_spec$feature==col]
    if(length(base_level)>0){
      if(is.na(base_level)){
        # do nothing
      } else {
        if(base_level!=''){
          if(inherits(base_risk[[col]],'factor')){
            base_risk[[col]] <- base_level
          } else {
            base_risk[[col]] <- as.numeric(base_level)
          }
        }
      }
    }
  }
  return(base_risk)
}
export_model <- function(dat, model, base_risk, collapse, type, feature_spec = NULL, format, weight, response, fitted = NULL, model_name = NULL){
  # function converts a glm model into rectangular tables
  # get a list of the factor in the base_risk and whether they are present in the model
  grouping_var <- NULL
  grouping_var_1 <- NULL
  grouping_var_2 <- NULL
  factors_in_base_risk <- as.data.frame(all.vars(model$formula))
  factors_in_base_risk[,1] <- as.character(factors_in_base_risk[,1])
  factors_in_base_risk$id <- 1:nrow(factors_in_base_risk)
  names(factors_in_base_risk)[1] <- 'var1'
  # get model terms and append on any offset terms (which can be found in variables)
  model_terms <- attr(stats::terms(model), 'term.labels')
  # get any terms with an offset, removing the weight term if present
  model_terms_offset <- as.character(attr(stats::terms(model), 'variables'))
  model_terms_offset <- model_terms_offset[grep('offset\\(', model_terms_offset)]
  model_terms <- c(model_terms, model_terms_offset)
  model_terms_formulae <- paste('y ~', model_terms)
  variable_list <- data.frame('var1'=character(0), 'var2'=character (0))
  if(length(model_terms)>0){
    for (i in 1:length(model_terms_formulae)){
      variables <- all.vars(stats::as.formula(model_terms_formulae[i]))
      if (length(variables)==2){
        # simple factor
        variable_list <- rbind(variable_list, data.frame('var1'=variables[2],'var2'=''))
      } else if (length(variables)==3) {
        # two way interaction
        variable_list <- rbind(variable_list, data.frame('var1'=variables[2],'var2'=variables[3]))
      }
    }
  }
  # make the list unique and remove the weight if present
  variable_list <- unique(variable_list)
  variable_list <- variable_list[variable_list$var1!=weight,]
  # create dataframes containing the factors and interactions present in the model
  factors <- variable_list[variable_list$var2=='',]
  interactions <- variable_list[variable_list$var2!='',]
  factors[,1] <- as.character(factors[,1])
  interactions[,1] <- as.character(interactions[,1])
  interactions[,2] <- as.character(interactions[,2])
  # shuffle order in factors to make closer to original formula
  factors  <- merge(factors, factors_in_base_risk, by = 'var1', all.x = TRUE)
  factors <- factors[order(factors$id),]
  factors$id <- NULL
  # create a list to contain the model tables, extra 1 for the base level
  number_of_tables <- 1 + nrow(factors) + nrow(interactions)
  table_list <- vector('list', number_of_tables)
  # get the base level for the model and put it into the first element of the list
  if(model$family$link == 'logit'){type = 'link'} # can't export response for binomial
  base_level <- stats::predict(model, newdata = base_risk, type = type)
  # if long format, need extra information for the base level row
  if(format %in% c('long')){
    if(weight=='N'){
      wts <- rep(1,nrow(model$data))
    } else {
      wts <- model$data[[weight]]
    }
    # dat_subset <- data.frame(weight = wts, observed = model$y, fitted = model$fitted.values)
    # summary_tot <- dat_subset %>%
    #   dplyr::summarise_at(c('observed','fitted','weight'), sum, na.rm = TRUE) %>%
    #   dplyr::mutate_at(c('observed','fitted'), ~./ weight)
    
    # DATA.TABLE replacement
    cols <- c('observed','fitted')
    summary_tot <- data.table(weight = wts, observed = model$y, fitted = model$fitted.values)
    summary_tot <- summary_tot[, lapply(.SD, sum), .SDcols = c(cols,'weight')]
    summary_tot[, observed :=  observed/weight]
    summary_tot[, fitted :=  fitted/weight]
    if(is.null(model_name)) model_name <- paste(sep='_', response, weight)
    # include an operation signifier in the factor1 column for the base level table
    # strip out the base level
    if(type == 'link' | model$family$link == 'identity'){
      operation <- 'sum'
    } else if (model$family$link == 'log'){
      operation <- 'product'
    } else {
      # any other link function won't have a sensible response so revert to link function
      # no matter what the user asked for
      operation <- 'sum'
      type <- 'link'
    }
    base_table <- cbind(model_name=model_name,
                        factor1='ratebook_operation',
                        factor1_levels=operation,
                        factor2='',
                        factor2_levels='',
                        summary_tot,
                        model_relativity = base_level)
  } else {
    base_table <- data.frame(model_relativity = base_level)
  }
  table_list[[1]] <- list(table = base_table, name = 'base level')
  # create one way tables
  num_1D_tables <- number_of_tables - nrow(interactions) - 1
  # cumulative base adjustment for long format tables
  # adjust relativities so they average to 1.000
  if(type == 'link' | model$family$link == 'identity'){
    long_format_adj <- 0.0
  } else if (model$family$link == 'log'){
    long_format_adj <- 1.0
  } else {
    long_format_adj <- 0.0
  }
  for (i in 1:num_1D_tables){
    if(num_1D_tables==0) next
    factor_name <- factors[i,1]
    expanded_factor <- return_banded_levels(factor_name, feature_spec, FALSE, dat)
    # could change next line to only involve features that are present in the model, rather than the whole row
    dummy_risks <- base_risk[rep(1,each=nrow(expanded_factor$expanded_factor)),]
    dummy_risks[,factor_name] <- expanded_factor$expanded_factor_for_prediction
    predictions <- stats::predict(model, newdata = dummy_risks, type = type)
    # strip out the base level
    if(type == 'link' | model$family$link == 'identity'){
      predictions <- predictions - base_level
    } else if (model$family$link == 'log'){
      predictions <- predictions / base_level
    } else {
      # any other link function won't have a sensible response so revert to link function
      predictions <- predictions - base_level
    }
    # rounding
    predictions <- signif(predictions, 6)
    predictions[abs(predictions)<0.0000001] <- 0
    export_table <- setDT(cbind(expanded_factor$expanded_factor, predictions))
    colnames(export_table)[1] <- factor_name
    colnames(export_table)[2] <- 'model_relativity'
    if(format %in% c('long')){
      # append on KPI columns for
      # number of observations, weight, fitted, observed, model relativity
      # first band the feature if it is numeric
      print(factor_name)
      if(is.numeric(dat[[factor_name]])){
        banding <- feature_spec$banding[feature_spec$feature==factor_name]
        banding_min <- feature_spec$min[feature_spec$feature==factor_name]
        banding_max <- feature_spec$max[feature_spec$feature==factor_name]
        if(!is.na(banding)){
          if(length(banding)>0){
            if(banding=='') banding <- numeric(0)
          }
        } else {
          banding <- numeric(0)
        }
        if(length(banding)==0){
          # guess the bandings
          banding <- banding_guesser(dat[[factor_name]])
          banding_min <- min(dat[[factor_name]], na.rm = TRUE)
          banding_max <- max(dat[[factor_name]], na.rm = TRUE)
          # round down or up as needed
          banding_min <- floor(banding_min/banding)*banding
          banding_max <- ceiling(banding_max/banding)*banding
        }
        # apply same rounding as used in table creation to ensure merge can proceed
        grouped <- floor(model$data[[factor_name]]/banding)*banding
        grouped <- round(pmax(banding_min, pmin(banding_max, grouped)),6)
      } else {
        # categorical feature - just use raw data column
        grouped <- model$data[[factor_name]]
      }
      # extract actual and fitted from glm_model
      #dat_subset <- data.frame(grouping_var = dat[['grouping_var']], weight = dat[[weight]], observed = glm_model()$y, fitted = glm_model()$fitted.values)
      if(weight=='N'){
        wts <- rep(1,nrow(model$data))
      } else {
        wts <- model$data[[weight]]
      }
      
      # data.table replacement
      dat_subset <- data.table(grouping_var = grouped, weight = wts, observed = model$y, fitted = model$fitted.values)
      summary <- dat_subset[, lapply(.SD, sum), by = grouping_var, .SDcols = c('observed', 'fitted', 'weight')]
      summary[, observed := observed/weight]
      summary[, fitted := fitted/weight]
      
      # merge summary onto the rating table
      export_table <- merge(export_table, summary, by.x = factor_name, by.y = 'grouping_var', all.x = TRUE)
      if(format=='long norm'){
        # adjust relativities so they average to 1.000
        if(type == 'link' | model$family$link == 'identity'){
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity - average_relativity
          long_format_adj <- long_format_adj + average_relativity
        } else if (model$family$link == 'log'){
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity / average_relativity
          long_format_adj <- long_format_adj * average_relativity
        } else {
          # any other link function won't have a sensible response so revert to link function
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity - average_relativity
          long_format_adj <- long_format_adj + average_relativity
        }
      }
      # swap around the position of the value column (2nd column) and rename
      export_table <- export_table[, c(1,3,4,5,2)]
      names(export_table)[5] <- 'model_relativity'
      # create a summary row at the bottom
      # this is a bit long-winded, but a useful check that the relativities average to 1.000
      export_table$weighted_relativity <- export_table$model_relativity * export_table$weight
      export_table$weighted_observed <- export_table$observed * export_table$weight
      export_table$weighted_fitted <- export_table$fitted * export_table$weight
      
      # data.table replacement
      cols <- c('weighted_observed','weighted_fitted','weight','weighted_relativity')
      summary_tot <- export_table[, lapply(.SD, sum, na.rm = TRUE), .SDcols = cols]
      cols <- c('weighted_observed','weighted_fitted','weighted_relativity')
      summary_tot[, (cols) := lapply(.SD, '/', weight), .SDcols = cols]
      summary_tot <- cbind('', summary_tot)
      export_table$weighted_relativity <- NULL
      export_table$weighted_observed <- NULL
      export_table$weighted_fitted <- NULL
      # bind the summary row to the overall table
      names(summary_tot) <- names(export_table)
      export_table <- rbind(summary_tot, export_table)
      # rename columns and create a blank column for factor 2
      names(export_table)[1] <- 'factor1_levels'
      export_table$factor2_levels <- ''
      # bind the factor name
      export_table <- cbind('factor1'=rep(factor_name, nrow(export_table)),
                            'factor2'=rep('',nrow(export_table)),
                            export_table)
      export_table <- export_table[, c(1,3,2,8,4,5,6,7)]
      # bind a model name row if one has been supplied
      if(!is.null(model_name)){
        export_table <- cbind('model_name'=rep(model_name, nrow(export_table)), export_table)
      }
    }
    table_list[[i+1]] <- list(table = export_table, name = factor_name)
  }
  # create two way tables
  for (i in 1:nrow(interactions)){
    if (nrow(interactions)==0) next
    factor1_name <- interactions[i,1]
    factor2_name <- interactions[i,2]
    expanded_factor_1 <- return_banded_levels(factor1_name, feature_spec, FALSE, dat)
    expanded_factor_2 <- return_banded_levels(factor2_name, feature_spec, FALSE, dat)
    expanded_factor_1$expanded_factor <- as.vector(expanded_factor_1$expanded_factor[[1]]) # expand.grid didn't like data frames, so make vector
    expanded_factor_2$expanded_factor <- as.vector(expanded_factor_2$expanded_factor[[1]])
    expanded_factor_1$expanded_factor_for_prediction <- as.vector(expanded_factor_1$expanded_factor_for_prediction[[1]])
    expanded_factor_2$expanded_factor_for_prediction <- as.vector(expanded_factor_2$expanded_factor_for_prediction[[1]])
    expanded_factor <- expand.grid(expanded_factor_1$expanded_factor, expanded_factor_2$expanded_factor)
    expanded_factor_for_prediction <- expand.grid(expanded_factor_1$expanded_factor_for_prediction, expanded_factor_2$expanded_factor_for_prediction)
    dummy_risks <- base_risk[rep(1,each=nrow(expanded_factor_for_prediction)),]
    dummy_risks1 <- dummy_risks
    dummy_risks2 <- dummy_risks
    dummy_risks[,factor1_name] <- expanded_factor[,1]
    dummy_risks[,factor2_name] <- expanded_factor[,2]
    dummy_risks1[,factor1_name] <- expanded_factor[,1]
    dummy_risks2[,factor2_name] <- expanded_factor[,2]
    predictions <- stats::predict(model, newdata = dummy_risks, type = type)
    predictions1 <- stats::predict(model, newdata = dummy_risks1, type = type)
    predictions2 <- stats::predict(model, newdata = dummy_risks2, type = type)
    if (collapse){
      if(type == 'link' | model$family$link == 'identity'){
        predictions <- predictions - base_level
      } else if (model$family$link == 'log'){
        predictions <- predictions / base_level
      } else {
        # any other link function won't have a sensible response so revert to link function
        predictions <- predictions - base_level
      }
    } else {
      if(type == 'link'| model$family$link == 'identity'){
        predictions <- predictions + base_level - predictions1 - predictions2
      } else if (model$family$link == 'log'){
        predictions <- (base_level * predictions) / (predictions1 * predictions2)
      } else {
        predictions <- predictions + base_level - predictions1 - predictions2
      }
    }
    # rounding and set numbers v close to zero to zero
    predictions <- signif(predictions, 6)
    predictions[abs(predictions)<0.0000001] <- 0
    export_table <- as.data.table(cbind(expanded_factor, predictions))
    colnames(export_table)[1] <- factor1_name
    colnames(export_table)[2] <- factor2_name
    colnames(export_table)[3] <- 'model_relativity'
    if(format=='solo'){
      # 2 dimensional table
      export_table <- dcast(export_table, export_table[[factor1_name]] ~ export_table[[factor2_name]], value.var = c('model_relativity'), fun.aggreagte=sum)
      colnames(export_table)[1] <- paste(factor1_name, '__X__', factor2_name, sep = '')
    } else if (format %in% c('long')){
      # long table with two columns for features
      # export_table is in correct format, we need to append KPI columns for
      # number of observations, weight, fitted, observed, model relativity
      # band feature 1 if numeric
      if(is.numeric(dat[[factor1_name]])){
        banding_min_1 <- feature_spec$min[feature_spec$feature==factor1_name]
        banding_max_1 <- feature_spec$max[feature_spec$feature==factor1_name]
        banding_1 <- feature_spec$banding[feature_spec$feature==factor1_name]
        if(length(banding_1)==0 | banding_1=='') banding_1 <- NA
        if(is.na(banding_1)){
          # guess the bandings
          banding_1 <- banding_guesser(dat[[factor1_name]])
          banding_min_1 <- min(dat[[factor1_name]], na.rm = TRUE)
          banding_max_1 <- max(dat[[factor1_name]], na.rm = TRUE)
        }
        dat$grouping_var_1 <- floor(model$data[[factor1_name]]/banding_1)*banding_1
        dat$grouping_var_1 <- pmax(banding_min_1, pmin(banding_max_1, dat$grouping_var_1))
      } else {
        # categorical feature - just use raw data column
        dat$grouping_var_1 <- model$data[[factor1_name]]
      }
      # band feature 2 if numeric
      if(is.numeric(dat[[factor2_name]])){
        banding_min_2 <- feature_spec$min[feature_spec$feature==factor2_name]
        banding_max_2 <- feature_spec$max[feature_spec$feature==factor2_name]
        banding_2 <- feature_spec$banding[feature_spec$feature==factor2_name]
        if(length(banding_2)==0 | banding_2=='') banding_2 <- NA
        if(is.na(banding_2)){
          # guess the bandings
          banding_2 <- banding_guesser(dat[[factor2_name]])
          banding_min_2 <- min(dat[[factor2_name]], na.rm = TRUE)
          banding_max_2 <- max(dat[[factor2_name]], na.rm = TRUE)
        }
        dat$grouping_var_2 <- floor(model$data[[factor2_name]]/banding_2)*banding_2
        dat$grouping_var_2 <- pmax(banding_min_2, pmin(banding_max_2, dat$grouping_var_2))
      } else {
        # categorical feature - just use raw data column
        dat$grouping_var_2 <- model$data[[factor2_name]]
      }
      # extract actual and fitted from glm_model
      if(weight=='N'){
        wts <- rep(1, nrow(dat))
      } else {
        wts <- dat[[weight]]
      }
      
      # data.table version
      observed <- NULL
      dat_subset <- data.table(grouping_var_1 = dat[['grouping_var_1']], grouping_var_2 = dat[['grouping_var_2']], weight = wts, observed = model$y, fitted = model$fitted.values)
      summary <- dat_subset[, lapply(.SD, sum), by = c('grouping_var_1','grouping_var_2'), .SDcols = c('observed', 'fitted', 'weight')]
      cols <- c('observed', 'fitted')
      summary <- summary[, (cols) := lapply(.SD, '/', weight), .SDcols = cols]
      
      # apply some rounding
      # this is probably not ideal
      if(is.numeric(summary$grouping_var_1)){
        summary$grouping_var_1 <- signif(summary$grouping_var_1,6)
      }
      if(is.numeric(summary$grouping_var_2)){
        summary$grouping_var_2 <- signif(summary$grouping_var_2,6)
      }
      # merge summary onto the rating table
      export_table <- merge(export_table, summary, by.x = c(factor1_name, factor2_name), by.y = c('grouping_var_1', 'grouping_var_2'), all.x = TRUE)
      # adjust relativities so they average to 1.000
      if(format=='long norm'){
        if(type == 'link' | model$family$link == 'identity'){
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity - average_relativity
          long_format_adj <- long_format_adj + average_relativity
        } else if (model$family$link == 'log'){
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity / average_relativity
          long_format_adj <- long_format_adj * average_relativity
        } else {
          # any other link function won't have a sensible response so revert to link function
          average_relativity <- sum(export_table$model_relativity*export_table$weight, na.rm =TRUE) / sum(export_table$weight, na.rm = TRUE)
          export_table$model_relativity <- export_table$model_relativity - average_relativity
          long_format_adj <- long_format_adj + average_relativity
        }
      }
      # swap around the position of the value column (2nd column) and rename
      export_table <- export_table[, c(1,2,4,5,6,3)]
      names(export_table)[1] <- 'factor1_levels'
      names(export_table)[2] <- 'factor2_levels'
      names(export_table)[6] <- 'model_relativity'
      # create a summary row at the bottom
      # this is a bit long-winded, but a useful check that the relativities average to 1.000
      export_table$weighted_relativity <- export_table$model_relativity * export_table$weight
      export_table$weighted_observed <- export_table$observed * export_table$weight
      export_table$weighted_fitted <- export_table$fitted * export_table$weight
      # summary_tot <- export_table %>%
      #   dplyr::summarise_at(c('weighted_observed','weighted_fitted','weight','weighted_relativity'), sum, na.rm = TRUE) %>%
      #   dplyr::mutate_at(c('weighted_observed','weighted_fitted','weighted_relativity'), ~./ weight)
      
      # data.table version
      cols <- c('weighted_observed','weighted_fitted','weighted_relativity')
      summary_tot <- export_table[, lapply(.SD, sum, na.rm = TRUE), .SDcols = c(cols, 'weight')]
      summary_tot[, (cols) := lapply(.SD, '/', weight), .SDcols = cols]
      setcolorder(summary_tot, c('weighted_observed','weighted_fitted','weight','weighted_relativity'))
      summary_tot <- cbind('','', summary_tot)
      export_table$weighted_relativity <- NULL
      export_table$weighted_observed <- NULL
      export_table$weighted_fitted <- NULL
      # bind the summary row to the overall table
      names(summary_tot) <- names(export_table)
      export_table <- rbind(summary_tot, export_table)
      # bind the factor name
      export_table <- cbind('factor1'=rep(factor1_name, nrow(export_table)),
                            'factor2'=rep(factor2_name, nrow(export_table)),
                            export_table)
      # reorder columns yet again
      export_table <- export_table[, c(1,3,2,4,5,6,7,8)]
      # bind a model name row if one has been supplied
      if(!is.null(model_name)){
        export_table <- cbind('model_name'=rep(model_name, nrow(export_table)), export_table)
      }
    }
    table_list[[1 + nrow(factors) + i]] <- list(table = export_table, name = paste(sep = '__X__', factor1_name, factor2_name))
  }
  # adjust the long format base level if neccesary
  # adjust relativities so they average to 1.000
  if(format %in% c('long')){
    if(type == 'link' | model$family$link == 'identity'){
      table_list[[1]]$table$model_relativity <- table_list[[1]]$table$model_relativity + long_format_adj
    } else if (model$family$link == 'log'){
      table_list[[1]]$table$model_relativity <- table_list[[1]]$table$model_relativity * long_format_adj
    } else {
      # any other link function won't have a sensible response so revert to link function
      table_list[[1]]$table$model_relativity <-table_list[[1]]$table$model_relativity + long_format_adj
    }
  }
  # return list from function, include slot for the format of the tables
  table_names <- sapply(table_list, '[[', 'name')
  names(table_list) <- table_names
  table_list$format <- format
  return(table_list)
}
return_banded_levels <- function(factor_name, feature_spec, use_mid_point_column, dat){
  
  # if the feature spec contains banding/min/max then use those
  # otherwise make a guess as to a good banding level
  banding_min <- NA
  banding_max <- NA
  banding <- NA
  use_mid_point <- FALSE
  if(!is.null(feature_spec) & is.numeric(dat[[factor_name]]) & ('banding' %in% names(feature_spec))){
    banding_min <- feature_spec$min[feature_spec$feature==factor_name]
    banding_max <- feature_spec$max[feature_spec$feature==factor_name]
    banding <- feature_spec$banding[feature_spec$feature==factor_name]
    use_mid_point <- feature_spec$use_mid_point[feature_spec$feature==factor_name]
    if(length(banding_min)==0) banding_min <- NA
    if(length(banding_max)==0) banding_max <- NA
    if(length(banding)==0) banding <- NA
    if(length(use_mid_point)==0){
      use_mid_point <- FALSE
    } else if(is.na(use_mid_point)){
      use_mid_point <- FALSE
    } else if(inherits(use_mid_point,'character')){
      if(use_mid_point=='TRUE') use_mid_point <- TRUE
    }
    
    # override use_mid_point if
    if(use_mid_point_column=='No'){
      use_mid_point <- FALSE
    }
    
  }
  
  # set the banding level if not using feature spec or banding level not found
  if(is.null(feature_spec) | is.na(banding) | banding==''){
    # guess the banding level if numeric
    if(is.numeric(dat[[factor_name]])){
      banding <- banding_guesser(dat[[factor_name]])
    }
  }
  
  # set the min if not using feature spec or banding level not found
  if(is.null(feature_spec) | is.na(banding_min) | banding_min==''){
    if(is.numeric(dat[[factor_name]])){
      banding_min <- min(dat[[factor_name]], na.rm = TRUE)
    }
  }
  
  # set the max if not using feature spec or banding level not found
  if(is.null(feature_spec) | is.na(banding_max) | banding_max==''){
    if(is.numeric(dat[[factor_name]])){
      banding_max <- max(dat[[factor_name]], na.rm = TRUE)
    }
  }
  
  # create levels we want to predict on
  if(is.numeric(dat[[factor_name]])){
    # in case feature spec saved as characters
    banding <- as.numeric(banding)
    banding_min <- as.numeric(banding_min)
    banding_max <- as.numeric(banding_max)
    # simple sequence - code below makes sure end points are correct
    banding_min <- floor(banding_min/banding)*banding
    banding_max_temp <- floor(banding_max/banding)*banding
    if(banding_max_temp==banding_max){
      banding_max <- banding_max_temp
    } else {
      banding_max <- banding_max_temp + banding
    }
    
    # expand out the feature
    # apply some rounding to make merging on data easier later
    expanded_factor <- round(seq(banding_min, banding_max, banding),6)
    expanded_factor <- as.data.frame(expanded_factor)
    
    # if use_mid_point is TRUE then predict on mid-point not start of band
    if(use_mid_point){
      expanded_factor_for_prediction <- expanded_factor + banding/2
    } else {
      expanded_factor_for_prediction <- expanded_factor
    }
    
  } else {
    # every level in the dataset for non-numeric features
    expanded_factor <- as.data.frame(levels(dat[[factor_name]]))
    expanded_factor_for_prediction <- expanded_factor
  }
  
  list(expanded_factor = expanded_factor, expanded_factor_for_prediction = expanded_factor_for_prediction)
  
}

#' @import stats
summarise_glm_vars <- function(glm){
  # extract the terms
  term.labels <- attr(terms.formula(glm$formula), "term.labels")
  # extract the offsets
  all_vars <- as.character(attr(stats::terms(glm), 'variables'))
  offsets <- all_vars[grep('offset\\(', all_vars)]
  # add them to the term.labels
  term.labels <- c(term.labels, offsets)
  if(length(term.labels)>0){
    term.labels.form <- paste0('~', term.labels)
    # extract the vars in each term
    vars <- term.labels.form |>
      lapply(formula) |>              # make into a formula
      lapply(all.vars) |>             # extract vars from formula
      lapply(sort) |>                 # alphabetical sort
      lapply(paste0, collapse = '|')  # collapse into string
    # return data.table
    data.table(
      terms = term.labels,
      vars = unlist(vars)
    )
  }
}
uncentered_terms_new <- function(object, newdata, terms, na.action = na.pass, ...) {
  offset_term <- ifelse(length(grep('offset(', terms, fixed = TRUE))>0,T,F)
  if(offset_term){
    # evaluate the offset directly
    # QUESTION FOR GHH - need to split out offset terms
    begin <- min(unlist(gregexpr('(', terms, fixed = TRUE))) + 1
    end <- max(unlist(gregexpr(')', terms, fixed = TRUE))) - 1
    form <- substr(terms,begin,end)
    predictor <- newdata[, eval(parse(text = form))]
    result <- data.table(predictor)
    setnames(result, terms)
  } else {
    # use model matrix
    # get all the terms
    tt <- terms(object)
    n_terms <- length(attr(tt, 'term.labels'))
    
    # keep only the terms we are going to use
    component_terms <- terms[grep(':', terms)]
    if(length(component_terms)>0){
      component_terms <- unique(unlist(strsplit(component_terms, ':')))
    }
    component_terms <- c(component_terms, terms)
    component_terms <- intersect(component_terms, attr(tt, 'term.labels'))
    keep <- which(attr(tt, 'term.labels') %in% component_terms)
    drop_terms <- setdiff(1:n_terms, keep)
    if(length(drop_terms)>0){
      tt <- drop.terms2(tt, drop_terms)
    }
    tt <- delete.response(tt)
    predvars <- attr(tt, "predvars")
    vars <- attr(tt, 'variables')
    if (!is.null(predvars)) {
      predvars_char <- as.character(predvars)
      vars_char <- as.character(vars)
      offset_loc <- grep('offset(', predvars_char, fixed = TRUE)
      if(length(offset_loc)>0){
        attr(tt, "predvars") <- predvars[-offset_loc]
      }
      offset_loc <- grep('offset(', vars_char, fixed = TRUE)
      if(length(offset_loc)>0){
        attr(tt, "variables") <- vars[-offset_loc]
      }
    }
    
    n_terms <- length(terms)
    attr(tt, '.Environment') <- environment() # makes splines package available as loaded by lucidum
    m <- model.frame(tt, newdata, na.action = na.action, xlev = object$xlevels, drop.unused.levels = TRUE)
    if (!is.null(cl <- attr(tt, "dataClasses"))) .checkMFClasses(cl, m)
    new_form <- as.formula(paste0('~',paste0(terms, collapse = '+')))
    X <- model.matrix(new_form, m, contrasts.arg = object$contrasts)
    aa <- attr(X, "assign")
    
    hasintercept <- attr(tt, "intercept") > 0L
    if (hasintercept){
      ll <- c("(Intercept)", terms)
      keep <- c(1,keep+1)
      n_terms <- n_terms + 1
    }
    aaa <- factor(aa, labels = ll)
    asgn <- split(order(aa), aaa)
    
    # keep the coefficients we need
    coeff_names <- dimnames(X)[[2]]
    # NEW- populate beta
    beta <- vector('list', length = length(coeff_names))
    names(beta) <- coeff_names
    beta[1:length(beta)] <- 0
    for(i in 1:length(beta)){
      if(names(beta)[i] %in% names(object$coefficients)){
        beta[i] <- object$coefficients[names(beta)[i]]
      }
    }
    beta <- unlist(beta)
    beta[is.na(beta)] <- 0 # glm returns NA for coefficients it drops from the model - zero ensures no contribution
    predictor <- matrix(ncol = n_terms, nrow = NROW(X))
    for(i in seq.int(1L, n_terms, length.out = n_terms)) {
      ii <- asgn[[i]]
      predictor[, i] <- X[, ii, drop = FALSE] %*% beta[ii]
    }
    result <- data.table(predictor)
    setnames(result, levels(aaa))
  }
  return(result)
}
uncentered_vars <- function(uncentered_terms, var_terms){
  uc_vars <- setNames(data.table(matrix(nrow = nrow(uncentered_terms), ncol = length(var_terms))), names(var_terms))
  for(i in 1:length(var_terms)){
    cols <- var_terms[[i]]$terms
    if(all(cols %in% names(uncentered_terms))){
      uc_vars[,i] <- uncentered_terms[,rowSums(.SD),.SDcols=cols]
    }
  }
  uc_vars
}
feature_banding <- function(d, feature_col, feature_spec){
  if(inherits(d[[feature_col]], 'factor')){
    # drop levels needed if model has been built on subset of data
    # that does not contain all of the levels of the parent dataset
    levels(droplevels(d[[feature_col]]))
  } else if (inherits(d[[feature_col]], c('integer','numeric'))){
    f_min <- feature_spec[feature==feature_col,'min'][[1]]
    f_max <- feature_spec[feature==feature_col,'max'][[1]]
    f_banding <- feature_spec[feature==feature_col,'banding'][[1]]
    # in case the feature spec is character cols
    f_min <- as.numeric(f_min)
    f_max <- as.numeric(f_max)
    f_banding <- as.numeric(f_banding)
    # check for NAs or empties
    # and derive banding from raw data if so
    got_banding <- T
    if(any(length(f_min)==0,length(f_max)==0,length(f_banding)==0)) got_banding <- F
    if(any(is.na(f_min),is.na(f_max),is.na(f_banding))) got_banding <- F
    if(!got_banding){
      f_min <- min(d[[feature_col]], na.rm = TRUE)
      f_max <- max(d[[feature_col]], na.rm = TRUE)
      f_banding <- banding_guesser(d[[feature_col]])
      f_min <- floor(f_min/f_banding)*f_banding
      f_max <- floor(f_max/f_banding)*f_banding + f_banding
    }
    round(seq(from = f_min, to = f_max, by = f_banding), 6)
  }
}
position <- function(input_attr, test_attr){
  # calculate the positions of each element of input_attr within test_attr
  input_char <- as.character(input_attr)
  test_attr <- as.character(test_attr)
  which(test_attr %in% input_char)
}
drop.terms2 <- function (termobj, dropx = NULL, keep.response = FALSE){
  # copy of drop.terms from glm to correctly handle predvars and dataClasses
  if (is.null(dropx)) 
    termobj
  else {
    if (!inherits(termobj, "terms")) 
      stop(gettextf("'termobj' must be a object of class %s", dQuote("terms")), domain = NA)
    newformula <- reformulate(attr(termobj, "term.labels")[-dropx], 
                              response = if (keep.response) 
                                termobj[[2L]], intercept = attr(termobj, "intercept"), 
                              env = environment(termobj))
    result <- terms(newformula, specials = names(attr(termobj, "specials")))
    response <- attr(termobj, "response")
    dropOpt <- if (response && !keep.response) 
      c(response, dropx + length(response))
    else dropx + max(response)
    loc <- position(attr(termobj, 'term.labels')[dropx], attr(termobj, 'var'))
    # extract the individual term labels in termobj
    # stripping down interaction terms (that use ":")
    tl <- attr(termobj, 'term.labels')[-dropx]
    int_tl <- tl[grep(':', tl)]
    if(length(int_tl)>0){
      int_tl <- unique(unlist(strsplit(int_tl, ':')))
    }
    tl <- unique(c(tl,int_tl))
    
    # find where the term labels are in attr(termobj, 'var')
    # and keep only those in predvars and dataClasses
    keep_loc <- position(tl, attr(termobj, 'var'))
    if (!is.null(predvars <- attr(termobj, "predvars"))) {
      attr(result, "predvars") <- predvars[c(1,keep_loc)]
    }
    if (!is.null(dataClasses <- attr(termobj, "dataClasses"))) {
      attr(result, "dataClasses") <- dataClasses[(keep_loc-1)] 
    }
    result
  }
}
prepare_glm_tabulations <- function(dt, var_terms, feature_spec){
  tabulations <- list()
  tabulations[['base']] <- data.table(base=0L)
  if(!is.null(var_terms)){
    for(i in 1:length(var_terms)){
      # get the vars and terms
      terms <- var_terms[[i]][[1]]
      vars <- unlist(strsplit(names(var_terms)[[i]], '|', fixed = TRUE))
      banded_vars <- list()
      for(j in 1:length(vars)){
        banded_vars[[vars[j]]] <- feature_banding(dt, vars[j], feature_spec)
      }
      # create data.table with all combinations of banded_vars
      expanded_vars <- expand.grid(banded_vars)
      tabulations[[names(var_terms)[i]]] <- setDT(expanded_vars)
    }
  }
  return(tabulations)
}
predict_on_tabulations <- function(tabulations, glm, var_terms){
  # first table is the base level
  # pick up the glm intercept
  if('(Intercept)' %in% names(glm$coefficients)){
    tabulations[['base']][, base:= glm$coefficients[['(Intercept)']]]
  } else {
    tabulations[['base']][, base := 0]
  }
  if(!is.null(var_terms)){
    for(i in 2:length(tabulations)){
      predictions <- uncentered_terms_new(glm, newdata = tabulations[[i]], terms = var_terms[[i-1]]$terms)
      if('(Intercept)' %in% names(predictions)){
        predictions[, `(Intercept)`:=NULL]
      }
      total <- rowSums(predictions)
      tabulations[[i]] <- cbind(tabulations[[i]], predictions, tabulated_glm = total)
    }
  }
  return(tabulations)
}
adjust_base_levels <- function(tabulations, feature_spec){
  cumulative_adjustment <- 0
  if(length(tabulations)>1){
    for(i in 2:length(tabulations)){
      # get the base levels for the table
      vars <- unlist(strsplit(names(tabulations)[i], '|', fixed = TRUE))
      base_levels <- tabulations[[i]][1,1:length(vars)]
      all_present <- TRUE
      for(j in 1:length(vars)){
        b <- feature_spec[feature==vars[j], base_level]
        if(!inherits(tabulations[[i]][[j]], c('character','factor'))){
          b <- as.numeric(b)
        }
        if(shiny::isTruthy(b)){
          base_levels[[j]][1] <- b
        } else {
          base_levels[[j]][1] <- NA
          all_present <- FALSE
        }
      }
      # identify the base level row in tabulations
      if(all_present){
        setkeyv(tabulations[[i]], vars)
        setkeyv(base_levels, vars)
        adjustment <- tabulations[[i]][base_levels]$tabulated_glm
      } else {
        adjustment <- 0
      }
      tabulations[[i]][['tabulated_glm']] <- tabulations[[i]][['tabulated_glm']] - adjustment
      cumulative_adjustment <- cumulative_adjustment + adjustment
    }
  }
  # adjust the base level
  tabulations[['base']] <- tabulations[['base']] + cumulative_adjustment
  return(tabulations)
}
band_var_with_feature_spec <- function(x, var_name, feature_spec){
  if(inherits(x, c('character','factor'))){
    x
  } else {
    got_banding <- T
    if(is.null(feature_spec)){
      got_banding <- F
    } else {
      f_min <- feature_spec$min[feature_spec$feature==var_name]
      f_max <- feature_spec$max[feature_spec$feature==var_name]
      f_banding <- feature_spec$banding[feature_spec$feature==var_name]
      f_min <- as.numeric(f_min)
      f_max <- as.numeric(f_max)
      f_banding <- as.numeric(f_banding)
      if(any(length(f_min)==0,length(f_max)==0,length(f_banding)==0)) got_banding <- F
      if(any(is.na(f_min),is.na(f_max),is.na(f_banding))) got_banding <- F
    }
    if(!got_banding){
      # derive banding from raw data
      f_min <- min(x)
      f_max <- max(x)
      f_banding <- banding_guesser(x)
      f_min <- floor(f_min/f_banding)*f_banding
      f_max <- floor(f_max/f_banding)*f_banding + f_banding
    }
    # band and apply min/max
    x <- floor(x/f_banding)*f_banding
    # apply same rounding as used in table creation to ensure merge can proceed
    x <- pmax(f_min, pmin(f_max, x))
    x <- round(x, 6)
  }
  return(x)
}
predict_tabulations <- function(dt, tabulations, feature_spec){
  # matrix to hold the predictions for each table in tablulations
  # and each row in dt
  predictions <- matrix(data=NA, nrow=nrow(dt),ncol=length(tabulations))
  # base level is the same for every row
  predictions[,1] <- tabulations[[1]][['base']]
  if(length(tabulations)>1){
    # loop over the remaining tables
    withProgress(message = 'GlimmaR', detail = 'tabulating', {
      for(i in 2:length(tabulations)){
        setProgress(
          value = i/length(tabulations),
          detail = paste0('tabulating ',
                          names(tabulations)[[i]], # name of the table
                          ' (',
                          prod(dim(tabulations[[i]])), # number of cells in the table
                          ' cells)')
        )
        vars <- unlist(strsplit(names(tabulations)[[i]], '|', fixed = TRUE))
        dt_var_cols <- dt[, ..vars]
        # band numerical columns
        for(v in vars){
          x_banded <- band_var_with_feature_spec(dt_var_cols[[v]],v,feature_spec)
          dt_var_cols[, (v):= x_banded]
        }
        # create index col so can reorder after merge
        dt_var_cols[, row_idx_temp := 1:.N]
        # now values are banded, we can
        # merge tabulation onto dt_var_cols
        setkeyv(dt_var_cols, vars)
        setkeyv(tabulations[[i]], vars)
        merged <- tabulations[[i]][dt_var_cols]
        setorder(merged, 'row_idx_temp')
        predictions[,i] <- merged$tabulated_glm
      }
    })
  }
  predictions_dt <- data.table(predictions)
  setnames(predictions_dt, names(tabulations))
  predictions_dt[, tabulated_glm := rowSums(predictions_dt)]
  return(predictions_dt)
}
extract_offsets <- function(glm){
  tt <- glm$terms
  offset_idx <- attr(tt, 'offset')
  as.character(attr(glm$terms, 'predvar')[offset_idx+1])
}
predict_offsets <- function(dt, offsets){
  results <- matrix(data=NA, nrow=nrow(dt), ncol = length(offsets))
  for(i in 1:length(offsets)){
    results[,i] <- dt[, eval(parse(text = offsets[i]))]
  }
  results <- data.table(results)
  setnames(results, offsets)
  return(results)
}
offset_importances <- function(dt, offsets){
  results <- data.table(names = offsets, importance = 0)
  for(i in 1:length(offsets)){
    x <- dt[, eval(parse(text = offsets[i]))]
    results[i,importance := sd(x)]
  }
  return(results)
}
calc_terms_importances <- function(glm){
  offsets <- extract_offsets(glm)
  terms_predictions <- predict(glm, type = 'terms')
  glm_term_summary <- summarise_glm_vars(glm)
  if(!is.null(glm_term_summary)){
    var_terms <- split(glm_term_summary, by ='vars', keep.by = FALSE)
  } else {
    var_terms <- NULL
  }
  if(length(offsets)>0){
    offsets_predictions <- predict_offsets(glm$data, offsets)
    terms_offset_predictions <- cbind(terms_predictions, offsets_predictions)
  } else {
    terms_offset_predictions <- data.table(terms_predictions)
  }
  if(ncol(terms_offset_predictions)>0){
    terms_importances <- apply(terms_offset_predictions, 2, sd)
    terms_importances <- data.table(names = names(terms_importances), importance = terms_importances)[order(-importance)]
    var_importances <- uncentered_vars(terms_offset_predictions, var_terms)
    var_importances <- apply(var_importances, 2, sd)
    var_importances <- data.table(names = names(var_importances), importance = var_importances)[order(-importance)]
  } else {
    terms_importances <- NA
    var_importances <- NA
  }
  return(list(terms=terms_importances, vars = var_importances))
}
SpeckledJim2/lucidum documentation built on Jan. 26, 2025, 11:03 a.m.