R/apollo_sharesTest.R
In apollo: Tools for Choice Model Estimation and Application

Documented in apollo_sharesTest

#' Compares predicted and observed shares
#' 
#' Comparing the shares predicted by the model with the shares observed in the data, and conducts statistical tests.
#' 
#' This is an auxiliary function to help guide the definition of utility functions in a choice model. 
#' By comparing the predicted and observed shares of alternatives for different categories of the data, 
#' it is possible to identify what additional explanatory variables could improve the fit of the model.
#' @param model Model object. Estimated model object as returned by function \link{apollo_estimate}.
#' @param apollo_probabilities Function. Returns probabilities of the model to be estimated. Must receive three arguments:
#'                          \itemize{
#'                            \item \strong{\code{apollo_beta}}: Named numeric vector. Names and values of model parameters.
#'                            \item \strong{\code{apollo_inputs}}: List containing options of the model. See \link{apollo_validateInputs}.
#'                            \item \strong{\code{functionality}}: Character. Can be either \strong{\code{"components"}}, \strong{\code{"conditionals"}}, \strong{\code{"estimate"}} (default), \strong{\code{"gradient"}}, \strong{\code{"output"}}, \strong{\code{"prediction"}}, \strong{\code{"preprocess"}}, \strong{\code{"raw"}}, \strong{\code{"report"}}, \strong{\code{"shares_LL"}}, \strong{\code{"validate"}} or \strong{\code{"zero_LL"}}.
#'                          }
#' @param apollo_inputs List grouping most common inputs. Created by function \link{apollo_validateInputs}.
#' @param sharesTest_settings List. Contains settings for this function. User input is required for all settings except those with a default or marked as optional. 
#'                            \itemize{
#'                              \item \strong{\code{alternatives}}: Named numeric vector. Names of alternatives and their corresponding value in \code{choiceVar}.
#'                              \item \strong{\code{choiceVar}}: Numeric vector. Contains choices for all observations. It will usually be a column from the database. Values are defined in \code{alternatives}.
#'                              \item \strong{\code{modelComponent}}: Name of model component. Set to model by default.
#'                              \item \strong{\code{newAlts}}: Optional list describing the new alternatives to be used by apollo_sharesTest. This should have as many elements as new alternatives, with each entry being a matrix of 0-1 entries, with one row per observation, and one column per alternative used in the model.
#'                              \item \strong{\code{newAltsOnly}}: Boolean. If TRUE, results will only be printed for the 'new' alternatives defined in newAlts, not the original alternatives used in the model. Set to FALSE by default.
#'                              \item \strong{\code{subsamples}}: Named list of boolean vectors. Each element of the list defines whether a given observation belongs to a given subsample (e.g. by sociodemographics).
#'                            }
#' @return Nothing
#' @export
apollo_sharesTest=function(model, apollo_probabilities, apollo_inputs, sharesTest_settings){
  if(is.null(sharesTest_settings[["alternatives"]])) stop("SYNTAX ISSUE - The sharesTest_settings list needs to include an object called \"alternatives\"!")
  if(is.null(sharesTest_settings[["choiceVar"]])) stop("SYNTAX ISSUE - The sharesTest_settings list needs to include an object called \"choiceVar\"!")
  if(is.null(sharesTest_settings[["subsamples"]])) sharesTest_settings[["subsamples"]]=NA
  if(is.null(sharesTest_settings[["modelComponent"]])) sharesTest_settings$modelComponent="model"
  if(is.null(sharesTest_settings[["newAlts"]])) sharesTest_settings$newAlts=NULL  ### 31 Oct
  if(is.null(sharesTest_settings[["newAltsOnly"]])) sharesTest_settings$newAltsOnly=FALSE  ### 31 Oct
  
  alternatives = sharesTest_settings[["alternatives"]]
  choiceVar    = sharesTest_settings[["choiceVar"]] 
  subsamples   = sharesTest_settings[["subsamples"]]
  newAlts      = sharesTest_settings[["newAlts"]]      ### 31 Oct
  newAltsOnly  = sharesTest_settings[["newAltsOnly"]]  ### 31 Oct
  
  predictedShares = apollo_prediction(model, apollo_probabilities, apollo_inputs, 
                                      prediction_settings=list(modelComponent=sharesTest_settings$modelComponent,silent=TRUE))
  
  ### 31 Oct
  if(!is.null(newAlts)){
   nAlts = ncol(predictedShares) - 3
   nObs  = nrow(predictedShares)
   if(any(lapply(newAlts,nrow)!=nObs) ) stop("SYNTAX ISSUE - Some components in sharesTestSettings$newAlts do not have number of rows equal to that in the database!")
   if(any(lapply(newAlts,ncol)!=nAlts)) stop("SYNTAX ISSUE - Some components in sharesTestSettings$newAlts do not have number of columns equal to the number of alternatives in the model!")
  }
  
  #### predictedShares=predictedShares[,-ncol(predictedShares)] ### removed
  predictedShares = predictedShares[,!colnames(predictedShares)%in%c("ID","Observation","chosen")] #### NEW
  
  ### SH new lines 4/4. Sort out full sample here
  if(length(subsamples)==1 && is.na(subsamples)) subsamples=list("All data"=rep(1,length(choiceVar)))
  
  ### If there are any NA in predicted Shares, then it assumes these come from rows
  ### and removes them from the analysis
  if(anyNA(predictedShares)){
    rows <- !is.na(rowSums(predictedShares))
    choiceVar <- choiceVar[rows]
    subsamples <- lapply(subsamples, function(x) x[rows])
    predictedShares <- predictedShares[rows,]
    apollo_print("Predicted values contain NA. This could be due to using the rows setting in the model. These observations will be ommited from the analysis.", type="w")
  }
  
  # 18/02/2021 Made this conditional, as it is no longer needed due to predictedShares being a data.frame
  if(is.matrix(predictedShares)){
    xnames = colnames(predictedShares)
    predictedShares = split(predictedShares, rep(1:ncol(predictedShares), each = nrow(predictedShares)))
    names(predictedShares) = xnames
    rm(xnames)
  }
  
  categories = subsamples

  ### Check that values in 'categories' are either 0/1 or boolean
  isValid <- function(x){
    ux <- unique(x)
    if(all(ux %in% 0:1) || is.logical(ux)) return(TRUE)
    return(FALSE)
  }
  txt <- "SYNTAX ISSUE - Subsamples must be defined by logical (boolean) or dummy (0/1) variables."
  if(is.data.frame(categories) || is.list(categories)){
    if(!all(sapply(categories, isValid))) stop(txt)
  }
  if(is.array(categories)) if(!isValid(as.vector(categories))) stop(txt)
  if(!is.list(categories) && is.vector(categories)) if(!isValid(categories)) stop(txt)
  
  ### Calculate shares
  trueShares = list()
  for(i in 1:length(alternatives)) trueShares[[names(alternatives)[i]]] <- (choiceVar==alternatives[i])
  
  # SH changes 4/4/20
  #if(anyNA(categories)){
  #  categories=list()
  #  categories[["All data"]]=rep(1,length(trueShares[[1]]))
  #} else {
    if(!all(names(alternatives) %in% names(predictedShares))) stop("SYNTAX ISSUE - Predicted choice probabilities should be provided for all alternatives.")
    if(any(lapply(categories,sum)==0)) stop("INPUT ISSUE - Some subsamples are empty!)")
    if(!any(lapply(categories,sum)==length(trueShares[[1]])) & !any(lapply(categories,length)==1)) categories[["All data"]]=rep(1,length(trueShares[[1]]))
  #}
  
  out <- list()
  
  cat("\nRunning share prediction tests\n")
  if(!newAltsOnly){  ## 31 Oct
    if(!is.null(newAlts)) cat("\nPart 1: alternatives as used in model")
    iterations=length(categories)
    for(j in 1:iterations){
      output = matrix(0,nrow=6,ncol=length(trueShares)+1)  
      colnames(output) = c(names(trueShares),"All")
      rownames(output) = c("Times chosen (data)","Times chosen (prediction)","SD prediction",
                           "Diff (prediction-data)","t-ratio","p-val")
      for(k in 1:length(trueShares)){
        temp1 = as.data.frame(cbind(trueShares[[k]],categories[[j]]))
        temp2 = as.data.frame(cbind(predictedShares[[k]],categories[[j]]))
        output[1,k] = colSums(temp1[temp1$V2==1,])[1]
        output[2,k] = colSums(temp2[temp2$V2==1,])[1]
        output[3,k] = sqrt(sum(temp2[temp2$V2==1,1]*(1-temp2[temp2$V2==1,1])))
        output[4,k] = output[2,k]-output[1,k]
        if(output[1,k]==output[2,k]){
          output[5,k]=NA
          output[6,k]=NA
        } else {
          output[5,k]=(output[2,k]-output[1,k])/output[3,k]
          output[6,k]=round(2*(1-stats::pnorm(abs(output[5,k]))),3)  
        }
      }
      output[1,(length(trueShares)+1)]=sum(output[1,(1:length(trueShares))])
      output[2,(length(trueShares)+1)]=sum(output[2,(1:length(trueShares))])
      output[3,(length(trueShares)+1)]=NA
      output[4,(length(trueShares)+1)]=sum(output[4,(1:length(trueShares))])
      output[5,(length(trueShares)+1)]=NA
      output[6,(length(trueShares)+1)]=NA
      cat("\nPrediction test for group: ",names(categories)[[j]]," (",sum(categories[[j]])," observations)",sep="")
      cat("\n")
      cat("\n")
      output=output[-3,]
      print(round(output,3))
      if(output[1,k+1]!=sum(categories[[j]])){
        cat("\nWARNING: the totals in the final column are not equal to the number of observations!")
        cat("\nThis is normal if you're working with only a subset of possible alternatives in the columns.\n")
      }
      out[[j]] <- output
    }
  }
  
  if(!is.null(newAlts)){
    if(!newAltsOnly) cat("\nPart 2: alternatives as defined in sharesTestSettings$newAlts")
    trueSharesNew=list()
    predictedSharesNew=list()
    for(s in 1:length(newAlts)){
      trueSharesNew[[s]]=0
      predictedSharesNew[[s]]=0
      for(j in 1:length(trueShares)){
        trueSharesNew[[s]]=trueSharesNew[[s]]+trueShares[[j]]*newAlts[[s]][,j]
        predictedSharesNew[[s]]=predictedSharesNew[[s]]+predictedShares[[j]]*newAlts[[s]][,j]
      }
    }
    names(trueSharesNew)=names(newAlts)
    names(predictedSharesNew)=names(newAlts)

    temp=Reduce("+",newAlts)
    for(j in 1:ncol(temp)){
      if(min(temp[,j])<1) cat("\nWARNING: there are cases in your data where alternative",names(alternatives)[j],"does not form part of any of the new alternatives in sharesTestSettings$newAlts")
      if(max(temp[,j])>1) cat("\nWARNING: there are cases in your data where alternative",names(alternatives)[j],"forms part of more than one of the new alternatives in sharesTestSettings$newAlts")
    }
    
    iterations=length(categories)
    for(j in 1:iterations){
      output=matrix(0,nrow=6,ncol=length(trueSharesNew)+1)  
      colnames(output)=c(names(trueSharesNew),"All")
      rownames(output)=c("Times chosen (data)","Times chosen (prediction)","SD prediction","Diff (prediction-data)","t-ratio","p-val")
      for(k in 1:length(trueSharesNew)){
        temp1=as.data.frame(cbind(trueSharesNew[[k]],categories[[j]]))
        temp2=as.data.frame(cbind(predictedSharesNew[[k]],categories[[j]]))
        output[1,k]=colSums(temp1[temp1$V2==1,])[1]
        output[2,k]=colSums(temp2[temp2$V2==1,])[1]
        output[3,k]=sqrt(sum(temp2[temp2$V2==1,1]*(1-temp2[temp2$V2==1,1])))
        output[4,k]=output[2,k]-output[1,k]
        if(output[1,k]==output[2,k]){
          output[5,k]=NA
          output[6,k]=NA
        } else {
          output[5,k]=(output[2,k]-output[1,k])/output[3,k]
          output[6,k]=round(2*(1-stats::pnorm(abs(output[5,k]))),3)  
        }
      }
      output[1,(length(trueSharesNew)+1)]=sum(output[1,(1:length(trueSharesNew))])
      output[2,(length(trueSharesNew)+1)]=sum(output[2,(1:length(trueSharesNew))])
      output[3,(length(trueSharesNew)+1)]=NA
      output[4,(length(trueSharesNew)+1)]=sum(output[4,(1:length(trueSharesNew))])
      output[5,(length(trueSharesNew)+1)]=NA
      output[6,(length(trueSharesNew)+1)]=NA
      cat("\nPrediction test for group: ",names(categories)[[j]]," (",sum(categories[[j]])," observations)",sep="")
      cat("\n")
      cat("\n")
      output=output[-3,]
      print(round(output,3))
      if(output[1,k+1]!=sum(categories[[j]])){
        cat("\nWARNING: the totals in the final column are not equal to the number of observations!")
        cat("\nThis is normal if you're working with only a subset of possible alternatives in the columns.\n")
      }
      out[[j]] <- output
    }
  
  }
  
  apollo_print("\nThe outputs from this function are also returned invisibly as an output. Calling the function via result=apollo_sharesTest(...) will save this output in an object called result (or otherwise named object).", type="i")
  return(invisible(out))
}