# R/gof.rpm.R


#' Calculate goodness-of-fit statistics for Revealed Preference Matchings Model based on observed data
#' 
#' \code{\link{gof.rpm}} calculates goodness-of-fit statistics for a revealed preference
#' matchings model based on observed data.
#' It is typically based on the estimate from a \code{rpm()} call.
#' 
#' The function \code{\link{rpm}} is used to fit a revealed preference model
#' for matchings between men and women based on their individual characteristics
#' (or on characteristics shared with people of the opposite sex).
#' The model assumes a one-to-one stable matching using an observed set of
#' matchings and a set of (possibly dyadic) covariates to 
#' estimate the parameters for
#' linear equations of utilities.
#' It does this using a large-population likelihood based on ideas from Dagsvik (2000), Menzel (2015) and Goyal et al. (2023).
#' 
#' The model represents the dyadic utility functions as deterministic linear utility functions of
#' dyadic variables. These utility functions are functions of observed characteristics of the women
#' and men.
#' These functions are entered as terms in the function call
#' to \code{\link{rpm}}. \code{gof.rpm} assesses the goodness-of-fit of such a fitted model.
#'
#' @aliases gof.default
#' @param object list; an object of class \code{rpm} that is typically the result of a call to \code{rpm()}.
#' @param \dots Additional arguments, to be passed to lower-level functions.
#' @param empirical_p logical; (Optional) If \code{TRUE} the function returns the empirical p-value of the sample
#' statistic based on \code{control$nbootstrap} bootstrap simulations.
#' @param compare_sim string; describes which two objects are compared to compute simulated goodness-of-fit
#' statistics; valid values are \code{"sim-est"}: compares the marginal distribution of pairings in a
#' simulated sample to the \code{rpm} model estimate of the marginal distribution based on that same simulated sample;
#' \code{"mod-est"}: compares the marginal distribution of pairings in a
#' simulated sample to the \code{rpm} model estimate used to generate the sample.
#' @param control A list of control parameters for algorithm tuning. Constructed using
#' \code{\link{control.rpm}}, which should be consulted for specifics. 
#' @param reboot logical; if this is \code{TRUE}, the program will rerun the bootstrap at the coefficient values, rather than
#' expecting the object to contain a \code{bs.results} component with the bootstrap results run at the solution values.
#' The latter is the default for \code{rpm} fits.
#' @param verbose logical; if this is \code{TRUE}, the program will print out
#' additional information, including data summary statistics.
#' @return \code{\link{gof.rpm}} returns a list consisting of the following elements:
#' \item{observed_pmf}{numeric matrix giving observed probability mass distribution over different household types}
#' \item{model_pmf}{numeric matrix giving expected probability mass distribution from \code{rpm} model}
#' \item{obs_chi_sq}{the count-based observed chi-square statistic comparing the marginal distributions of pairings in
#' the data and in the model estimate}
#' \item{obs_chi_sq_cell}{the contribution to the observed chi-squared statistic by household type}
#' \item{obs_kl}{the Kullback-Leibler (KL) divergence computed by comparing the observed marginal distribution to the
#' expected marginal distribution based on the \code{rpm} model estimate}
#' \item{obs_kl_cell}{the contribution to the observed KL divergence by household type}
#' \item{obs_hellinger}{the Hellinger distance between the observed marginal distribution and the
#' expected marginal distribution based on the \code{rpm} model estimate}
#' \item{obs_hellinger_cell}{the contribution to the observed Hellinger distance by household type}
#' \item{empirical_p_chi_sq}{the proportion of simulated chi-square statistics that are greater than
#' or equal to the observed chi-square statistic}
#' \item{empirical_p_kl}{the proportion of simulated KL divergences that are greater than
#' or equal to the observed KL divergence}
#' \item{empirical_p_hellinger}{the proportion of simulated Hellinger distances that are greater than
#' or equal to the observed Hellinger distance}
#' \item{chi_sq_simulated}{vector of simulated chi-square statistics, one per bootstrap replicate}
#' \item{kl_simulated}{vector of simulated KL divergences, one per bootstrap replicate}
#' \item{chi_sq_cell_mean}{mean contribution of each household type to the simulated chi-square statistics}
#' \item{chi_sq_cell_sd}{standard deviation of the contributions of each household type to the simulated chi-square statistics}
#' \item{chi_sq_cell_median}{median contribution of each household type to the simulated chi-square statistics}
#' \item{chi_sq_cell_iqr}{interquartile range of the contributions of each household type to the simulated chi-square statistics}
#' \item{kl_cell_mean}{mean contribution of each household type to the simulated KL divergences}
#' \item{kl_cell_sd}{standard deviation of the contributions of each household type to the simulated KL divergences}
#' \item{kl_cell_median}{median contribution of each household type to the simulated KL divergences}
#' \item{kl_cell_iqr}{interquartile range of the contributions of each household type to the simulated KL divergences}
#' @keywords models
#' @examples
#' library(rpm)
#' \donttest{
#' data(fauxmatching)
#' fit <- rpm(~match("edu") + WtoM_diff("edu",3),
#'           Xdata=fauxmatching$Xdata, Zdata=fauxmatching$Zdata,
#'           X_w="X_w", Z_w="Z_w",
#'           pair_w="pair_w", pair_id="pair_id", Xid="pid", Zid="pid",
#'           sampled="sampled")
#' a <- gof(fit)
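#' # Print and plot the goodness-of-fit diagnostics
#' a
#' plot(a)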
#' }
#' @references
#'
#' Goyal, Shuchi; Handcock, Mark S.; Jackson, Heide M.; Rendall, Michael S. and Yeung, Fiona C. (2023).
#' \emph{A Practical Revealed Preference Model for Separating Preferences and Availability Effects in Marriage Formation},
#' \emph{Journal of the Royal Statistical Society}, Series A. \doi{10.1093/jrsssa/qnad031}
#'
#' Dagsvik, John K. (2000). \emph{Aggregation in Matching Markets}, \emph{International Economic Review}, Vol. 41, 27-57.
#' JSTOR: https://www.jstor.org/stable/2648822, \doi{10.1111/1468-2354.00054}
#'
#' Menzel, Konrad (2015).
#' \emph{Large Matching Markets as Two-Sided Demand Systems},
#' \emph{Econometrica}, Vol. 83, No. 3 (May 2015), 897-941. \doi{10.3982/ECTA12299}
#' @export gof
gof <- function(object, ...){
  UseMethod("gof")
}


#' @noRd
#' @importFrom utils methods
#' @export
gof.default <- function(object,...) {
  classes <- setdiff(gsub(pattern="^gof.",replacement="",as.vector(methods("gof"))), "default")
  stop("Goodness-of-Fit methods have been implemented only for objects of class: ",
       paste(sQuote(classes), collapse=", "), ".")
}

#' @describeIn gof Calculate goodness-of-fit statistics for Revealed Preference Matchings Model based on observed data
#' @export
gof.rpm <- function(object, ...,
                    empirical_p = TRUE,
                    compare_sim = 'sim-est',
                    control = object$control,
                    reboot = FALSE,
                    verbose=FALSE) 
{
  if(!is.null(control$seed)) set.seed(control$seed)
  
  model_pmf <- object$pmf_est
  observed_pmf <- object$pmf
  
  out <- list()
  out$model_pmf <- object$pmf_est
  out$observed_pmf <- object$pmf
  
  out$compare_sim <- "sim-est"
  
  # Observed Hellinger divergence
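  # H(P,Q) = sqrt(sum((sqrt(p) - sqrt(q))^2))/sqrt(2), computed cell-wise so the
  # contribution of each household type can be reported.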
  compute_hellinger <- function(observed, expected) {
    hell <- (sqrt(observed)-sqrt(expected))^2
    return(hell)
  }
  out$obs_hellinger_cell <- compute_hellinger(observed_pmf,model_pmf)
  out$obs_hellinger <- sqrt(sum(out$obs_hellinger_cell))/sqrt(2)
  
  # Observed chi-square statistic
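  # N*(observed-expected)^2/expected with a small 0.5/N smoothing term in the
  # denominator; the bottom-right cell is excluded (set to 0).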
  compute_chi_sq <- function(N,observed,expected) {
    csc <- N*(observed-expected)^2/(expected+0.5/N)
    csc[nrow(observed),ncol(observed)] = 0
    return(csc)
  }
  out$obs_chi_sq_cell <- compute_chi_sq(object$nobs,observed_pmf,model_pmf)
  out$obs_chi_sq <- sum(out$obs_chi_sq_cell)
  
  # Observed KL divergence
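  # KL(observed || expected) with the same 0.5/N smoothing; undefined (NA/NaN) cells
  # and the bottom-right cell contribute 0.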
  compute_kl <- function(N,observed, expected) {
    klc <- -(observed+0.5/N)*log((expected+0.5/N)/(observed+0.5/N))
    klc[nrow(observed),ncol(observed)] = 0
    klc[is.na(klc)|is.nan(klc)] <- 0
    return(klc)
  }
  out$obs_kl_cell <- compute_kl(object$nobs,observed_pmf,model_pmf)
  out$obs_kl <- sum(out$obs_kl_cell)
  
  if (empirical_p) {
    # If reboot=TRUE, rerun the bootstrap at the fitted coefficient values;
    # otherwise the bootstrap results stored in the object are used.
    if(reboot){
    if(object$N <= control$large.population.bootstrap){
        # Use the direct (small population) method:
        # extract the dyadic, female-specific and male-specific coefficient blocks
        NumBetaS <- dim(object$Sd)[3]
        beta_S <- object$coefficients[1:NumBetaS]

        if(dim(object$Xd)[3]>0){
          beta_w <- object$coefficients[NumBetaS+(1:object$NumBetaW)]
        }else{
          beta_w <- NULL
        }
        if(dim(object$Zd)[3]>0){
          beta_m <- object$coefficients[NumBetaS+object$NumBetaW + (1:object$NumBetaM)]
        }else{
          beta_m <- NULL
        }

        # Use the direct (small population) method
        # These are the categories of women
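        # Draw women's category counts proportional to pmfW, then trim or pad at
        # random so the total is exactly num_women; the same is done for men below.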
        Ws <- rep(seq_along(object$pmfW),round(object$pmfW*object$num_women))
        if(length(Ws) > object$num_women+0.01){
         Ws <- Ws[-sample.int(length(Ws),size=length(Ws)-object$num_women,replace=FALSE)]
        }
        if(length(Ws) < object$num_women-0.01){
         Ws <- c(Ws,sample.int(length(object$pmfW),size=object$num_women-length(Ws),prob=object$pmfW,replace=TRUE))
        }
        Ws <- Ws[sample.int(length(Ws))]
        Ms <- rep(seq_along(object$pmfM),round(object$pmfM*object$num_men))
        if(length(Ms) > object$num_men+0.01){
         Ms <- Ms[-sample.int(length(Ms),size=length(Ms)-object$num_men,replace=FALSE)]
        }
        if(length(Ms) < object$num_men-0.01){
         Ms <- c(Ms,sample.int(length(object$pmfM),size=object$num_men-length(Ms),prob=object$pmfM,replace=TRUE))
        }
        Ms <- Ms[sample.int(length(Ms))]

        # create utility matrices
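        # U_star is women x men and V_star is men x women; shared (dyadic) terms are
        # split equally (0.5 each) between the two sides.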
        U_star = matrix(0, nrow=length(Ws), ncol = length(Ms))
        V_star = matrix(0, nrow=length(Ms), ncol = length(Ws))
        for (ii in 1:NumBetaS) {
          U_star = U_star + object$Sd[Ws,Ms,ii] * beta_S[ii] * 0.5
        }
        if(!is.empty(beta_w)){
         for (ii in 1:object$NumBetaW) {
          U_star = U_star + object$Xd[Ws,Ms,ii] * beta_w[ii]
         }
        }
        for (ii in 1:NumBetaS) {
          V_star = V_star + t(object$Sd[Ws,Ms,ii]) * beta_S[ii] * 0.5
        }
        if(!is.empty(beta_m)){
         for (ii in 1:object$NumBetaM) {
          V_star = V_star + object$Zd[Ms,Ws,ii] * beta_m[ii]
         }
        }
  
        # adjust for outside option (remain single)
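        # The number of draws for the "remain single" option scales with the square
        # root of the opposite-side market size (cf. Menzel 2015).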

        Jw <- sqrt(object$num_men)
        Jm <- sqrt(object$num_women) 
    }

    num_Xu = nrow(object$Xu)
    num_Zu = nrow(object$Zu)
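    # Build composite labels from the unique female (Xu) and male (Zu) covariate
    # profiles; these are passed to the bootstrap routines to label the types.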
    cnW <- paste(colnames(object$Xu)[2],object$Xu[,2], sep=".")
    if(ncol(object$Xu)>2){
    for(i in 3:ncol(object$Xu)){
      cnW <- paste(cnW,paste(colnames(object$Xu)[i],object$Xu[,i], sep="."),sep='.')
    }}
    cnM <- paste(colnames(object$Zu)[2],object$Zu[,2], sep=".")
    if(ncol(object$Zu)>2){
    for(i in 3:ncol(object$Zu)){
      cnM <- paste(cnM,paste(colnames(object$Zu)[i],object$Zu[,i], sep="."),sep='.')
    }}

    if(control$ncores > 1){
      doFuture::registerDoFuture()
      future::plan(future::multisession, workers=control$ncores)  ## portable, including MS Windows
      if (!is.null(object$control$seed)) {
        doRNG::registerDoRNG(object$control$seed)
      }
      if(verbose) message(sprintf("Starting parallel bootstrap using %d cores.",control$ncores))
       if(object$N > control$large.population.bootstrap){
        # Use the large population bootstrap
        bs.results <-
        foreach::foreach (b=1:control$nbootstrap, .packages=c('rpm')
        ) %dorng% {
          rpm.bootstrap.large(b, object$coefficients,
                      S=object$Sd,X=object$Xd,Z=object$Zd,sampling_design=object$sampling_design,Xdata=object$Xdata,Zdata=object$Zdata,
                      sampled=object$sampled,
                      Xid=object$Xid,Zid=object$Zid,pair_id=object$pair_id,X_w=object$X_w,Z_w=object$Z_w,
                      num_sampled=object$nobs,
                      NumBeta=object$NumBeta,NumGammaW=object$NumGammaW,NumGammaM=object$NumGammaM,
                      num_Xu=num_Xu,num_Zu=num_Zu,cnW=cnW,cnM=cnM,LB=object$LB,UB=object$UB,
                      control=control)
       }
      }else{
        # Use the small population bootstrap
        bs.results <-
        foreach::foreach (b=1:control$nbootstrap, .packages=c('rpm')
        ) %dorng% {
          rpm.bootstrap.small(b, object$coefficients,
           num_women=length(Ws), num_men=length(Ms), Jw=Jw, Jm=Jm, U_star=U_star, V_star=V_star, S=object$Sd, X=object$Xd, Z=object$Zd, pmfW=object$pmfW, pmfM=object$pmfM, object$Xu, object$Zu, num_Xu, num_Zu, cnW, cnM, object$Xid, object$Zid, object$pair_id, object$sampled, object$sampling_design, object$NumBeta, object$NumGammaW, object$NumGammaM, object$LB, object$UB, control, object$nobs)
        }
      }
      if(verbose) message(sprintf("Ended parallel bootstrap\n"))
    }else{
#     Start of the non-parallel bootstrap
      bs.results <- list()
      for(i in 1:control$nbootstrap){
        if(object$N > control$large.population.bootstrap){
        bs.results[[i]] <- rpm.bootstrap.large(i, object$coefficients,
                      S=object$Sd,X=object$Xd,Z=object$Zd,sampling_design=object$sampling_design,Xdata=object$Xdata,Zdata=object$Zdata,
                      sampled=object$sampled,
                      Xid=object$Xid,Zid=object$Zid,pair_id=object$pair_id,X_w=object$X_w,Z_w=object$Z_w,
                      num_sampled=object$nobs,
                      NumBeta=object$NumBeta,NumGammaW=object$NumGammaW,NumGammaM=object$NumGammaM,
                      num_Xu=num_Xu,num_Zu=num_Zu,cnW=cnW,cnM=cnM,LB=object$LB,UB=object$UB,
                      control=control)
        }else{
        bs.results[[i]] <- rpm.bootstrap.small(i, object$coefficients,
           num_women=length(Ws), num_men=length(Ms), Jw=Jw, Jm=Jm, U_star=U_star, V_star=V_star, S=object$Sd, X=object$Xd, Z=object$Zd, pmfW=object$pmfW, pmfM=object$pmfM, object$Xu, object$Zu, num_Xu, num_Zu, cnW, cnM, object$Xid, object$Zid, object$pair_id, object$sampled, object$sampling_design, object$NumBeta, object$NumGammaW, object$NumGammaM, object$LB, object$UB, control, object$nobs)
        }
      }
    }
    }else{
      if(is.null(object$bs.results)){
        stop("Goodness-of-Fit was called with 'reboot=FALSE', but the passed object does not contain bootstrap results. Either rerun with 'reboot=TRUE' or re-fit with 'bootstrap=TRUE'.")
      }
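      # Use the bootstrap replicates stored in the fitted object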
      bs.results <- object$bs.results
      object$bs.results <- NULL
    }
    # Extract the simulated PMFs (observed and model-estimated) from the bootstrap replicates
    nsim <- length(bs.results)
    sample_sizes <- unlist(lapply(bs.results, '[[', "nobs"))
    simulated_pmf_est <- array(unlist(lapply(bs.results, '[[', "pmf_est")),dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    simulated_pmf <-     array(unlist(lapply(bs.results, '[[', "pmf")),dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    
    if(verbose){
      message(sprintf("Assessing rpm model goodness-of-fit: %s",out$compare_sim))
    }
    
    # Calculate Hellinger contribution by cell for simulated pmfs
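    # "simest" compares each simulated PMF to the estimate refit on that sample;
    # "modest" compares it to the model estimate that generated the sample; their
    # difference ("sim") is what the cell-level summaries below are based on.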
    simest_hellinger_cell <- array(NA,dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    modest_hellinger_cell <- array(NA,dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    sim_hellinger_cell <- array(NA,dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    for (it in 1:nsim) {
      simest_hellinger_cell[,,it] = compute_hellinger(simulated_pmf[,,it],simulated_pmf_est[,,it])
      modest_hellinger_cell[,,it] = compute_hellinger(simulated_pmf[,,it],model_pmf)
      sim_hellinger_cell[,,it] = simest_hellinger_cell[,,it]-modest_hellinger_cell[,,it]
    }
    
    # Calculate chi-square contribution by cell for simulated pmfs
    simest_chi_sq_cell <- array(NA, dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    modest_chi_sq_cell <- array(NA, dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    sim_chi_sq_cell <- array(NA, dim=c(nrow(object$pmf),ncol(object$pmf),nsim))
    for (it in 1:nsim) {
      simest_chi_sq_cell[,,it] = compute_chi_sq(sample_sizes[it],simulated_pmf[,,it],simulated_pmf_est[,,it])
      modest_chi_sq_cell[,,it] = compute_chi_sq(sample_sizes[it],simulated_pmf[,,it],model_pmf)
      sim_chi_sq_cell[,,it] = simest_chi_sq_cell[,,it]-modest_chi_sq_cell[,,it]
      
    }
    
    # Calculate KL contribution by cell for simulated pmfs
    simest_kl_cell <- array(NA, c(nrow(object$pmf),ncol(object$pmf),nsim))
    modest_kl_cell <- array(NA, c(nrow(object$pmf),ncol(object$pmf),nsim))
    sim_kl_cell <- array(NA, c(nrow(object$pmf),ncol(object$pmf),nsim))
    for (it in 1:nsim) {
      simest_kl_cell[,,it] <- compute_kl(sample_sizes[it],simulated_pmf[,,it],simulated_pmf_est[,,it])
      modest_kl_cell[,,it] <- compute_kl(sample_sizes[it],simulated_pmf[,,it],model_pmf)
      sim_kl_cell[,,it] <- simest_kl_cell[,,it]-modest_kl_cell[,,it]

    }
    
    # Simulated chi_sq, Hellinger and KL divergence statistics
    # This is how far the data PMF should be from the model PMF under the null
    out$hellinger_simulated <- sqrt(apply(simest_hellinger_cell, 3, sum, na.rm=TRUE))/sqrt(2)
    out$chi_sq_simulated <- apply(simest_chi_sq_cell, 3, sum, na.rm=TRUE)
    out$kl_simulated   <-   apply(simest_kl_cell,     3, sum, na.rm=TRUE)

    b <- matrix(NA,ncol=nsim,nrow=nsim)
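    # Pairwise divergences between independent bootstrap PMFs; their off-diagonal
    # distribution serves as a reference distribution for the observed-vs-bootstrap
    # divergences computed below.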
    for(it in 1:nsim){
      for(it2 in 1:nsim){
        b[it,it2] <- sqrt(sum(compute_hellinger(simulated_pmf[,,it],simulated_pmf[,,it2]), na.rm=TRUE))/sqrt(2)
      }
    }
    diag(b) <- NA
    out$sim_hellinger_bs_pmf <- apply(b,1,mean,na.rm=TRUE)
    out$obs_hellinger_bs_pmf <- apply(simulated_pmf,3,function(x){sqrt(sum(compute_hellinger(x,object$pmf), na.rm=TRUE))/sqrt(2)})
    out$empirical_p_hellinger <- mean(apply(b,1,function(x){mean(x >= out$obs_hellinger_bs_pmf,na.rm=TRUE)}))
    h <- rep(0,nsim)
    for(i in seq_along(h)){h[i] <- mean(b >= out$obs_hellinger_bs_pmf[i],na.rm=TRUE)}
    out$empirical_p_hellinger <- mean(h)
    
    for(it in 1:nsim){
      for(it2 in 1:nsim){
        b[it,it2] <- (sum(compute_chi_sq(sample_sizes[it],simulated_pmf[,,it],simulated_pmf[,,it2]), na.rm=TRUE))
      }
    }
    diag(b) <- NA
    out$sim_chi_sq_bs_pmf <- apply(b,1,mean,na.rm=TRUE)
    out$obs_chi_sq_bs_pmf <- apply(simulated_pmf,3,function(x){(sum(compute_chi_sq(object$nobs,x,object$pmf), na.rm=TRUE))})
    out$empirical_p_chi_sq <- mean(apply(b,2,function(x){mean(x >= out$obs_chi_sq_bs_pmf,na.rm=TRUE)}))
    h <- rep(0,nsim)
    for(i in seq_along(h)){h[i] <- mean(b >= out$obs_chi_sq_bs_pmf[i],na.rm=TRUE)}
    out$empirical_p_chi_sq <- mean(h)
    
    for(it in 1:nsim){
      for(it2 in 1:nsim){
        b[it,it2] <- (sum(compute_kl(sample_sizes[it],simulated_pmf[,,it],simulated_pmf[,,it2]), na.rm=TRUE))
      }
    }
    diag(b) <- NA
    out$sim_kl_bs_pmf <- apply(b,1,mean,na.rm=TRUE)
    out$obs_kl_bs_pmf <- apply(simulated_pmf,3,function(x){(sum(compute_kl(object$nobs,x,object$pmf), na.rm=TRUE))})
    out$empirical_p_kl <- mean(apply(b,2,function(x){mean(x >= out$obs_kl_bs_pmf,na.rm=TRUE)}))
    h <- rep(0,nsim)
    for(i in seq_along(h)){h[i] <- mean(b >= out$obs_kl_bs_pmf[i],na.rm=TRUE)}
    out$empirical_p_kl <- mean(h)
    
    out$kl_simulated[is.na(out$kl_simulated)]  <- 0
    out$sim_chi_sq_bs_pmf[is.na(out$sim_chi_sq_bs_pmf)]  <- 0
    out$obs_chi_sq_bs_pmf[is.na(out$obs_chi_sq_bs_pmf)]  <- 0
    # Remove the estimation-error component from the simulated divergence statistics
    out$hellinger_simulated_new <- sqrt(apply(simest_hellinger_cell, 3, sum, na.rm=TRUE))/sqrt(2)-sqrt(apply(modest_hellinger_cell, 3, sum, na.rm=TRUE))/sqrt(2)
    out$chi_sq_simulated_new <- apply(simest_chi_sq_cell, 3, sum, na.rm=TRUE)-apply(modest_chi_sq_cell, 3, sum, na.rm=TRUE)
    out$kl_simulated_new   <-   apply(simest_kl_cell,     3, sum, na.rm=TRUE)-apply(modest_kl_cell,     3, sum, na.rm=TRUE)
    out$kl_simulated_new[is.na(out$kl_simulated_new)]  <- 0
    
    # Empirical p-values:
    # the proportion of simulated divergences at least as large as the observed divergence
    out$empirical_p_hellinger <- mean(out$hellinger_simulated >= out$obs_hellinger, na.rm=TRUE)
    out$empirical_p_chi_sq <- mean(out$chi_sq_simulated >= out$obs_chi_sq, na.rm=TRUE)
    out$empirical_p_kl     <- mean(out$kl_simulated >= out$obs_kl, na.rm=TRUE)
    
    # Empirical p-values based on the error-adjusted ("_new") simulated statistics
    out$empirical_p_hellinger_new <- mean(out$hellinger_simulated_new >= out$obs_hellinger, na.rm=TRUE)
    out$empirical_p_chi_sq_new <- mean(out$chi_sq_simulated_new >= out$obs_chi_sq, na.rm=TRUE)
    out$empirical_p_kl_new     <- mean(out$kl_simulated_new >= out$obs_kl, na.rm=TRUE)
    
    # Cell summaries of contributions
    # Mean, SD, Median, IQR for Hellinger
    r_dimnames <- dimnames(observed_pmf)
    out$hellinger_cell_mean <- apply(sim_hellinger_cell, 1:2, mean, na.rm=TRUE)
    dimnames(out$hellinger_cell_mean) <- r_dimnames
    
    out$hellinger_cell_sd <- apply(sim_hellinger_cell, 1:2, stats::sd, na.rm=TRUE)
    dimnames(out$hellinger_cell_sd) <- r_dimnames    
    
    out$hellinger_cell_median <- apply(sim_hellinger_cell, 1:2, stats::median, na.rm=TRUE)
    dimnames(out$hellinger_cell_median) <- r_dimnames
    
    out$hellinger_cell_iqr <- apply(sim_hellinger_cell, 1:2, stats::IQR, na.rm=TRUE)
    dimnames(out$hellinger_cell_iqr) <- r_dimnames
    
    # Mean, SD, Median, IQR for chi-sq
    out$chi_sq_cell_mean <- apply(sim_chi_sq_cell, 1:2, mean, na.rm=TRUE)
    dimnames(out$chi_sq_cell_mean) <- r_dimnames
    
    out$chi_sq_cell_sd <- apply(sim_chi_sq_cell, 1:2, stats::sd, na.rm=TRUE)
    dimnames(out$chi_sq_cell_sd) <- r_dimnames    
    
    out$chi_sq_cell_median <- apply(sim_chi_sq_cell, 1:2, stats::median, na.rm=TRUE)
    dimnames(out$chi_sq_cell_median) <- r_dimnames
    
    out$chi_sq_cell_iqr <- apply(sim_chi_sq_cell, 1:2, stats::IQR, na.rm=TRUE)
    dimnames(out$chi_sq_cell_iqr) <- r_dimnames
    
    # Mean, SD, Median, IQR for KL
    out$kl_cell_mean <- apply(sim_kl_cell, 1:2, mean, na.rm=TRUE)
    dimnames(out$kl_cell_mean) <- r_dimnames
    
    out$kl_cell_sd <- apply(sim_kl_cell, 1:2, stats::sd, na.rm=TRUE)
    dimnames(out$kl_cell_sd) <- r_dimnames
    
    out$kl_cell_median <- apply(sim_kl_cell, 1:2, stats::median, na.rm=TRUE)
    dimnames(out$kl_cell_median) <- r_dimnames
    
    out$kl_cell_iqr <- apply(sim_kl_cell, 1:2, stats::IQR, na.rm=TRUE)
    dimnames(out$kl_cell_iqr) <- r_dimnames
  }
  
  class(out) <- "gofrpm"
  return(out)
}  

#' @method print gofrpm
#' @export
print.gofrpm <- function(x, ...){
  message(sprintf("Hellinger goodness-of-fit p-value: %f", x$empirical_p_hellinger ))
  message(sprintf("Chi-Squared goodness-of-fit p-value: %f", x$empirical_p_chi_sq ))
  message(sprintf("Kullback-Leibler goodness-of-fit p-value: %f", x$empirical_p_kl ))
  invisible()
}

###################################################################
# The <plot.gofrpm> function plots the GOF diagnostics computed by
# <gof.rpm>
#
# --PARAMETERS--
#   x          : a "gofrpm" object, as returned by <gof.rpm>
#   ...        : additional arguments passed to the native R
#                plotting functions
#   cex.axis   : the magnification of the text used in axis notation;
#                default=0.7
#   main       : the main title;
#                default="Goodness-of-fit diagnostics"
#
# --RETURNED--
#   NULL
#
###################################################################



#' @describeIn gof
#' 
#' \code{\link{plot.gofrpm}} plots diagnostics such as the distributions of the simulated
#' chi-square statistics and KL divergences, with the observed values marked, and heat maps
#' of the mean cell contributions.
#' See \code{\link{rpm}} for more information on these models.
#' 
#' @param x a list, usually an object of class \code{gofrpm} as returned by \code{gof.rpm}
#' @param cex.axis the magnification of the text used in axis notation;
#' default=0.7
#' @param main Title for the goodness-of-fit plots.
#' @keywords graphs
#' 
#' @importFrom graphics points lines mtext plot hist abline
#' @method plot gofrpm
#' @export
plot.gofrpm <- function(x, ..., 
                        cex.axis=0.7,
                        main="Goodness-of-fit diagnostics"
) {
  
  hist(x$chi_sq_simulated, xlim=range(c(x$chi_sq_simulated,x$obs_chi_sq),finite=TRUE),
       probability=TRUE,
       main=expression(paste("Distribution of simulated ",chi^2)),
       xlab=expression(chi^2),nclass=20)
  lines(stats::density(x$chi_sq_simulated),col=3)
  abline(v=x$obs_chi_sq, col=2)
  
  hist(x$kl_simulated, xlim=range(c(x$kl_simulated,x$obs_kl),finite=TRUE),
       probability=TRUE,
       main=expression(paste("Distribution of simulated ",KL)),
       xlab=expression(KL),nclass=20)
  lines(stats::density(x$kl_simulated),col=3)
  abline(v=x$obs_kl, col=2)
  
  y <- x$chi_sq_cell_mean
  # Mask the bottom-right cell, which is excluded from the statistics
  y[nrow(y),ncol(y)] <- NA
  # Heat map of mean chi-square contributions by household type
  chiHeat <- tibble(Woman = rep(rownames(y), times = ncol(y)),
                    Man   = rep(colnames(y), each  = nrow(y)),
                    Mean  = c(y)) %>%
    ggplot(mapping = aes(x = Woman, y = Man, fill = Mean)) +
    geom_tile() +
    xlab(label = "Woman's type") +
    ylab(label = "Man's type") +
    ggtitle("Mean chi-square contribution by cell") +
    scale_fill_gradient(low = "#FFFFFF", high = "#100FCE", na.value = '#000000')
  plot(chiHeat)
  
  y <- x$kl_cell_mean
  y[nrow(y),ncol(y)] <- NA
  # Heat map of mean KL contributions by household type
  klHeat <- tibble(Woman = rep(rownames(y), times = ncol(y)),
                   Man   = rep(colnames(y), each  = nrow(y)),
                   Mean  = c(y)) %>%
    ggplot(mapping = aes(x = Woman, y = Man, fill = Mean)) +
    geom_tile() +
    xlab(label = "Woman's type") +
    ylab(label = "Man's type") +
    ggtitle("Mean KL contribution by cell") +
    scale_fill_gradient2(low = "#CE0F0F",
                         mid = "#FFFFFF",
                         high = "#100FCE", midpoint = 0,
                         na.value = '#000000')
  plot(klHeat)
  
}
