# R/utils_gof.R

#' Calculate the estimate of the W_{n}(u) process over a grid of PIT values.
#'
#' @param S score matrix with n rows and p columns.
#' @param FI Fisher information matrix with p rows and p columns.
#' @param pit a numeric vector of PIT values.
#'
#' @return a matrix with n rows and n columns
#'
#' @noRd
calculateWnuhat = function(S, FI, pit){

  # Find the number of observations
  n       <- nrow(S)

  # Compute the values of indicator function over pit values
  ind     <- outer(pit, pit, '<=')

  # Estimate the Psi vector by covariance of indicator and score
  Psi_hat <- ( (n-1) * cov(ind, S) ) / n

  # Compute the estimate of the W_{n}(u) process over the grid of PIT values
  Mat     <- ( ind - S %*% solve( FI ) %*% t(Psi_hat) )
  colnames(Mat) <- 1:n
  rownames(Mat) <- 1:n

  # Return the matrix
  return(Mat)

}
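
# A minimal illustrative sketch of how calculateWnuhat() might be called. The data,
# parameter estimates, score matrix, and Fisher information below are hypothetical
# (a normal model with its per-observation information matrix, which is an assumption)
# and only demonstrate the expected shapes of the inputs. Wrapped in `if (FALSE)` so it
# does not run when the file is sourced.
if (FALSE) {

  set.seed(1)
  x     <- rnorm(50)
  mu    <- mean(x)
  sigma <- sd(x)

  # Per-observation score vectors for (mu, sigma) of the normal model
  S   <- cbind((x - mu) / sigma^2, ((x - mu)^2 - sigma^2) / sigma^3)

  # Per-observation Fisher information of the normal model (assumed scaling)
  FI  <- matrix(c(1 / sigma^2, 0, 0, 2 / sigma^2), nrow = 2)

  # PIT values under the fitted normal model
  pit <- pnorm(x, mean = mu, sd = sigma)

  W_hat <- calculateWnuhat(S, FI, pit)
  dim(W_hat)   # 50 x 50

}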


#' Calculate the estimate of the W_{n}(u) process on a grid of equally spaced values over the (0,1) interval
#'
#' @param S score matrix with n rows and p columns.
#' @param FI Fisher information matrix with p rows and p columns.
#' @param pit a numeric vector of PIT values.
#' @param M number of equally spaced grid values over the (0,1) interval.
#'
#' @return a matrix with n rows and M columns
#'
#' @noRd
calculateWnuhat_manualGrid = function(S, FI, pit, M){

  # Find the number of observations
  n       <- nrow(S)

  # Create grid points over the (0,1) interval. A small epsilon must be added at the edges
  # because the covariance function is not defined at the endpoints; epsilon = 1e-5 is used.
  epsilon <- 1e-5
  gridpts <- seq(0 + epsilon, 1 - epsilon, length = M)

  # Compute the values of indicator function over pit values
  ind     <- outer(pit, gridpts, '<=')

  # Estimate the Psi vector by covariance of indicator and score
  Psi_hat <- ( (n-1) * cov(ind, S) ) / n

  # Compute the estimate of W_{n}(u) process over the grid (pit)
  Mat     <- ( ind - S %*% solve( FI ) %*% t(Psi_hat) )
  colnames(Mat) <- paste0('u', 1:M)
  rownames(Mat) <- 1:n

  return(Mat)

}
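
# Continuing the hypothetical inputs sketched after calculateWnuhat() above, the
# manual-grid variant evaluates the same process on M equally spaced points instead
# of the PIT values themselves (illustrative only, not run when the file is sourced).
if (FALSE) {
  W_hat_grid <- calculateWnuhat_manualGrid(S, FI, pit, M = 100)
  dim(W_hat_grid)   # 50 x 100
}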


#' Compute Eigenvalues of the covariance matrix
#'
#' @param S score matrix with n rows and p columns.
#' @param FI Fisher information matrix with p rows and p columns.
#' @param pit a numeric vector of PIT values.
#' @param me character, the goodness-of-fit statistic to use: 'cvm' (Cramer-von-Mises) or 'ad' (Anderson-Darling).
#'
#' @return a numeric vector of Eigenvalues.
#'
#' @noRd
getEigenValues = function(S, FI, pit, me){

  # Find the number of observations
  n       <- nrow(S)

  # Find the number of parameters
  p       <- ncol(S)

  # Compute the estimate of W_{n}(u) process over a grid of pit values.
  Mat     <- calculateWnuhat(S, FI, pit)

  # Compute the covariance of the estimate of W_{n}(u) process and adjust for the sample size.
  W       <- var(Mat)
  #W       <- ( (n-1) * W ) / (n)
  W       <- ( (n-1) * W ) / (n-p-1)


  # Compute b vector to adjust W matrix
  pit <- sort(pit)
  l   <- length(pit)
  b   <- numeric()
  for( j in 1:l ){

    if( j == 1){
      b[j] <- pit[2]
    }

    if( j == 2){
      b[j] <- pit[3] - pit[1]
    }

    if( (j>2) & (j<=(l-1)) ){
      b[j] <- pit[j+1] - pit[j-1]
    }

    if( j == l){
      b[j] <- 1 - pit[l-1]
    }

  }
  b   <- b / 2
  b   <- b / sum(b)

  # Create diagonal matrix with b vector elements
  Q <- diag( sqrt(b) )

  W <- Q %*% W %*% Q


  # Compute the Eigenvalues of the covariance matrix depending on the goodness-of-fit statistic
  if( me == 'cvm' ){
    #ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values / length(pit)
    ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values
    return(ev)
  }

  if( me == 'ad'){
    adj.value <- sqrt( outer( pit * (1- pit), pit * (1- pit) ) )
    W       <- W / adj.value
    #ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values / length(pit)
    ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values
    return(ev)
  }

}
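
# Illustrative sketch: with the hypothetical normal-model inputs from the
# calculateWnuhat() example above, the Eigenvalues used to approximate the null
# distribution of the statistics could be obtained as follows (not run when the
# file is sourced).
if (FALSE) {
  ev_cvm <- getEigenValues(S, FI, pit, me = 'cvm')
  ev_ad  <- getEigenValues(S, FI, pit, me = 'ad')
  head(ev_cvm)
}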


#' Compute Eigenvalues of the covariance matrix over a grid of equally spaced values
#'
#' @param S score matrix with n rows and p columns.
#' @param FI Fisher information matrix with p rows and p columns.
#' @param pit a numeric vector of PIT values.
#' @param M number of equally spaced values over (0,1) interval.
#' @param me character, the goodness-of-fit statistic to use: 'cvm' (Cramer-von-Mises) or 'ad' (Anderson-Darling).
#'
#' @return a numeric vector of Eigenvalues.
#'
#' @noRd
getEigenValues_manualGrid = function(S, FI, pit, M, me){

  # Find the number of observations
  n       <- nrow(S)

  # Find the number of parameters
  p       <- ncol(S)

  # Compute the estimate of W_{n}(u) process over a grid of values equally spaced over (0,1).
  Mat     <- calculateWnuhat_manualGrid(S, FI, pit, M)

  # Compute the covariance of the estimate of W_{n}(u) process and adjust for the sample size.
  W       <- var(Mat)
  W       <- ( (n-1) * W ) / (n-p-1)


  # Compute b vector to adjust W matrix
  pit <- sort(pit)
  l   <- length(pit)
  b   <- numeric()
  for( j in 1:l ){

    if( j == 1){
      b[j] <- pit[2]
    }

    if( j == 2){
      b[j] <- pit[3] - pit[1]
    }

    if( (j>2) & (j<=(l-1)) ){
      b[j] <- pit[j+1] - pit[j-1]
    }

    if( j == l){
      b[j] <- 1 - pit[l-1]
    }

  }
  b   <- b / 2
  b   <- b / sum(b)

  # Create diagonal matrix with b vector elements
  Q <- diag( sqrt(b) )

  W <- Q %*% W %*% Q

  # Compute the Eigenvalues of the covariance matrix depending on the goodness-of-fit statistic
  if( me == 'cvm' ){
    ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values
    return(ev)
  }

  if( me == 'ad'){
    s <- 1:M
    s <- s / (M + 1)
    adj.value <- sqrt( outer( s*(1-s) , s*(1-s) ) )
    W <- W / adj.value
    ev      <- eigen(W, symmetric = TRUE, only.values = TRUE)$values
    return(ev)
  }

}
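
# Illustrative sketch of the manual-grid variant with the same hypothetical inputs
# (M is taken equal to the sample size of the hypothetical example so the PIT-based
# weights conform; illustrative only, not run when the file is sourced).
if (FALSE) {
  ev_cvm_grid <- getEigenValues_manualGrid(S, FI, pit, M = 50, me = 'cvm')
  head(ev_cvm_grid)
}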


#' Compute the Cramer-von-Mises statistic
#'
#' @param x a numeric vector of PIT values
#'
#' @return a numeric value, the CvM statistic
#'
#' @noRd
getCvMStatistic = function(x){

  n   <- length(x)
  Z   <- sort(x)
  a   <- seq(from = 1, to = 2*n-1, by = 2)
  a   <- a/(2*n)
  res <- sum( (Z - a)^2 ) + 1/(12*n)
  return(res)

}
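
# The function above implements the standard Cramer-von-Mises formula
# W^2 = sum_i ( Z_(i) - (2i-1)/(2n) )^2 + 1/(12n). A quick illustrative check
# (not run when the file is sourced): PIT values from a correctly specified model
# are approximately Uniform(0,1), so the statistic should be small.
if (FALSE) {
  set.seed(2)
  getCvMStatistic(runif(100))
}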


#' Compute the Anderson-Darling statistic
#'
#' @param x a numeric vector of PIT values
#'
#' @return a numeric value, the AD statistic
#'
#' @noRd
getADStatistic = function(x){

  n   <- length(x)
  Z   <- sort(x)
  a   <- seq(from = 1, to = n, by = 1)
  a   <- (2 * a) - 1
  S   <- sum( a * ( log(Z) + log( 1 - rev(Z) ) ) )
  res <- (-S/n) - n
  return(res)

}
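
# The function above implements the standard Anderson-Darling formula
# A^2 = -n - (1/n) * sum_i (2i-1) * ( log Z_(i) + log(1 - Z_(n+1-i)) ),
# where rev(Z) supplies the Z_(n+1-i) term. A quick illustrative check
# (not run when the file is sourced):
if (FALSE) {
  set.seed(3)
  getADStatistic(runif(100))
}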


#' Calculate p-value for the goodness-of-fit test.
#'
#' @description The p-value is calculated with the aid of the \code{CompQuadForm} package (the \code{imhof} and \code{farebrother} functions).
#' It computes Pr(Q > u) where Q is a sum of independent Chi-squared variables weighted by the non-zero Eigenvalues.
#'
#' @param u  a numeric value, Cramer-von-Mises statistic or Anderson-Darling statistic.
#' @param eigen a numeric vector of Eigenvalues.
#'
#' @return p-value
#'
#' @noRd
getpvalue = function(u, eigen){

  # Keep only the numerically non-zero Eigenvalues
  eigen <- eigen[eigen >= 1e-15]

  # set_1 is from i=1 to J1: Eigenvalues that are not negligible relative to the largest one
  uc    <- eigen[1] / 2000
  indx  <- which( eigen >= uc )
  set_1 <- eigen[indx]

  # set_2 is from i=J1+1 to J1+J2: the remaining small Eigenvalues
  indx  <- which( eigen < uc )
  set_2 <- eigen[indx]

  # Compute the lower bound for the probability of Pr(Q > u)
  LB <- getLowerBoundForpvalue(statistic = u, lambda = eigen)

  # Compute the upper bound for the probability of Pr(Q > u)
  UB <- getUpperBoundForpvalue(statistic = u, lambda = eigen)

  if( LB >= 1e-7 ){
    # Compute p-value by the imhof method
    pvalue <- CompQuadForm::imhof(q = u - sum(set_2), lambda = set_1)$Qq

    if( (pvalue < LB) | (pvalue > UB) ){
       # The imhof method produced a p-value outside the bounds. Compute the p-value by the farebrother method instead.
       pvalue <- CompQuadForm::farebrother(q = u - sum(set_2), lambda = set_1)$Qq
    }
  }else{
    # For small lower bounds (LB < 1e-7), compute the p-value by the farebrother method directly
    pvalue <- CompQuadForm::farebrother(q = u - sum(set_2), lambda = set_1)$Qq
  }

  # Check if the computed p-value by CompQuadForm package falls between lower and upper bound.
  if( (pvalue >= LB) & (pvalue <= UB) ){
    return(pvalue)
  }else{
    message(paste0('CompQuadForm failed to generate a valid p-value. The p-value lies between ', LB, ' and ', UB))
    return(pvalue)
  }

}
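
# Illustrative end-to-end sketch combining the helpers above with the hypothetical
# normal-model inputs from the calculateWnuhat() example: compute the statistic,
# the Eigenvalues, and then the p-value. Requires the CompQuadForm package
# (not run when the file is sourced).
if (FALSE) {
  stat <- getCvMStatistic(pit)
  ev   <- getEigenValues(S, FI, pit, me = 'cvm')
  getpvalue(u = stat, eigen = ev)
}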


#' Compute lower bound for p-value
#'
#' @param statistic a numeric value, the Cramer-von-Mises or Anderson-Darling statistic
#' @param lambda a numeric vector containing Eigenvalues
#'
#' @return a numeric value
#'
#' @noRd
getLowerBoundForpvalue = function(statistic, lambda){

  term1 <- integrate(f = integrandForLowerBound, lower = 0, upper = statistic/lambda[1], ST = statistic, EV = lambda)$value
  term2 <- pchisq(q = statistic/lambda[1], df = 1, lower.tail = FALSE)
  return(term1 + term2)

}


#' Compute upper bound for p-value.
#'
#' @param statistic a numeric value, the Cramer-von-Mises or Anderson-Darling statistic.
#' @param lambda a numeric vector containing Eigenvalues.
#' @param tol the tolerance used to find the solution. The default value is 1e-10.
#' @param max.iter the maximum number of iteration to find the solution. The default value is 50.
#'
#' @return a numeric value
#'
#' @noRd
getUpperBoundForpvalue = function(statistic, lambda, tol = 1e-10, max.iter = 50){

  # Check whether the root is at t = 0; if so, return the upper bound for the p-value.
  if( sum(lambda) >= statistic ){
    root <- 0
    fval <- exp(-root * statistic - 0.5 * sum(log(1-2*root*lambda)))
    return(UB_pvalue = fval)
  }

  # Define the interval to search for the root using bisection method.
  t_LB <- 0
  t_UB <- 1/(2*max(lambda)) - 1e-6

  # Evaluate values at the edges of interval
  f.LB  <- sum( lambda / (1-2*lambda*t_LB) ) - statistic
  f.UB  <- sum( lambda / (1-2*lambda*t_UB) ) - statistic

  # m counts the number of iterations; the process stops once max.iter is reached.
  m    <- 0

  # Iterate as long as t_UB and t_LB are not close enough.
  while( abs(t_UB - t_LB) > tol) {

    # Increase iteration
    m <- m + 1

    # Check if the maximum number of iterations has been reached
    if (m > max.iter) {
      # TODO: consider returning the CompQuadForm value if the maximum is reached (check with Richard)
      root <- (t_LB + t_UB)/2
      fval <- exp(-root * statistic - 0.5 * sum(log(1-2*root*lambda)))
      message('Maximum number of iterations reached.')
      warning(paste0('The last root was ', root, ' with an upper bound of ', fval))
      break
    }

    # Find the midpoint of the interval [t_LB, t_UB] and evaluate the function there.
    t_mid <- (t_LB + t_UB)/2
    f.mid <- sum( lambda / (1-2*lambda*t_mid) ) - statistic

    # Keep the half-interval that contains the sign change (and hence the root)
    if (f.UB * f.mid < 0) {
      t_LB <- t_mid
      f.LB <- f.mid
    }else {
      t_UB <- t_mid
      f.UB <- f.mid
    }

  }

  root <- (t_LB + t_UB)/2
  fval <- exp(-root * statistic - 0.5 * sum(log(1-2*root*lambda)))

  return(UB_pvalue = fval)

}



#' Integrand function to compute lower bound
#'
#' @param t function argument
#' @param ST a numeric value, the Cramer-von-Mises or Anderson-Darling statistic.
#' @param EV a numeric vector containing Eigenvalues.
#'
#' @return a numeric value
#'
#' @noRd
integrandForLowerBound = function(t, ST, EV){
  a   <- EV[2] / EV[1]
  b   <- ST / EV[1]
  res <- pchisq(q = (b-t)/a, df = 1, lower.tail = FALSE) * dchisq(x = t, df = 1)
  return(res)
}
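
# Illustrative sketch of the bound helpers with a hypothetical Eigenvalue vector
# (not run when the file is sourced): the lower bound uses only the two largest
# Eigenvalues, while the upper bound is a Chernoff-type bound located by bisection.
if (FALSE) {
  lambda <- c(0.5, 0.2, 0.1)
  getLowerBoundForpvalue(statistic = 1.2, lambda = lambda)
  getUpperBoundForpvalue(statistic = 1.2, lambda = lambda)
}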
