R/NMixPseudoGOF.default.R

Defines functions NMixPseudoGOF.default

Documented in NMixPseudoGOF.default

##
##  PURPOSE:   Pseudo goodness-of-fit test for a normal mixture
##             * default method
##
##  AUTHOR:    Arnost Komarek
##             arnost.komarek[AT]mff.cuni.cz
##
##  CREATED:   20/08/2009
##
##  FUNCTIONS: NMixPseudoGOF.default.R
##
## ==================================================================

## *************************************************************
## NMixPseudoGOF.default
## *************************************************************
NMixPseudoGOF.default <- function(x, scale, w, mu, Sigma, breaks, nbreaks=10, digits=3, ...)
{  
  ##### Dimension of the normal mixture
  ##### ===============================================
  if (is.numeric(x)) x <- data.frame(x=x)
  if (is.matrix(x)) x <- as.data.frame(x)
  if (!is.data.frame(x)) stop("x must be a data.frame")
  p <- ncol(x)
  if (p < 1) stop("length of breaks must be 1 or more")

  ##### Remove rows with NA's from data
  ##### ===============================================  
  row.NA <- apply(is.na(x), 1, sum) > 0
  x <- x[!row.NA,]
  
  ##### scale
  ##### ===============================================  
  if (missing(scale)) scale <- list(shift=rep(0, p), scale=rep(1, p))
  if (!is.list(scale)) stop("scale must be a list")
  if (length(scale) != 2) stop("scale must have 2 components")
  inscale <- names(scale)  
  iscale.shift <- match("shift", inscale, nomatch=NA)
  iscale.scale <- match("scale", inscale, nomatch=NA)
  if (is.na(iscale.shift)) stop("scale$shift is missing")
  if (length(scale$shift) == 1) scale$shift <- rep(scale$shift, p)
  if (length(scale$shift) != p) stop(paste("scale$shift must be a vector of length ", p, sep=""))    
  if (is.na(iscale.scale)) stop("scale$scale is missing")
  if (length(scale$scale) == 1) scale$scale <- rep(scale$scale, p)
  if (length(scale$scale) != p) stop(paste("scale$scale must be a vector of length ", p, sep=""))
  if (any(scale$scale <= 0)) stop("all elements of scale$scale must be positive")

  ##### Number of mixture components
  ##### ===============================================  
  K <- length(w)

  ##### Check mixture weights
  ##### ===============================================  
  if (any(w < 0) | any(w > 1)) stop("weights must lie between zero and 1")
  if (abs(sum(w) - 1) > 1e-5) warning("sum of weights differs from 1")

  ##### Check mixture means and variances
  ##### (make them numeric if p=1)
  ##### Adjust means and variances for scaling
  ##### Compute overall mean and variance of the mixture (to determine sensible break values if not given)
  ##### ====================================================================================================  
  if (p == 1){

    ### Check
    if (length(mu) != K) stop("incorrect mu")
    mu <- as.numeric(mu)
    if (is.list(Sigma)){
      if (any(sapply(Sigma, length) != 1)) stop("incorrect Sigma")
      Sigma <- unlist(Sigma)
    }
    if (length(Sigma) != K) stop("incorrect Sigma")

    ### Scale adjustment
    mu <- mu * scale$scale + scale$shift
    Sigma <- Sigma * scale$scale^2

    ### Overall mean and variance
    Emix <- sum(w * mu)
    Vmix <- sum(w * (Sigma + (mu - Emix)^2))    
  }else{

    ### Check
    if (K == 1){
      if (length(mu) != p) stop("incorrect mu")
      mu <- matrix(mu, nrow=K, ncol=p)
      if (!is.list(Sigma)) Sigma <- list(Sigma)    
    }
    if (nrow(mu) != K) stop(paste("mu must have ", K, " rows", sep=""))
    if (ncol(mu) != p) stop(paste("mu must have ", p, " columns", sep=""))
    if (any(!sapply(Sigma, is.matrix))) stop("all Sigma's must be matrices")
    if (any(sapply(Sigma, nrow) != p)) stop(paste("all Sigma's must have ", p, " rows", sep=""))
    if (any(sapply(Sigma, ncol) != p)) stop(paste("all Sigma's must have ", p, " columns", sep=""))

    ### Scale adjustment, overall mean and variance
    mu <- mu * matrix(rep(scale$scale, K), nrow=K, ncol=p, byrow=TRUE) + matrix(rep(scale$shift, K), nrow=K, ncol=p, byrow=TRUE)
    Emix <- apply(matrix(rep(w, p), nrow=K, ncol=p) * mu, 2, sum)
    Vmix <- matrix(0, nrow=p, ncol=p)
    for (k in 1:K){
      Sigma[[k]] <- diag(scale$scale) %*% Sigma[[k]] %*% diag(scale$scale)
      Vmix <- Vmix + w[k] * (Sigma[[k]] + matrix(mu[k,] - Emix, ncol=1) %*% matrix(mu[k,] - Emix, nrow=1)) 
    }
  }
  
  ##### Breaks
  ##### ===============================================  
  if (length(nbreaks) == 1) nbreaks <- rep(nbreaks, p)
  if (length(nbreaks) != p) stop(paste("nbreaks must be a vector of length ", p, sep=""))

  if (length(digits) == 1) digits <- rep(digits, p)
  if (length(digits) != p) stop(paste("digits must be a vector of length ", p, sep=""))

  csigma <- 2
  if (missing(breaks)){    
    breaks <- list()
    if (p == 1){
      rangeGrid <- Emix + c(-csigma, csigma)*sqrt(Vmix)
      if (nbreaks[1] == 1) breaks[[1]] <- Emix
      else                 breaks[[1]] <- round(seq(rangeGrid[1], rangeGrid[2], length=nbreaks), digits=digits)
    }else{     
      for (i in 1:p){
        rangeGrid <- Emix[i] + c(-csigma, csigma)*sqrt(Vmix[i, i])
        if (nbreaks[i] == 1) breaks[[i]] <- Emix[i]
        else                 breaks[[i]] <- round(seq(rangeGrid[1], rangeGrid[2], length=nbreaks[i]), digits=digits[i])
      }
    }
  }
  
  if (is.numeric(breaks)) breaks <- list(x1=breaks)  
  if (!is.list(breaks)) stop("breaks must be a list")
  if (length(breaks) != p) stop(paste("breaks must be a list of length ", p, sep=""))
  if (is.null(names(breaks))) names(breaks) <- paste("x", (1:p), sep="")  

  ##### Order breaks, add Inf, create labels of intervals, create mid-points
  ##### ===========================================================================  
  labels <- midpoints <- breaksInf <- list()
  for (i in 1:p){
    breaks[[i]] <- breaks[[i]][order(breaks[[i]])]
    if (length(breaks[[i]]) == 1){
      if (p == 1) midpoints[[i]] <- breaks[[i]] + c(-1, 1) * sqrt(Vmix)
      else        midpoints[[i]] <- breaks[[i]] + c(-1, 1) * sqrt(Vmix[i, i])
    }else{
      aver.dist <- mean(breaks[[i]][2:length(breaks[[i]])] - breaks[[i]][1:(length(breaks[[i]]) - 1)])
      mids <- 0.5 * (breaks[[i]][2:length(breaks[[i]])] + breaks[[i]][1:(length(breaks[[i]]) - 1)])
      midpoints[[i]] <- c(breaks[[i]][1] - aver.dist, mids, breaks[[i]][length(breaks[[i]])] + aver.dist)
    }  
    
    labLeft <- paste("(", c(-Inf, breaks[[i]]), sep="")
    labRight <- c(paste(breaks[[i]], "]", sep=""), "Inf)")
    labels[[i]] <- paste(labLeft, ", ", labRight, sep="")    
    breaksInf[[i]] <- c(breaks[[i]], Inf)      
  }  

  nBreak <- sapply(breaks, length)
  #if (any(nBreak <= 1)) stop("there must be at least two breaks in each margin")

  
  ##### For each observation, determine interval in which it lies
  ##### - store it in index matrix
  ##### ============================================================================  
  index <- matrix(0, nrow=nrow(x), ncol=p)
  for (i in 1:p){
    tmpx <- matrix(rep(x[,i], length(breaks[[i]])), nrow=length(x[,i]))
    tmpb <- matrix(rep(breaks[[i]], each=length(x[,i])), nrow=length(x[,i]))
    index[,i] <- apply(tmpx > tmpb, 1, sum) + 1                                   ## takes values 1, ..., nBreak[i] + 1
  }    

  ##### Compute pseudo chi-squared statistics for each univariate margin
  ##### ============================================================================  
  RESULT <- CHISQ <- list()

  F1 <- list()
  RESULT[[1]] <- list()
  NPARAM <- (K - 1) + K + K
  for (i in 1:p){
    Mean <- mu[,i]
    StdDev <- sqrt(sapply(Sigma, "[", i, i))
    F <- matrix(0, nrow=K, ncol=nBreak[i])
    for (k in 1:K){
      F[k,] <- w[k]*pnorm(breaks[[i]], mean=Mean[k], sd=StdDev[k])
    }
    F1[[i]] <- apply(F, 2, sum)
    PP <- c(F1[[i]], 1) - c(0, F1[[i]])                                      ## probabilities of bins
    
    EXPECT <- nrow(x) * PP                                                   ## expected counts
    
    TAB <- table(index[,i])
    OBSERVED <- rep(0, length(labels[[i]]))
    names(OBSERVED) <- paste(1:length(labels[[i]]))
    OBSERVED[names(TAB)] <- TAB                                              ## observed counts
    
    RESID <- (OBSERVED - EXPECT) / sqrt(EXPECT)
    if (i == 1) CHISQ[[1]] <- data.frame(ChiSq = sum(RESID^2), nBin = length(labels[[i]]), nParam = NPARAM)
    else        CHISQ[[1]] <- rbind(CHISQ[[1]], data.frame(ChiSq = sum(RESID^2), nBin = length(labels[[i]]), nParam = NPARAM))    
    RESULT[[1]][[i]] <- data.frame(Ind1 = 1:(nBreak[i] + 1), Mid1 = midpoints[[i]], Bin1 = labels[[i]], Observed = OBSERVED, Expected = EXPECT, Resid = RESID)
  }
  rownames(CHISQ[[1]]) <- names(RESULT[[1]]) <- paste(1:p)
    
  #layout(autolayout(p))
  #for (i in 1:p) plot(RESULT[[1]][[i]][,"Mid1"], abs(RESULT[[1]][[i]][,"Resid"]), type="h", col="red", lwd=2)

  ##### Compute pseudo chi-squared statistics for each bivariate margin
  ##### ====================================================================================  
  if (p > 1){
    RESULT[[2]] <- list()
    NPARAM <- (K - 1) + K * 2 + K * 3
    NAAM <- character()
    for (i in 1:(p - 1)){
      for (j in (i+1):p){
        NAAM <- c(NAAM, paste(i, "-", j, sep=""))
        nBin <- length(labels[[i]]) * length(labels[[j]])
        Bin1 <- rep(labels[[i]], length(labels[[j]]))
        Bin2 <- rep(labels[[j]], each=length(labels[[i]]))
        
        Break1 <- rep(breaks[[i]], nBreak[j])
        Break2 <- rep(breaks[[j]], each=nBreak[i])
        Grid <- cbind(Break1, Break2)
        
        ### Distribution function in verteces of rectangles
        ### ----------------------------------------------------
        FF <- rep(0, nrow(Grid))
        for (g in 1:nrow(Grid)) for (k in 1:K) FF[g] <- FF[g] + w[k] * mnormt::pmnorm(Grid[g,], mean=mu[k, c(i, j)], varcov=Sigma[[k]][c(i, j), c(i, j)])
        FF <- matrix(FF, nrow=nBreak[i], ncol=nBreak[j])

        ### Rectangle probabilities
        ### ----------------------------------------------------
        PP <- numeric(nBin)

            ## P(X1 in A, X2 <= Break2[1])
        PP[1] <- FF[1, 1]                                                                       ## P(X1<=Break1[1], X2<=Break2[1])
        if (nBreak[i] > 1) PP[2:nBreak[i]]   <- FF[2:nBreak[i], 1] - FF[1:(nBreak[i] - 1), 1]   ## First row of PP matrix with X2<=Break2[1] and finite X1
        PP[nBreak[i] + 1] <- F1[[j]][1] - FF[nBreak[i], 1]                                      ## P(X1>Break1[nBreak[i]], X2<=Break2[1])
        Filled <- nBreak[i] + 1

            ## P(X1 in A, X2 in B)
        if (nBreak[j] > 1){
          for (d2 in 2:nBreak[j]){
            PP[Filled + 1] <- FF[1, d2] - FF[1, d2 - 1]
            if (nBreak[i] > 1) PP[Filled + (2:nBreak[i])] <- FF[2:nBreak[i], d2] - FF[2:nBreak[i], d2 - 1] - FF[1:(nBreak[i] - 1), d2] + FF[1:(nBreak[i] - 1), d2 - 1]
            PP[Filled + nBreak[i] + 1] <- F1[[j]][d2] - FF[nBreak[i], d2] - F1[[j]][d2 - 1] + FF[nBreak[i], d2 - 1]
            Filled <- Filled + nBreak[i] + 1            
          }            
        }

            ## P(X1 in A, X2 > Break2[nBreak[j]]
        PP[Filled + 1] <- F1[[i]][1] - FF[1, nBreak[j]]
        if (nBreak[i] > 1) PP[Filled + (2:nBreak[i])] <- F1[[i]][2:nBreak[i]] - FF[2:nBreak[i], nBreak[j]] - F1[[i]][1:(nBreak[i] - 1)] + FF[1:(nBreak[i] - 1), nBreak[j]]
        PP[Filled + nBreak[i] + 1] <- 1 - F1[[i]][nBreak[i]] - F1[[j]][nBreak[j]] + FF[nBreak[i], nBreak[j]]

        ### Expected counts
        ### -----------------------------------------------------
        EXPECT <- nrow(x) * PP
        
        ### Observed counts
        ### -----------------------------------------------------
        TAB <- table(index[, i], index[, j])
        OBSERVED <- matrix(0, nrow=length(labels[[i]]), ncol=length(labels[[j]]))        
        rownames(OBSERVED) <- paste(1:length(labels[[i]]))
        colnames(OBSERVED) <- paste(1:length(labels[[j]]))        
        OBSERVED[rownames(TAB), colnames(TAB)] <- TAB
        OBSERVED <- as.numeric(OBSERVED)

        ### Pearson's residuals, put everything together
        ### -----------------------------------------------------
        RESID <- (OBSERVED - EXPECT) / sqrt(EXPECT)
        if (i == 1 & j == 2){
          CHISQ[[2]] <- data.frame(ChiSq = sum(RESID^2), nBin = nBin, nParam = NPARAM)
        }else{  
          CHISQ[[2]] <- rbind(CHISQ[[2]], data.frame(ChiSq = sum(RESID^2), nBin = nBin, nParam = NPARAM))
        }  
        RESULT[[2]][[paste(i, "-", j, sep="")]] <- data.frame(Ind1 = rep(1:(nBreak[i] + 1), nBreak[j] + 1),       Ind2 = rep(1:(nBreak[j] + 1), each=nBreak[i] + 1),
                                                              Mid1 = rep(midpoints[[i]], length(midpoints[[j]])), Mid2 = rep(midpoints[[j]], each=length(midpoints[[i]])),
                                                              Bin1 = rep(labels[[i]], length(labels[[j]])),       Bin2 = rep(labels[[j]], each=length(labels[[i]])),
                                                              Observed = OBSERVED, Expected = EXPECT, Resid = RESID)
        
        if (i == 1 & j == 3){
          COL <- rev(heat_hcl(33, c.=c(80, 30), l=c(30, 90), power=c(1/5, 1.3)))
          PP2 <- matrix(PP, nrow=length(labels[[i]]), ncol=length(labels[[j]]))
          RESID2 <- matrix(abs(RESID), nrow=length(labels[[i]]), ncol=length(labels[[j]]))

          layout(autolayout(3))
          par(bty="n")
          image(midpoints[[i]], midpoints[[j]], PP2, col=COL, xlab=paste("x", i, sep=""), ylab=paste("x", j, sep=""))
          image(breaks[[i]], breaks[[j]], FF, col=COL, xlab=paste("x", i, sep=""), ylab=paste("x", j, sep=""))

          IMBREAK <- seq(0.5, 5, by=0.5)
          IMCOL <- rev(heat_hcl(length(IMBREAK) + 1, c.=c(80, 30), l=c(30, 90), power=c(1/5, 1.3)))
          image(midpoints[[i]], midpoints[[j]], RESID2, col=IMCOL, xlab=paste("x", i, sep=""), ylab=paste("x", j, sep=""))          
        }
      }
    }

    rownames(CHISQ[[2]]) <- NAAM
  }  
  
  LTp <- p * (p + 1)/2

  RET <- list(result=RESULT, chisq=CHISQ)  
  return(RET)  
}

Try the mixAK package in your browser

Any scripts or data that you put into this service are public.

mixAK documentation built on Sept. 17, 2024, 1:06 a.m.