R/summary.sgee.R

Defines functions summary.sgee

Documented in summary.sgee

################################################################################
##
##   R package sgee by Gregory Vaughan, Kun Chen, and Jun Yan
##   Copyright (C) 2016-2018
##
##   This file is part of the R package sgee.
##
##   The R package sgee is free software: You can redistribute it and/or
##   modify it under the terms of the GNU General Public License as published
##   by the Free Software Foundation, either version 3 of the License, or
##   any later version (at your option). See the GNU General Public License
##   at <http://www.gnu.org/licenses/> for details.
##
##   The R package sgee is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
##
################################################################################
#' Coefficient Path summary
#' 
#' Function to analyze and summarize a path of coefficient values by
#' comparing them using the prediction error on a "new" data set (or a
#' fold in cross validation), or on the original data set if no comparison
#' data is provided. The best point along the path in terms of the
#' prediction error is identified. The prediction errors for all points
#' along the path, the minimum prediction error, and the index of the
#' minimum are returned.
#'
#' The prediction error used depends on the input. If the true beta is
#' not given, then the sum of squared errors (or the MSE; see the
#' \code{averaged} parameter) in the response is used for the gaussian (and
#' other non-poisson) families; for the poisson family, if the scale (or an
#' estimate of it) is also given, then the sum of squared Pearson residuals
#' is used, otherwise the deviance is used. If the true beta is provided,
#' then the sum of squared errors in the linear predictor is used instead.
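#'
#' For instance, in the poisson case with a supplied scale \eqn{\phi}, each
#' observation contributes its squared Pearson residual
#' \deqn{(y - \hat{\mu})^2 / (V(\hat{\mu}) \phi)}
#' to the total error, where \eqn{V} is the variance function of the
#' modeling family.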
#'
#' Furthermore, when the true beta is supplied, additional model selection
#' metrics are produced, including the false positive rate, the false
#' discovery rate, and the false negative rate.
#'
#' The function is provided to allow for model selection; given a path
#' generated by a sgee function, the path can be fed into this function
#' along with a testing data set to identify an optimal point along the
#' path. Cross validation can be performed by dividing the original data
#' set into k folds beforehand, generating a coefficient path with each
#' fold held out, and applying this function to each path with the
#' held-out fold as the testing data (a rough sketch is given in the
#' examples).
#'  
#' @param object Object of class \code{sgee}, from which various path
#' information is pulled.
#' @param newX Design matrix to be used for model testing. It is assumed
#' that \code{newX} does not contain an intercept column; an intercept column
#' is appended by \code{summary.sgee} if an intercept was used to make
#' \code{object}.
#' @param newY Response vector to be used for model testing.
#' @param newOffset Vector of offsets to be used for model testing. Must be
#' the same length as \code{newY}.
#' @param trueBeta For simulation use; true coefficient values can be provided
#' to get certain metrics.
#' @param trueIntercept For simulation use; true intercept value to be used in
#' conjunction with trueBeta.
#' @param scale Scale value can be passed to allow for standardized error
#' measurements (poisson case only).
#' @param classification A numeric parameter from 0 to 1 indicating the
#' cutoff to be used to determine the classification rate in the binomial
#' setting. Default is 0.5. Values below 0 indicate that the squared error,
#' in either the observation or (if \code{trueBeta} is given) the true
#' linear predictor, is to be used instead of the classification rate.
#' @param averaged Logical parameter indicating whether the mean of the
#' total error is to be used; defaults to \code{TRUE}.
#' @param ... Currently not used.
#' 
#' @return A list containing 1) a vector of prediction errors for the
#' testing data set, 2) the smallest prediction error found along the path,
#' and 3) the index of the smallest error. If the \code{trueBeta} parameter
#' is provided, the false positive, false discovery, and false negative
#' rates and the true positive and false positive counts at the index of
#' the smallest error are also returned, along with the minimum
#' mis-classification and its corresponding index, where the
#' mis-classification is the total number of coefficients incorrectly marked
#' as important/unimportant.
#' 
#' @author Gregory Vaughan
#' @examples
#' 
#' ## Initialize covariate values
#' p <- 50 
#' beta <- c(rep(2.4,5),
#'           c(1.3, 0, 1.7, 0, .5),
#'           rep(0.5,5),
#'           rep(0,p-15))
#' groupSize <- 1
#' numGroups <- length(beta)/groupSize
#' 
#' 
#' 
#' trainingData <- genData(numClusters = 50,
#'                         clusterSize = 4,
#'                         clusterRho = 0.6,
#'                         clusterCorstr = "exchangeable",
#'                         yVariance = 1,
#'                         xVariance = 1,
#'                         numGroups = numGroups,
#'                         groupSize = groupSize,
#'                         groupRho = 0.3,
#'                         beta = beta,
#'                         family = gaussian(),
#'                         intercept = 1)
#'
#' testingData <- genData(numClusters = 50,
#'                        clusterSize = 4,
#'                        clusterRho = 0.6,
#'                        clusterCorstr = "exchangeable",
#'                        yVariance = 1,
#'                        xVariance = 1,
#'                        numGroups = numGroups,
#'                        groupSize = groupSize,
#'                        groupRho = 0.3,
#'                        beta = beta,
#'                        family = gaussian(),
#'                        intercept = 1)
#' 
#' coefMat <- see(y = trainingData$y,
#'                x = trainingData$x,
#'                family = gaussian(),
#'                clusterID = trainingData$clusterID,
#'                corstr = "exchangeable",
#'                maxIt = 200,
#'                epsilon = .1)
#'
#' analysisResults <- summary(coefMat,
#'                            newX = testingData$x,
#'                            newY = testingData$y)
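#'
#' ## A minimal follow-up sketch: pull the estimates picked out by the
#' ## summary (bestIndex indexes the steps of the path; the first column
#' ## of coefMat$path holds the intercept when one is fit)
#' bestCoefs <- coefMat$path[analysisResults$bestIndex, ]
#'
#' ## A rough k-fold cross-validation sketch (illustration only); folds
#' ## are formed at the cluster level so clusters are not split across
#' ## folds, and clusterID is assumed to take the values 1, ..., 50
#' ## produced by genData() above
#' \dontrun{
#' k <- 5
#' clusterFold <- sample(rep(1:k, length.out = 50))
#' foldID <- clusterFold[trainingData$clusterID]
#' cvError <- matrix(NA, nrow = 200, ncol = k) ## maxIt rows, one column per fold
#' for(fold in 1:k){
#'     inFold <- foldID == fold
#'     fit <- see(y = trainingData$y[!inFold],
#'                x = trainingData$x[!inFold, ],
#'                family = gaussian(),
#'                clusterID = trainingData$clusterID[!inFold],
#'                corstr = "exchangeable",
#'                maxIt = 200,
#'                epsilon = .1)
#'     cvError[, fold] <- summary(fit,
#'                                newX = trainingData$x[inFold, ],
#'                                newY = trainingData$y[inFold])$allMeasures
#' }
#' ## average across folds and pick the step with the smallest mean error
#' bestStep <- which.min(rowMeans(cvError))
#' }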
#' 
#' 
#' @export 
summary.sgee <- function(object,
                         newX = NULL,
                         newY = NULL,
                         newOffset = NULL,
                         trueBeta = NULL,
                         trueIntercept = NULL,
                         scale = NULL,
                         classification = 0.5,
                         averaged = TRUE,
                         ...){

    path <- object$path
    interceptPath <- rep(0, nrow(path))
    modelingFamily <- object$family

    
    
    it<- 0
    maxIt <- object$maxIt

    if(is.null(newX) || is.null(newY)){
        cat("\nnewX and/or newY missing; original data used\n")
        newX <- object$x
        newY <- object$y
        offset <- object$offset
    } else{
        if(!is.null(newOffset)){
            if(length(newOffset) != length(newY) && length(newOffset) != 1){
                stop("length of newY and newOffset must match") 
            } else{
                offset <- newOffset
            }
        } else{
            offset <- 0 
        }
    }
    
    addIntercept <- object$intercept
    if(addIntercept){
        interceptPath <- path[,1]
        path <- path[,-1]
        #newX <- cbind(rep(1,nrow(newX)),
        #                     newX)
    }

    if(!is.null(trueBeta)){
        if(is.null(trueIntercept)){
            warning("trueIntercept missing; assuming an intercept of zero")
            ##cat("\n trueIntercept Missing; assuming intercept of zero \n")
            trueIntercept <- 0
        }
    }
    ## prediction measure used for comparison
    minMeasure <- Inf
    bestIndex <- 0
    allMeasures <- rep(0, maxIt)

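    ## model selection metrics, tracked only when the true coefficients
    ## are supplied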
    if(!is.null(trueBeta)){
        FP <- 0
        FD <- 0
        FN <- 0
        TPCount <- 0
        FPCount <- 0
        minMisClass <- Inf
        bestMisClassIndex <- 0
        allMisClass <- rep(0, maxIt)
    }
        

    if(!is.null(trueBeta)){
        ##if(addIntercept){
        ##    trueEta <- newX %*% c(trueIntercept, trueBeta) + offset
        ##} else{
            trueEta <- trueIntercept + newX %*% trueBeta + offset
        ##}
    }
  while(it < maxIt){
    it <- it + 1

    ## score measurement of path
    ## supposed to generate new X and Y to see how each
    ## step along path does in terms of some score equation
    ## or when doing cross validation, instead of using
    ## new X and Y, the remaining fold can be supplied


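    ## linear predictor and fitted values at step 'it' of the path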
    eta <- interceptPath[it] + newX%*%path[it,] + offset
    Yhat <- modelingFamily$linkinv(t(eta))
    if(is.null(trueBeta)){
        if(modelingFamily$family == "poisson"){
            if(is.null(scale)){
                ## If no scale is provided, then the deviance will be used
                modelLogLikelihood <- sum(stats::dpois(x = newY,
                                                lambda = Yhat,
                                                log = TRUE))
                saturatedLogLikelihood <- sum(stats::dpois(x = newY,
                                                     lambda = newY,
                                                     log = TRUE))

                measure <- -2*(modelLogLikelihood - saturatedLogLikelihood)
            } else{
                
                if(!((length(scale[it,]) == 1) || (length(scale[it,]) == length(Yhat)))){
                    stop("scale parameter provided is not appropriately sized; must have either a length of 1 or a length equal to length(newY)")
                }
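                ## squared Pearson residuals: (y - mu)^2 / (V(mu) * scale)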
                standardErrorSquared <- (newY - Yhat)^2/(modelingFamily$variance(Yhat)*scale[it,])

                
                measure <- sum(standardErrorSquared)
            }

           
            
        }else if(classification >= 0 && modelingFamily$family == "binomial"){
            error <- newY - (Yhat > classification)
            measure <- sum(abs(error))
        }else{ ## non-poisson/non-classification case
            
            error <- newY - Yhat
            measure <- sum(error^2)
        }
        
    } else{
        if(classification >= 0 && modelingFamily$family == "binomial"){
            error <- newY - (Yhat > classification)
            measure <- sum(abs(error))
        } else{
            ## If the true beta is given, then the MSE in terms of the
            ## linear predictor is used
            error <- trueEta - eta
            measure <- sum(error^2)
        }
    }

    allMeasures[it] <- measure
    
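    ## record the smallest prediction error seen so far and where it occurs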
    if(measure < minMeasure){
      bestIndex <- it
      minMeasure <- measure
      if(!is.null(trueBeta)){
          TPCount <- sum((trueBeta != 0) & (path[bestIndex,] != 0))
          FPCount <- sum((trueBeta == 0) & (path[bestIndex,] != 0))
          FP <- FPCount/sum(trueBeta == 0)
          FD <- sum((trueBeta == 0) & (path[bestIndex,] != 0))/sum(path[bestIndex,] != 0)
          FN <- sum((trueBeta != 0) & (path[bestIndex,] == 0))/sum(trueBeta != 0)

      }
      
    }

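    ## track the smallest count of coefficients mis-classified as
    ## important/unimportant along the path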
    if(!is.null(trueBeta)){
        misClass <- sum((trueBeta!=0) != (path[it,] !=0))
        allMisClass[it] <- misClass

        if(misClass < minMisClass){
            bestMisClassIndex <- it
            minMisClass <- misClass
        }
    }

    
  }
    ## if(modelingFamily$family == "binomial"){
    ##     minMeasure <- 1-minMeasure
    ## }

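    ## convert total errors to mean errors per observation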
    if(averaged){
        allMeasures <- allMeasures/nrow(newX)
        minMeasure <- minMeasure/nrow(newX)
    }

    if(!is.null(trueBeta)){
        summaryResults <-list(sgee = object,
                              intercept = addIntercept,
                              allMeasures = allMeasures,
                              minMeasure = minMeasure,
                              bestIndex = bestIndex,
                              trueBeta = trueBeta,
                              FP = FP,
                              FD = FD,
                              FN = FN,
                              TPCount = TPCount,
                              FPCount = FPCount,
                              allMisClass = allMisClass,
                              minMisClass = minMisClass,
                              bestMisClassIndex = bestMisClassIndex)
    } else{
        summaryResults <- list(sgee = object,
                               intercept = addIntercept,
                               allMeasures = allMeasures,
                               minMeasure = minMeasure,
                               bestIndex = bestIndex,
                               trueBeta = NULL)
    }

    class(summaryResults) <- "sgeeSummary"
    summaryResults$call <- object$call

    summaryResults
}
