R/summarizeNA.R
In LMMstar: Repeated Measurement Models for Discrete Times

Documented in summarizeNA

### summarizeNA.R --- 
##----------------------------------------------------------------------
## Author: Brice Ozenne
## Created: dec  7 2022 (17:13) 
## Version: 
## Last-Updated: jul 10 2023 (18:16) 
##           By: Brice Ozenne
##     Update #: 51
##----------------------------------------------------------------------
## 
### Commentary: 
## 
### Change Log:
##----------------------------------------------------------------------
## 
### Code:

## * summarizeNA (documentation)
##' @title Summarize missing data patterns
##' @description Summarize missing data patterns.
##'
##' @param data [data.frame] dataset containing the observations.
##' @param repetition [formula] Specify the structure of the data when in the long format: the time/repetition variable and the grouping variable, e.g. ~ time|id.
##' When specified the missing data pattern is specific to each variable not present in the formula. 
##' @param sep [character] character used to separate the missing data indicator (0/1) when naming the missing data patterns.
##' @param newnames [character vector of length 4] additional column containing the variable name (only when argument \code{repetition} is used),
##' the frequency of the missing data pattern in the dataset, the name of the missing data pattern in the dataset, and the number of missing data per pattern.
##' @param keep.data [logical] should the indicator of missing data per variable in the original dataset per pattern be output.
##'
##' @return a data frame
##' 
##' @keywords utilities
##'
##' @seealso
##' \code{\link{autoplot.summarizeNA}} for a graphical display.
##' 
##' @examples
##' data(gastricbypassW, package = "LMMstar")
##' summarizeNA(gastricbypassW) 
##' summarizeNA(gastricbypassW, keep.data = FALSE)
##' 
##' data(gastricbypassL, package = "LMMstar")
##' summarizeNA(gastricbypassL, repetition = ~time|id)
##' 
##' data(calciumL, package = "LMMstar")
##' mp <- summarizeNA(calciumL, repetition = ~visit|girl)
##' plot(mp, variable = "bmd")
##' summarizeNA(calciumL[,c("visit","girl","bmd")], repetition = ~visit|girl)
##' 
##' data(vasscoresW, package = "LMMstar")
##' summarizeNA(vasscoresW)

## * summarizeNA (code)
##' @export
summarizeNA <- function(data, repetition = NULL, sep = "",
                        newnames = c("variable","frequency","missing.pattern","n.missing"),
                        keep.data = TRUE){

    ## ** check and normalize user input

    ## *** check data
    data <- as.data.frame(data)
    name.all <- names(data)
    if(any(name.all %in% newnames) && keep.data){
        invalid <- name.all[name.all %in% newnames]
        stop("Name(s) \"",paste(invalid, collapse = "\" \""),"\" is used internally. \n",
             "Consider renaming the variables in the dataset. \n",
             sep = "")
    }

    ## *** handle repetition
    if(!is.null(repetition)){

        detail.formula <- formula2var(repetition, name.argument = "repetition")
        var.time <- detail.formula$var$time
        if(length(var.time)==0){
            stop("Missing time variable in argument \'repetition\'. \n",
                 "Should be something like: ~time|cluster. \n")
        }
        if(any(var.time %in% name.all == FALSE)){
            stop("Mismatch between argument \'repetition\' and argument \'data\'. \n",
                 "Could not find the time variable in the dataset.\n")
        }

        var.cluster <- detail.formula$var$cluster
        if(length(var.cluster)==0){
            stop("Missing cluster variable in argument \'repetition\'. \n",
                 "Should be something like: ~time|cluster. \n")
        }else if(length(var.cluster)>1){
            stop("Too many cluster variables in argument \'repetition\'. \n",
                 "Should be something like: ~time|cluster. \n")
        }
        if(any(var.cluster %in% name.all == FALSE)){
            stop("Mismatch between argument \'repetition\' and argument \'data\'. \n",
                 "Could not find the cluster variable in the dataset.\n")
        }

        name.Y <- setdiff(name.all, c(var.time,var.cluster))
        if(length(name.Y)==0){
            stop("Missing column in argument \'data\'. \n",
                 "There should be at least one column other than the time and cluster variable. \n")
        }

        if(!is.factor(data[[var.time]])){
            data[[var.time]] <- as.factor(data[[var.time]])
        }
        Utime <- levels(data[[var.time]])
            
        ls.data <- stats::setNames(lapply(name.Y, function(iY){ ## iY <- name.Y[1]
            stats::reshape(data[,c(var.cluster,var.time,iY)], direction = "wide", timevar = var.time, idvar = var.cluster, varying = Utime)
        }), name.Y)
    }

    ## ** warper

    warper.pattern <- function(iData, sep){ ## iData <- ls.data[[1]]
        iMtest <- is.na(iData)*1.0
        iVtest <- nlme::collapse(iMtest, sep = sep, as.factor = TRUE)
        iUpattern <- levels(iVtest)
        iUpattern.nobs <- unname(table(iVtest))
    
        iIndex0.Upattern <- which(duplicated(iVtest)==FALSE)
        iIndex0.Upattern <- stats::setNames(iIndex0.Upattern,iVtest[iIndex0.Upattern])
        iMtest.Upattern <- iMtest[iIndex0.Upattern[iUpattern],,drop=FALSE]

        if(keep.data){
            iOut <- data.frame(as.numeric(iUpattern.nobs), iUpattern, rowSums(iMtest.Upattern), iMtest.Upattern)
            names(iOut) <- c(newnames[2:4], names(iData))
        }else{
            iOut <- data.frame(as.numeric(iUpattern.nobs), iUpattern, rowSums(iMtest.Upattern))
            names(iOut) <- c(newnames[2:4])
        }

        return(iOut)
    }
    
    ## ** identify patterns
    if(is.null(repetition)){
        df.pattern <- warper.pattern(data, sep = sep)
    }else{
        ls.df.pattern <- lapply(name.Y, function(iY){
            iDf <- cbind(iY, warper.pattern(ls.data[[iY]], sep = sep))
            names(iDf)[1] <- newnames[1]
            return(iDf)
        })
        df.pattern <- do.call(rbind,ls.df.pattern)
        rownames(df.pattern) <- NULL
    }


    ## ** export
    attr(df.pattern,"args") <- list(newnames = newnames, keep.data = keep.data, repetition = repetition, sep = sep)

    class(df.pattern) <- append("summarizeNA", class(df.pattern))
    return(df.pattern)
}

##----------------------------------------------------------------------
### summarizeNA.R ends here