R/lav_data.R

# constructor for the 'lavData' class
#
# the lavData class describes how the data looks like
#  - do we have a full data frame, or only sample statistics?
#    (TODO: allow for patterns + freq, if data is categorical)
#  - variable type ("numeric", "ordered", ...)
#  - how many groups, how many observations, ...
#  - what about missing patterns?
#
# initial version: YR 14 April 2012

# YR 23 feb 2017: blocks/levels/groups, but everything is group-based!

# FIXME: if nlevels > 1L, and ngroups > 1L, we should check that
# group is at the upper-level

# extract the data we need for this particular model
lavData <- function(data              = NULL,          # data.frame
                    group             = NULL,          # multiple groups?
                    cluster           = NULL,          # clusters?
                    ov.names          = NULL,          # variables in model
                    ov.names.x        = character(0),  # exo variables
                    ov.names.l        = list(),        # names per level
                    ordered           = NULL,          # ordered variables
                    sampling.weights  = NULL,          # sampling weights
                    sample.cov        = NULL,          # sample covariance(s)
                    sample.mean       = NULL,          # sample mean vector(s)
                    sample.nobs       = NULL,          # sample nobs
 
                    lavoptions        = lavOptions(),  # lavoptions
                    allow.single.case = FALSE          # for newdata in predict
                   ) 
{

    # get info from lavoptions

    # group.labels
    group.label <- lavoptions$group.label
    if(is.null(group.label)) {
        group.label <- character(0L)
    }

    # level.labels
    level.label <- lavoptions$level.label
    if(is.null(level.label)) {
        level.label <- character(0L)
    }

    # std.ov?
    std.ov <- lavoptions$std.ov
    if(is.null(std.ov)) {
        std.ov <- FALSE
    }

    # missing?
    missing <- lavoptions$missing
    if(is.null(missing) || missing == "default") {
        missing <- "listwise"
    }
    
    # warn?
    warn <- lavoptions$warn
    if(is.null(warn)) {
        warn <- TRUE
    }

    # four scenarios:
    #    0) data is already a lavData object: do nothing
    #    1) data is full data.frame (or a matrix)
    #    2) data are sample statistics only
    #    3) no data at all

    # 1) full data
    if(!is.null(data)) {
 
       # catch psindex/lavData objects
        if(inherits(data, "lavData")) {
            return(data)
        } else if(inherits(data, "psindex")) {
            return(data@Data)
        }

        # catch matrix 
        if(!is.data.frame(data)) {
            # is it a matrix?
            if(is.matrix(data)) {
                if(nrow(data) == ncol(data)) {
                    # perhaps it is a covariance matrix?
                    if(data[2,1] == data[1,2] && warn) { # not perfect...
                        warning("psindex WARNING: data argument looks like a covariance matrix; please use the sample.cov argument instead")
                    }
                } 
                # or perhaps it is a data matrix?
                ### FIXME, we should avoid as.data.frame() and handle
                ### data matrices directly
                data <- as.data.frame(data, stringsAsFactors = FALSE)
            } else {
                stop("psindex ERROR: data object of class ", class(data))
            }
        }

        lavData <- lav_data_full(data              = data,
                                 group             = group,
                                 cluster           = cluster,
                                 group.label       = group.label,
                                 level.label       = level.label,
                                 ov.names          = ov.names,
                                 ordered           = ordered,
                                 sampling.weights  = sampling.weights,
                                 ov.names.x        = ov.names.x,
                                 ov.names.l        = ov.names.l,
                                 std.ov            = std.ov,
                                 missing           = missing,
                                 warn              = warn,
                                 allow.single.case = allow.single.case)
        sample.cov <- NULL # not needed, but just in case
    }
    
    
    # 2) sample moments
    if(is.null(data) && !is.null(sample.cov)) {

        # for now: no levels!!
        nlevels <- 1L
        
        # we also need the number of observations (per group)
        if(is.null(sample.nobs))
            stop("psindex ERROR: please specify number of observations")

        # list?
        if(is.list(sample.cov)) {
            # multiple groups, multiple cov matrices
            if(!is.null(sample.mean)) {
                stopifnot(length(sample.mean) == length(sample.cov))
            }
            # multiple groups, multiple cov matrices
            ngroups     <- length(sample.cov)
            LABEL <- names(sample.cov)
            if(is.null(group.label) || length(group.label) == 0L) {
                if(is.null(LABEL))
                    group.label <- paste("Group ", 1:ngroups, sep="")
                else
                    group.label <- LABEL
            } else {
                if(is.null(LABEL)) {
                    stopifnot(length(group.label) == ngroups)
                } else {
                    # FIXME!!!!
                    # check if they match
                }   
            }
        } else {
            ngroups <- 1L; group.label <- character(0)
            if(!is.matrix(sample.cov))
                stop("psindex ERROR: sample.cov must be a matrix or a list of matrices")
            sample.cov <- list(sample.cov)
        }

        # get ov.names
        if (is.null(ov.names)) {
            ov.names <- lapply(sample.cov, row.names)            
        } else if (!is.list(ov.names)) {
            # duplicate ov.names for each group
            tmp <- ov.names; ov.names <- vector("list", length = ngroups)
            ov.names[1:ngroups] <- list(tmp)
        } else {
            if (length(ov.names) != ngroups)
                stop("psindex ERROR: ov.names assumes ", length(ov.names),
                     " groups; data contains ", ngroups, " groups")
            # nothing to do
        }

        # handle ov.names.x
        if(!is.list(ov.names.x)) {
            tmp <- ov.names.x; ov.names.x <- vector("list", length = ngroups)
            ov.names.x[1:ngroups] <- list(tmp)
        } else {
            if(length(ov.names.x) != ngroups)
                stop("psindex ERROR: ov.names.x assumes ", length(ov.names.x),
                     " groups; data contains ", ngroups, " groups")
        }

        ov <- list()
        ov$name <- unique( unlist(c(ov.names, ov.names.x)) )
        nvar    <- length(ov$name)
        ov$idx  <- rep(NA, nvar)
        ov$nobs <- rep(sample.nobs, nvar)
        ov$type <- rep("numeric", nvar)

        # if std.ov = TRUE, give a warning (suggested by Peter Westfall)
        if(std.ov && warn) {
            warning("psindex WARNING: std.ov argument is ignored if only sample statistics are provided.")
        }

        # construct lavData object
        lavData <- new("lavData",
                       data.type   = "moment",
                       ngroups     = ngroups, 
                       group       = character(0L),
                       nlevels     = 1L, # for now
                       cluster     = character(0L),
                       group.label = group.label,
                       level.label = character(0L),
                       nobs        = as.list(sample.nobs),
                       norig       = as.list(sample.nobs),
                       ov.names    = ov.names, 
                       ov.names.x  = ov.names.x,
                       ov.names.l  = ov.names.l,
                       ordered     = as.character(ordered),
                       weights     = vector("list", length = ngroups),
                       sampling.weights = character(0L),
                       ov          = ov,
                       std.ov      = FALSE,
                       missing     = "listwise",
                       case.idx    = vector("list", length = ngroups),
                       Mp          = vector("list", length = ngroups),
                       Rp          = vector("list", length = ngroups),
                       Lp          = vector("list", length = ngroups),
                       X           = vector("list", length = ngroups),
                       eXo         = vector("list", length = ngroups)
                      )

    }

    # 3) data.type = "none":  both data and sample.cov are NULL
    if(is.null(data) && is.null(sample.cov)) {

        # multilevel? --> ov.names.l should be filled in
        if(length(ov.names.l) > 0L) {
            nlevels <- length(ov.names.l[[1]]) # we assume the same number
                                               # of levels in each group!

            # do we have a cluster argument? if not, create one
            if(is.null(cluster)) {
                if(nlevels == 2L) {
                    cluster <- "cluster"
                } else {
                    cluster <- paste0("cluster", seq_len(nlevels - 1L))
                }
            }
 
            # default level.labels
            if(length(level.label) == 0L) {
                level.label <- c("within", cluster)
            } else {
                # check if length(level.label) = 1 + length(cluster)
                if(length(level.label) != length(cluster) + 1L) {
                    stop("psindex ERROR: length(level.label) != length(cluster) + 1L")
                }
                # nothing to do
            }
        } else {
            nlevels <- 1L
            cluster <- character(0L)
            level.label <- character(0L)
        }

        # ngroups: ov.names (when group: is used), or sample.nobs
        if(is.null(ov.names)) {
            warning("psindex WARNING: ov.names is NULL")
            ov.names <- character(0L)
            if(is.null(sample.nobs)) {
                ngroups <- 1L
                sample.nobs <- rep(list(0L), ngroups)
            } else {
                sample.nobs <- as.list(sample.nobs)
                ngroups <- length(sample.nobs)
            }
        } else if(!is.list(ov.names)) {
            if(is.null(sample.nobs)) {
                ngroups <- 1L
                sample.nobs <- rep(list(0L), ngroups)
            } else {
                sample.nobs <- as.list(sample.nobs)
                ngroups <- length(sample.nobs)
            }
            ov.names <- rep(list(ov.names), ngroups)
        } else if(is.list(ov.names)) {
            ngroups <- length(ov.names)
            if(is.null(sample.nobs)) {
                sample.nobs <- rep(list(0L), ngroups)
            } else {
                sample.nobs <- as.list(sample.nobs)
                if(length(sample.nobs) != ngroups) {
                    stop("psindex ERROR: length(sample.nobs) = ",
                          length(sample.nobs), 
                         " but syntax implies ngroups = ", ngroups)
                }
            }
        }


        # group.label
        if(ngroups > 1L) {
            if(is.null(group)) {
                group <- "group"
            }
            group.label <- paste("Group", 1:ngroups, sep="")
        } else {
            group <- character(0L)
            group.label <- character(0L)
        }

        # handle ov.names.x
        if(!is.list(ov.names.x)) {
            ov.names.x <- rep(list(ov.names.x), ngroups)
        }

        ov <- list()
        ov$name <- unique( unlist(c(ov.names, ov.names.x)) )
        nvar    <- length(ov$name)
        ov$idx  <- rep(NA, nvar)
        ov$nobs <- rep(0L, nvar)
        ov$type <- rep("numeric", nvar)
        ov$nlev <- rep(0L, nvar)

        # collect information per upper-level group
        Lp <- vector("list", length = ngroups)
        for(g in 1:ngroups) {
            if(nlevels > 1L) {
                Lp[[g]] <- lav_data_cluster_patterns(Y = NULL, clus = NULL,
                                                 cluster = cluster,
                                                 ov.names = ov.names[[g]],
                                                 ov.names.l = ov.names.l[[g]])
            }
        } # g

        # construct lavData object
        lavData <- new("lavData",
                       data.type   = "none",
                       ngroups     = ngroups,
                       group       = group,
                       nlevels     = nlevels,
                       cluster     = cluster,
                       group.label = group.label,
                       level.label = level.label,
                       nobs        = sample.nobs,
                       norig       = sample.nobs,
                       ov.names    = ov.names, 
                       ov.names.x  = ov.names.x,
                       ov.names.l  = ov.names.l,
                       ordered     = as.character(ordered),
                       weights     = vector("list", length = ngroups),
                       sampling.weights = character(0L),
                       ov          = ov,
                       missing     = "listwise",
                       case.idx    = vector("list", length = ngroups),
                       Mp          = vector("list", length = ngroups),
                       Rp          = vector("list", length = ngroups),
                       Lp          = Lp,
                       X           = vector("list", length = ngroups),
                       eXo         = vector("list", length = ngroups)
                      )
    }

    lavData
}


# handle full data
lav_data_full <- function(data          = NULL,          # data.frame
                          group         = NULL,          # multiple groups?
                          cluster       = NULL,
                          group.label   = NULL,          # custom group labels?
                          level.label   = NULL,
                          ov.names      = NULL,          # variables needed 
                                                         # in model
                          ordered       = NULL,          # ordered variables
                          sampling.weights = NULL,       # sampling weights
                          ov.names.x    = character(0L), # exo variables
                          ov.names.l    = list(),        # var per level
                          std.ov        = FALSE,         # standardize ov's?
                          missing       = "listwise",    # remove missings?
                          warn          = TRUE,          # produce warnings?
                          allow.single.case = FALSE      # allow single case?
                        )
{
    # number of groups and group labels
    if(!is.null(group) && length(group) > 0L) {
        if(!(group %in% names(data))) {
            stop("psindex ERROR: grouping variable ", sQuote(group),
                 " not found;\n  ",
                 "variable names found in data frame are:\n  ", 
                 paste(names(data), collapse=" "))
        }
        # note: by default, we use the order as in the data; 
        # not as in levels(data[,group])
        if(length(group.label) == 0L) {
            group.label <- unique(as.character(data[[group]]))
            if(warn && any(is.na(group.label))) {
                warning("psindex WARNING: group variable ", sQuote(group), 
                        " contains missing values\n", sep="")
            }
            group.label <- group.label[!is.na(group.label)]
        } else {
            group.label <- unique(as.character(group.label))
            # check if user-provided group labels exist
            LABEL <- unique(as.character(data[[group]]))
            idx <- match(group.label, LABEL)
            if(warn && any(is.na(idx))) {
                warning("psindex WARNING: some group.labels do not appear ",
                        "in the grouping variable: ",  
                        paste(group.label[which(is.na(idx))], collapse=" "))
            }
            group.label <- group.label[!is.na(idx)]
            # any groups left?
            if(length(group.label) == 0L)
                stop("psindex ERROR: no group levels left; check the group.label argument")
        }
        ngroups     <- length(group.label)
    } else {
        if(warn && length(group.label) > 0L)
            warning("psindex WARNING: `group.label' argument",
                    " will be ignored if `group' argument is missing")
        ngroups <- 1L
        group.label <- character(0L)
        group <- character(0L)
    }

    # sampling weights
    if(!is.null(sampling.weights)) {
        if(is.character(sampling.weights)) {
            if(!(sampling.weights %in% names(data))) {
                stop("psindex ERROR: sampling weights variable ", 
                     sQuote(sampling.weights),
                     " not found;\n  ",
                     "variable names found in data frame are:\n  ",
                     paste(names(data), collapse=" "))
            }
            # check for missing values in sampling weight variable
            if(any(is.na(data[[sampling.weights]]))) {
                stop("psindex ERROR: sampling.weights variable ",
                        sQuote(sampling.weights),
                        " contains missing values\n", sep = "")
            }
        } else {
            stop("psindex ERROR: sampling weights argument should be a variable name in the data.frame")
        }
    }

    # cluster
    # number of levels and level labels
    if(!is.null(cluster) && length(cluster) > 0L) {
        # cluster variable in data?
        if(!all(cluster %in% names(data))) {
            # which one did we not find?
            not.ok <- which(!cluster %in% names(data)) 

            stop("psindex ERROR: cluster variable(s) ", sQuote(cluster[not.ok]),
                 " not found;\n  ",
                 "variable names found in data frame are:\n  ", 
                 paste(names(data), collapse = " "))
        }
        # default level.labels
        if(length(level.label) == 0L) {
            level.label <- c("within", cluster)
        } else {
            # check if length(level.label) = 1 + length(cluster)
            if(length(level.label) != length(cluster) + 1L) {
                stop("psindex ERROR: length(level.label) != length(cluster) + 1L")
            }
            # nothing to do
        }
        # check for missing values in cluster variable(s)
        for(cl in 1:length(cluster)) {
            if(warn && any(is.na(data[[cluster[cl]]]))) {
                warning("psindex WARNING: cluster variable ",
                        sQuote(cluster[cl]),
                        " contains missing values\n", sep = "")
            }
        }
        nlevels <- length(level.label)
    } else {
        if(warn && length(level.label) > 0L)
            warning("psindex WARNING: `level.label' argument",
                    " will be ignored if `cluster' argument is missing")
        nlevels <- 1L
        level.label <- character(0L)
        cluster <- character(0L)
    }

    # ov.names (still needed???)
    if(is.null(ov.names)) {
        ov.names <- names(data)
        # remove 'group' name from ov.names
        if(length(group) > 0L) {
            group.idx <- which(ov.names == group)
            ov.names <- ov.names[-group.idx]
        }
        # remove 'cluster' names from ov.names
        if(length(cluster) > 0L) {
            cluster.idx <- which(ov.names %in% cluster)
            ov.names <- ov.names[-cluster.idx]
        }
    }

    # check ov.names vs ngroups
    if(ngroups > 1L) {
        if(is.list(ov.names)) {
            if(length(ov.names) != ngroups)
                stop("psindex ERROR: ov.names assumes ", length(ov.names),
                     " groups; data contains ", ngroups, " groups")
        } else {
            tmp <- ov.names
            ov.names <- vector("list", length = ngroups)
            ov.names[1:ngroups] <- list(tmp)
        }
        if(is.list(ov.names.x)) {
            if(length(ov.names.x) != ngroups)
                stop("psindex ERROR: ov.names assumes ", length(ov.names.x),
                     " groups; data contains ", ngroups, " groups")
        } else {
            tmp <- ov.names.x
            ov.names.x <- vector("list", length = ngroups)
            ov.names.x[1:ngroups] <- list(tmp)
        }
    } else {
        if(is.list(ov.names)) {
            if(length(ov.names) > 1L)
                stop("psindex ERROR: model syntax defines multiple groups; data suggests a single group")
        } else {
            ov.names <- list(ov.names)
        }
        if(is.list(ov.names.x)) {
            if(length(ov.names.x) > 1L)
                stop("psindex ERROR: model syntax defines multiple groups; data suggests a single group")
        } else {
            ov.names.x <- list(ov.names.x)
        }
    }

    # check if all ov.names can be found in the data.frame
    for(g in 1:ngroups) {
        # does the data contain all the observed variables
        # needed in the user-specified model for this group
        ov.all <- unique(ov.names[[g]], ov.names.x[[g]]) # no overlap if categ

        # handle interactions
        ov.int.names <- ov.all[ grepl(":", ov.all) ]
        n.int <- length(ov.int.names)
        if(n.int > 0L) {
            ov.names.noint <- ov.all[!ov.all %in% ov.int.names]
            for(iv in seq_len(n.int)) {
                NAMES <- strsplit(ov.int.names[iv], ":", fixed = TRUE)[[1L]]
                if(all(NAMES %in% ov.names.noint)) {
                    # add this interaction term to the data.frame, unless
                    # it already exists
                    if(is.null(data[[ ov.int.names[iv] ]])) {
                        data[[ ov.int.names[iv] ]] <- 
                            data[[NAMES[1L]]] * data[[NAMES[2L]]]
                    }
                }
            }
        }

        # check for missing observed variables
        idx.missing <- which(!(ov.all %in% names(data)))

        if(length(idx.missing)) {
            stop("psindex ERROR: missing observed variables in dataset: ",
                 paste(ov.all[idx.missing], collapse=" "))
        }
    }


    # here, we know for sure all ov.names exist in the data.frame
    # create varTable
    # FIXME: should we add the 'group'/'cluster' variable (no for now)
    ov <- lav_dataframe_vartable(frame = data, ov.names = ov.names, 
                                 ov.names.x = ov.names.x, ordered = ordered,
                                 as.data.frame. = FALSE)

    # do some checking
    # check for unordered factors (but only if nlev > 2)
    if("factor" %in%  ov$type) {
        f.names <- ov$name[ov$type == "factor" & ov$nlev > 2L]
        if(warn && any(f.names %in% unlist(ov.names)))
            warning(paste("psindex WARNING: unordered factor(s) with more than 2 levels detected in data:", paste(f.names, collapse=" ")))
    }
    # check for ordered exogenous variables
    if("ordered" %in% ov$type[ov$name %in% unlist(ov.names.x)]) {
        f.names <- ov$name[ov$type == "ordered" & 
                           ov$name %in% unlist(ov.names.x)]
        if(warn && any(f.names %in% unlist(ov.names.x)))
            warning(paste("psindex WARNING: exogenous variable(s) declared as ordered in data:", paste(f.names, collapse=" ")))
    }
    # check for zero-cases
    idx <- which(ov$nobs == 0L | ov$var == 0)
    if(length(idx) > 0L) {
        OV <- as.data.frame(ov)
        rn <- rownames(OV)
        rn[idx] <- paste(rn[idx], "***", sep="")
        rownames(OV) <- rn
        print(OV)
        stop("psindex ERROR: some variables have no values (only missings) or no variance")
    }
    # check for single cases (no variance!)
    idx <- which(ov$nobs == 1L | (ov$type == "numeric" & !is.finite(ov$var)))
    if(!allow.single.case && length(idx) > 0L) {
        OV <- as.data.frame(ov)
        rn <- rownames(OV)
        rn[idx] <- paste(rn[idx], "***", sep="")
        rownames(OV) <- rn
        print(OV)
        stop("psindex ERROR: some variables have only 1 observation or no finite variance")
    }
    # check for ordered variables with only 1 level
    idx <- which(ov$type == "ordered" & ov$nlev == 1L)
    if(length(idx) > 0L) {
        OV <- as.data.frame(ov)
        rn <- rownames(OV)
        rn[idx] <- paste(rn[idx], "***", sep="")
        rownames(OV) <- rn
        print(OV)
        stop("psindex ERROR: ordered variable(s) has/have only 1 level")
    }
    # check for mix small/large variances (NOT including exo variables)
    if(!std.ov && !allow.single.case && warn && any(ov$type == "numeric")) {
        num.idx <- which(ov$type == "numeric" & ov$exo == 0L)
        if(length(num.idx) > 0L) {
            min.var <- min(ov$var[num.idx])
            max.var <- max(ov$var[num.idx])
            rel.var <- max.var/min.var
            if(warn && rel.var > 1000) {
                warning("psindex WARNING: some observed variances are (at least) a factor 1000 times larger than others; use varTable(fit) to investigate")
            }
        }
    }
    # check for all-exogenous variables (eg in f <~ x1 + x2 + x3)
    if(warn && all(ov$exo == 1L)) {
        warning("psindex WARNING: all observed variables are exogenous; model may not be identified")
    }

    # prepare empty lists 
   
    # group-based
    case.idx <- vector("list", length = ngroups)
    Mp       <- vector("list", length = ngroups)
    Rp       <- vector("list", length = ngroups)
    norig    <- vector("list", length = ngroups)
    nobs     <- vector("list", length = ngroups)
    X        <- vector("list", length = ngroups)
    eXo      <- vector("list", length = ngroups)
    Lp       <- vector("list", length = ngroups)
    weights  <- vector("list", length = ngroups)

    # collect information per upper-level group
    for(g in 1:ngroups) {

        # extract variables in correct order
        ov.idx  <- ov$idx[match(ov.names[[g]],   ov$name)]
        exo.idx <- ov$idx[match(ov.names.x[[g]], ov$name)] 
        all.idx <- unique(c(ov.idx, exo.idx))

        # extract cases per group
        if(ngroups > 1L || length(group.label) > 0L) {
            if(missing == "listwise") {
                case.idx[[g]] <- which(data[[group]] == group.label[g] &
                                           complete.cases(data[all.idx]))
                nobs[[g]] <- length(case.idx[[g]])
                norig[[g]] <- length(which(data[[group]] == group.label[g]))
            #} else if(missing == "pairwise" && length(exo.idx) > 0L) {
            #    case.idx[[g]] <- which(data[[group]] == group.label[g] &
            #                           complete.cases(data[exo.idx]))
            #    nobs[[g]] <- length(case.idx[[g]])
            #    norig[[g]] <- length(which(data[[group]] == group.label[g]))
            } else if(length(exo.idx) > 0L) {
                case.idx[[g]] <- which(data[[group]] == group.label[g] &
                                       complete.cases(data[exo.idx]))
                nobs[[g]] <- length(case.idx[[g]])
                norig[[g]] <- length(which(data[[group]] == group.label[g]))
                if(warn && (nobs[[g]] < norig[[g]])) {
                    warning("psindex WARNING: ", (nobs[[g]] - norig[[g]]),
                        " cases were deleted in group ", group.label[g], 
                        " due to missing values in ",
                        "\n\t\t  exogenous variable(s), while fixed.x = TRUE.")
                }
            } else {
                case.idx[[g]] <- which(data[[group]] == group.label[g])
                nobs[[g]] <- norig[[g]] <- length(case.idx[[g]])
            }
        } else {
            if(missing == "listwise") {
                case.idx[[g]] <- which(complete.cases(data[all.idx]))
                nobs[[g]] <- length(case.idx[[g]])
                norig[[g]] <- nrow(data)
            #} else if(missing == "pairwise" && length(exo.idx) > 0L) {
            #    case.idx[[g]] <- which(complete.cases(data[exo.idx]))
            #    nobs[[g]] <- length(case.idx[[g]])
            #    norig[[g]] <- nrow(data)
            } else if(length(exo.idx) > 0L) {
                case.idx[[g]] <- which(complete.cases(data[exo.idx]))
                nobs[[g]] <- length(case.idx[[g]])
                norig[[g]] <- nrow(data)
                if(warn && (nobs[[g]] < norig[[g]])) {
                    warning("psindex WARNING: ", (norig[[g]] - nobs[[g]]),
                        " cases were deleted due to missing values in ",
                        "\n\t\t  exogenous variable(s), while fixed.x = TRUE.")
                }
            } else {
                case.idx[[g]] <- 1:nrow(data)
                nobs[[g]] <- norig[[g]] <- length(case.idx[[g]])
            }
        }

        # extract data
        X[[g]] <- data.matrix( data[case.idx[[g]], ov.idx, drop = FALSE] )
        dimnames(X[[g]]) <- NULL ### copy?

        if(!is.null(sampling.weights)) {
            WT <- data[[sampling.weights]][case.idx[[g]]]
            if(any(WT < 0)) {
                stop("psindex ERROR: some sampling weights are negative")
            }

            # check for missing values in sampling weight variable
            if(any(is.na(WT))) {
                stop("psindex ERROR: sampling.weights variable ",
                        sQuote(sampling.weights),
                        " contains missing values\n", sep = "")
            }

            # rescale, so sum equals sample size in this group
            WT2 <- WT / sum(WT) * nobs[[g]]
            weights[[g]] <- WT2
        }

        # construct integers for user-declared 'ordered' factors
        # FIXME: is this really (always) needed???
        #  (but still better than doing lapply(data[,idx], ordered) which
        #   generated even more copies)
        user.ordered.names <- ov$name[ov$type == "ordered" & ov$user == 1L]
        user.ordered.idx <- which(ov.names[[g]] %in% user.ordered.names)
        if(length(user.ordered.idx) > 0L) {
            for(i in user.ordered.idx) {
                X[[g]][,i] <- as.numeric(as.factor(X[[g]][,i]))
            }
        }

        ## FIXME: 
        ## - why also in X? (for samplestats, for now)
        if(length(exo.idx) > 0L) {
            eXo[[g]] <- data.matrix(data[case.idx[[g]], exo.idx, drop = FALSE])
            dimnames(eXo[[g]]) <- NULL
        } else {
            eXo[g] <- list(NULL)
        }

        # standardize observed variables? numeric only!
        if(std.ov) {
            num.idx <- which(ov.names[[g]] %in% ov$name & ov$type == "numeric")
            if(length(num.idx) > 0L) {
                X[[g]][,num.idx] <- 
                   scale(X[[g]][,num.idx,drop = FALSE])[,,drop = FALSE] 
                # three copies are made!!!!!
            }
            if(length(exo.idx) > 0L) {
                eXo[[g]] <- scale(eXo[[g]])[,,drop = FALSE]
            }
        }

        # missing data
        if(missing != "listwise") {
            # get missing patterns
            Mp[[g]] <- lav_data_missing_patterns(X[[g]], 
                           sort.freq = TRUE, coverage = TRUE)
            # checking!
            if(length(Mp[[g]]$empty.idx) > 0L) {
                empty.case.idx <- Mp[[g]]$empty.idx
                if(warn) {
                    warning("psindex WARNING: some cases are empty and will be ignored:\n  ", paste(empty.case.idx, collapse=" "))
                }
            }
            if(warn && any(Mp[[g]]$coverage < 0.1)) {
                warning("psindex WARNING: due to missing values, some pairwise combinations have less than 10% coverage")
            }
            # in case we had observations with only missings
            nobs[[g]] <- NROW(X[[g]]) - length(Mp[[g]]$empty.idx)
        }

        # response patterns (categorical only, no exogenous variables)
        all.ordered <- all(ov.names[[g]] %in% ov$name[ov$type == "ordered"])
        if(length(exo.idx) == 0L && all.ordered) {
            Rp[[g]] <- lav_data_resp_patterns(X[[g]])
        }

        # warn if we have a small number of observations (but NO error!)
        if( !allow.single.case && warn && 
            nobs[[g]] < (nvar <- length(ov.idx)) ) {
            txt <- ""
            if(ngroups > 1L) txt <- paste(" in group ", g, sep="")
            warning("psindex WARNING: small number of observations (nobs < nvar)", txt,
                    "\n  nobs = ", nobs[[g]], " nvar = ", nvar)
        }

        # cluster information
        if(nlevels > 1L) {
            # extract cluster variable(s), for this group
            clus <- data.matrix(data[case.idx[[g]], cluster])
            Lp[[g]] <- lav_data_cluster_patterns(Y = X[[g]], clus = clus,
                                                 cluster = cluster,
                                                 ov.names = ov.names[[g]],
                                                 ov.names.l = ov.names.l[[g]])
        }

    } # groups, at first level 

    if(is.null(sampling.weights)) {
        sampling.weights <- character(0L)
    }

    lavData <- new("lavData",
                   data.type       = "full",
                   ngroups         = ngroups,
                   group           = group,
                   nlevels         = nlevels,
                   cluster         = cluster,
                   group.label     = group.label,
                   level.label     = level.label,
                   std.ov          = std.ov,
                   nobs            = nobs,
                   norig           = norig,
                   ov.names        = ov.names,
                   ov.names.x      = ov.names.x,
                   ov.names.l      = ov.names.l,
                   #ov.types        = ov.types,
                   #ov.idx          = ov.idx,
                   ordered         = as.character(ordered),
                   weights         = weights,
                   sampling.weights = sampling.weights,
                   ov              = ov,
                   case.idx        = case.idx,
                   missing         = missing,
                   X               = X,
                   eXo             = eXo,
                   Mp              = Mp,
                   Rp              = Rp,
                   Lp              = Lp
                  )
    lavData                     
}

# get missing patterns
lav_data_missing_patterns <- function(Y, sort.freq = FALSE, coverage = FALSE) {

    # construct TRUE/FALSE matrix: TRUE if value is observed
    OBS <- !is.na(Y)

    # empty cases
    empty.idx <- which(rowSums(OBS) == 0L)

    # this is what we did in < 0.6
    #if(length(empty.idx) > 0L) {
    #    OBS <- OBS[-empty.idx,,drop = FALSE]
    #}

    # pattern of observed values per observation
    case.id <- apply(1L * OBS, 1L, paste, collapse = "")

    # remove empty patterns
    if(length(empty.idx)) {
        case.id.nonempty <- case.id[-empty.idx]
    } else {
        case.id.nonempty <- case.id
    }

    # sort non-empty patterns (from high occurence to low occurence)
    if(sort.freq) {
        TABLE <- sort(table(case.id.nonempty), decreasing = TRUE)
    } else {
        TABLE <- table(case.id.nonempty)
    }

    # unique pattern ids
    pat.id <- names(TABLE)

    # number of patterns
    pat.npatterns  <- length(pat.id)

    # case idx per pattern
    pat.case.idx <- lapply(seq_len(pat.npatterns), 
                           function(p) which(case.id == pat.id[p]))

    # unique pattern frequencies
    pat.freq <- as.integer(TABLE)

    # first occurrence of each pattern
    pat.first <- match(pat.id, case.id)

    # TRUE/FALSE for each pattern
    pat.obs <- OBS[pat.first,,drop = FALSE] # observed per pattern

    Mp <- list(npatterns = pat.npatterns, id = pat.id, freq = pat.freq,
               case.idx = pat.case.idx, pat = pat.obs, empty.idx = empty.idx)

    if(coverage) {
        # FIXME: if we have empty cases, include them in N?
        # no for now
        Mp$coverage <- crossprod(OBS) / sum(pat.freq)
        #Mp$coverage <- crossprod(OBS) / NROW(Y)
    }

    Mp
}

# get response patterns (ignore empty cases!)
lav_data_resp_patterns <- function(Y) {

    # construct TRUE/FALSE matrix: TRUE if value is observed
    OBS <- !is.na(Y)

    # empty cases
    empty.idx <- which(rowSums(OBS) == 0L)

    # removeYempty cases
    if(length(empty.idx) > 0L) {
        Y <- Y[-empty.idx,,drop = FALSE]
    }

    ntotal <- nrow(Y); nvar <- ncol(Y)

    # identify, label and sort response patterns 
    id <- apply(Y, MARGIN = 1, paste, collapse = "")

    # sort patterns (from high occurence to low occurence)
    TABLE <- sort(table(id), decreasing = TRUE)
    order <- names(TABLE)
    npatterns <- length(TABLE)
    pat <- Y[match(order, id), , drop = FALSE]
    row.names(pat) <- as.character(TABLE)

    # handle NA?
    Y[is.na(Y)] <- -9
    total.patterns <- prod(apply(Y, 2, function(x) length(unique(x))))
    empty.patterns <- total.patterns - npatterns
    # return a list
    #out <- list(nobs=ntotal, nvar=nvar,
    #            id=id, npatterns=npatterns,
    #            order=order, pat=pat)

    # only return pat
    out <- list(npatterns=npatterns, pat=pat, total.patterns=total.patterns,
                empty.patterns=empty.patterns)

    out
}

# get cluster information
# - cluster can be a vector!
# - clus can contain multiple columns!
lav_data_cluster_patterns <- function(Y = NULL, clus = NULL, cluster = NULL,
                                      ov.names, ov.names.l) {

    # how many levels?
    nlevels <- length(cluster) + 1L

    # did we get any data (or is this just for simulateData)
    if(!is.null(Y) && !is.null(clus)) {
        haveData <- TRUE
    } else {
        haveData <- FALSE
    }

    # check clus
    if(haveData) {
        stopifnot(ncol(clus) == (nlevels - 1L), nrow(Y) == nrow(clus))
    }
    
    cluster.size    <- vector("list", length = nlevels)
    cluster.id      <- vector("list", length = nlevels)
    cluster.idx     <- vector("list", length = nlevels)
    nclusters       <- vector("list", length = nlevels)
    cluster.sizes   <- vector("list", length = nlevels)
    ncluster.sizes  <- vector("list", length = nlevels)
    cluster.size.ns <- vector("list", length = nlevels)
    ov.idx          <- vector("list", length = nlevels)
    both.idx        <- vector("list", length = nlevels)
    within.idx      <- vector("list", length = nlevels)
    between.idx     <- vector("list", length = nlevels)
    both.names      <- vector("list", length = nlevels)
    within.names    <- vector("list", length = nlevels)
    between.names   <- vector("list", length = nlevels)

    # level-1 is special
    if(haveData) {
        nclusters[[1]] <- NROW(Y)
    }
    ov.idx[[1]]    <- match(ov.names.l[[1]], ov.names)

    # for the remaining levels...
    for(l in 2:nlevels) {
        if(haveData) {
            CLUS <- clus[,(l-1L)]
            cluster.id[[l]]      <- unique(CLUS)
            cluster.idx[[l]]     <- match(CLUS, cluster.id[[l]])
            cluster.size[[l]]    <- tabulate(cluster.idx[[l]])
            nclusters[[l]]       <- length(cluster.size[[l]])
            cluster.sizes[[l]]   <- unique(cluster.size[[l]])
            ncluster.sizes[[l]]  <- length(cluster.sizes[[l]])
            cluster.size.ns[[l]] <- as.integer(table(factor(cluster.size[[l]], 
                                     levels = as.character(cluster.sizes[[l]]))))
        } else {
            cluster.id[[l]]      <- integer(0L)
            cluster.idx[[l]]     <- integer(0L)
            cluster.size[[l]]    <- integer(0L)
            nclusters[[l]]       <- integer(0L)
            cluster.sizes[[l]]   <- integer(0L)
            ncluster.sizes[[l]]  <- integer(0L)
            cluster.size.ns[[l]] <- integer(0L)
        }

        # index of ov.names for this level
        ov.idx[[l]]         <- match(ov.names.l[[l]], ov.names)
 
        both.idx[[l]]       <- which( ov.names %in% ov.names.l[[1]] & 
                                      ov.names %in% ov.names.l[[2]])
        within.idx[[l]]     <- which( ov.names %in% ov.names.l[[1]] & 
                                     !ov.names %in% ov.names.l[[2]])
        between.idx[[l]]    <- which(!ov.names %in% ov.names.l[[1]] &
                                      ov.names %in% ov.names.l[[2]])

        # names
        both.names[[l]]     <- ov.names[ ov.names %in% ov.names.l[[1]] &
                                         ov.names %in% ov.names.l[[2]] ]
        within.names[[l]]   <- ov.names[ ov.names %in% ov.names.l[[1]] &
                                        !ov.names %in% ov.names.l[[2]] ]
        between.names[[l]]  <- ov.names[!ov.names %in% ov.names.l[[1]] &
                                         ov.names %in% ov.names.l[[2]] ]
    }

    out <- list(cluster = cluster, # clus = clus, 
                # per level
                nclusters = nclusters,
                cluster.size = cluster.size, cluster.id = cluster.id,
                cluster.idx = cluster.idx, cluster.sizes = cluster.sizes,
                ncluster.sizes = ncluster.sizes, 
                cluster.size.ns = cluster.size.ns,
                ov.idx = ov.idx, both.idx = both.idx, within.idx = within.idx, 
                between.idx = between.idx,
                both.names = both.names, within.names = within.names,
                between.names = between.names)

    out
}

setMethod("show", "lavData",
function(object) {
    # print 'lavData' object
    lav_data_print_short(object)
})

lav_data_print_short <- function(object) {

    lavdata <- object

    # listwise deletion?
    listwise <- FALSE
    for(g in 1:lavdata@ngroups) {
       if(lavdata@nobs[[1L]] != lavdata@norig[[1L]]) {
           listwise <- TRUE
           break
       }
    }

    #cat("Data information:\n\n")
    if(lavdata@ngroups == 1L) {
        if(listwise) {
            cat(sprintf("  %-40s", ""), sprintf("  %10s", "Used"),
                                        sprintf("  %10s", "Total"),
                "\n", sep="")
        }
        t0.txt <- sprintf("  %-40s", "Number of observations")
        t1.txt <- sprintf("  %10i", lavdata@nobs[[1L]])
        t2.txt <- ifelse(listwise,
                  sprintf("  %10i", lavdata@norig[[1L]]), "")
        cat(t0.txt, t1.txt, t2.txt, "\n", sep="")

        if( (.hasSlot(lavdata, "nlevels")) && # in case we have an old obj
            (lavdata@nlevels > 1L) ) {
            #cat("\n")
            for(l in 2:lavdata@nlevels) {
                t0.txt <- sprintf("  %-40s", 
                    paste("Number of clusters [", lavdata@cluster[l-1], "]",
                          sep = ""))
                t1.txt <- sprintf("  %10i", lavdata@Lp[[1]]$nclusters[[l]])
                #t2.txt <- ifelse(listwise,
                #          sprintf("  %10i", lavdata@norig[[1L]]), "")
                t2.txt <- ""
                cat(t0.txt, t1.txt, t2.txt, "\n", sep="")
            }
        }
    } else {
        if(listwise) {
            cat(sprintf("  %-40s", ""), sprintf("  %10s", "Used"),
                                       sprintf("  %10s", "Total"),
               "\n", sep="")
        }
        t0.txt <- sprintf("  %-40s", "Number of observations per group")
        cat(t0.txt, "\n")
        for(g in 1:lavdata@ngroups) {
            t.txt <- sprintf("  %-40s  %10i", lavdata@group.label[[g]],
                                              lavdata@nobs[[g]])
            t2.txt <- ifelse(listwise,
                      sprintf("  %10i", lavdata@norig[[g]]), "")
            cat(t.txt, t2.txt, "\n", sep="")

            if( (.hasSlot(lavdata, "nlevels")) &&
                (lavdata@nlevels > 1L) ) {
                #cat("\n")
                for(l in 2:lavdata@nlevels) {
                    t0.txt <- sprintf("  %-40s", 
                        paste("Number of clusters [", lavdata@cluster[l-1], "]",
                              sep = ""))
                    t1.txt <- sprintf("  %10i", 
                                      lavdata@Lp[[g]]$nclusters[[l]])
                    #t2.txt <- ifelse(listwise,
                    #          sprintf("  %10i", lavdata@norig[[1L]]), "")
                    t2.txt <- ""
                    cat(t0.txt, t1.txt, t2.txt, "\n", sep="")
                }
            }
        }
    }

    # missing patterns?
    if(!is.null(lavdata@Mp[[1L]])) {
        if(lavdata@ngroups == 1L) {
            t0.txt <- sprintf("  %-40s", "Number of missing patterns")
            t1.txt <- sprintf("  %10i",
                              lavdata@Mp[[1L]]$npatterns)
            cat(t0.txt, t1.txt, "\n", sep="")
        } else {
            t0.txt <- sprintf("  %-40s", "Number of missing patterns per group")
            cat(t0.txt, "\n")
            for(g in 1:lavdata@ngroups) {
                t.txt <- sprintf("  %-40s  %10i", lavdata@group.label[[g]],
                                 lavdata@Mp[[g]]$npatterns)
                cat(t.txt, "\n", sep="")
            }
        }
    }

    # sampling weights?
    if( (.hasSlot(lavdata, "weights")) && # in case we have an old object
        (!is.null(lavdata@weights[[1L]])) ) {
        t0.txt <- sprintf("  %-30s", "Sampling Weights variable")
        t1.txt <- sprintf("  %20s", lavdata@sampling.weights)
        cat(t0.txt, t1.txt, "\n", sep="")
    }

    cat("\n")
}
nietsnel/psindex documentation built on June 22, 2019, 10:56 p.m.