R/cv_two_part.R

Defines functions getmin cvcompute

#' Cross validation for hd2part models
#'
#' @param x an n x p matrix of covariates for the zero part data, where each row is an observation
#' and each column is a predictor. MUST be ordered such that the first n_s rows align with the
#' observations in x_s and s
#' @param z a length n vector of responses taking values 1 and 0, where 1 indicates the response is positive
#' and zero indicates the response has value 0. MUST be ordered such that the first n_s values align with the
#' observations in x_s and s
#' @param x_s an n_s x p matrix of covariates (which is a submatrix of x) for the positive part data, where
#' each row is an observation and each column is a predictor
#' @param s a length n_s vector of responses taking strictly positive values
#' @param weights a length n vector of observation weights for the zero part data
#' @param weights_s a length n_s vector of observation weights for the positive part data
#' @param offset a length n vector of offset terms for the zero part data
#' @param offset_s a length n_s vector of offset terms for the positive part data
#' @param lambda A user supplied lambda sequence. By default, the program computes
#' its own lambda sequence based on nlambda and lambda.min.ratio. Supplying
#' a value of lambda overrides this.
#' @param type.measure measure to evaluate for cross-validation. Will add more description later
#' @param nfolds number of folds for cross-validation. default is 10. 3 is smallest value allowed.
#' @param foldid an optional vector of values between 1 and nfold specifying which fold each observation belongs to.
#' @param grouped Like in \pkg{glmnet}, this is an experimental argument, with default \code{TRUE}, and can be ignored by most users.
#' For all models, this refers to computing nfolds separate statistics, and then using their mean and estimated standard
#' error to describe the CV curve. If \code{grouped = FALSE}, an error matrix is built up at the observation level from the
#' predictions from the \code{nfold} fits, and then summarized (does not apply to \code{type.measure = "auc"}).
#' @param keep If \code{keep = TRUE}, a prevalidated list of arrasy is returned containing fitted values for each observation
#' and each value of lambda for each model. This means these fits are computed with this observation and the rest of its
#' fold omitted. The folid vector is also returned. Default is \code{keep = FALSE}
#' @param parallel If TRUE, use parallel foreach to fit each fold. Must register parallel before hand, such as \pkg{doMC}.
#' @param ... other parameters to be passed to \code{\link[personalized2part]{hd2part}} function
#' @export
#' @examples
#' set.seed(1)
cv.hd2part <- function (x,   z,
                        x_s, s,
                        weights      = rep(1, NROW(x)),
                        weights_s    = rep(1, NROW(x_s)),
                        offset       = NULL,
                        offset_s     = NULL,
                        lambda       = NULL,
                        type.measure = c("mae", "mse", "sep-auc-mse", "sep-auc-mae"),
                        nfolds       = 10,
                        foldid       = NULL,
                        grouped      = TRUE,
                        keep         = FALSE,
                        parallel     = FALSE,
                        ...)
{
    type.measure <- match.arg(type.measure)
    if (!is.null(lambda) && length(lambda) < 2)
    {
        stop("Need more than one value of lambda for cv.2part")
    }
    n   <- nrow(x)
    n_s <- nrow(x_s)
    if (is.null(weights))
    {
        weights <- rep(1, n)
    }
    if (is.null(weights_s))
    {
        weights_s <- rep(1, n_s)
    }

    weights   <- as.double(weights)
    weights_s <- as.double(weights_s)
    s <- drop(s)
    z <- drop(z)
    hd2part.call <- match.call(expand.dots = TRUE)
    which <- match(c("type.measure", "nfolds", "foldid", "grouped", "keep"), names(hd2part.call), FALSE)
    if (any(which))
    {
        hd2part.call = hd2part.call[-which]
    }
    hd2part.call[[1]] = as.name("hd2part")
    hd2part.object = hd2part(x = x, z = z,
                             x_s = x_s, s = s,
                             weights = weights, weights_s = weights_s, #offset = offset,
                           lambda = lambda, ...)
    hd2part.object$call <- hd2part.call
    subclass <- class(hd2part.object)[[1]]
    #type.measure=cvtype(type.measure,subclass)
    is.offset   <- hd2part.object$offset
    is.offset.s <- hd2part.object$offset_s
    ###Next line is commented out so each call generates its own lambda sequence
    # lambda=glmnet.object$lambda
    nz   <- sapply(predict(hd2part.object, type = "nonzero", model = "zero"),     length)
    nz_s <- sapply(predict(hd2part.object, type = "nonzero", model = "positive"), length)
    if (is.null(foldid))
    {
        ## make sure sampling is stratified within the positive part of the population
        foldid_s     <- sample(rep(seq(nfolds), length = n_s))
        foldid_not_s <- sample(rep(seq(nfolds), length = n - n_s))

        foldid <- c(foldid_s, foldid_not_s)
    } else
    {
        nfolds <- max(foldid)
    }
    if (nfolds < 2)
    {
        stop("nfolds must be bigger than 2; nfolds=10 recommended")
    }
    outlist <- as.list(seq(nfolds))
    if (parallel)
    {
        outlist = foreach(i = seq(nfolds), .packages = c("personalized2part")) %dopar%
        {
            which   <- foldid == i
            which_s <- foldid_s == i #which[1:n_s]

            if (length(dim(z))>1)
            {
                z_sub <- z[!which, ]
            } else
            {
                z_sub <- z[!which]
            }

            if (length(dim(s))>1)
            {
                s_sub <- s[!which_s, ]
            } else
            {
                s_sub <- s[!which_s]
            }

            if (is.offset)
            {
                offset_sub <- as.matrix(offset)[!which, ]
            } else
            {
                offset_sub <- NULL
            }

            if (is.offset.s)
            {
                offset_sub_s <- as.matrix(offset_s)[!which_s, ]
            } else
            {
                offset_sub_s <- NULL
            }

            hd2part(x[!which, , drop = FALSE], z_sub,
                    x_s[!which_s, , drop = FALSE], s_sub,
                    lambda    = lambda,
                    offset    = offset_sub,
                    offset_s  = offset_sub_s,
                    weights   = weights[!which],
                    weights_s = weights_s[!which_s],
                   ...)
        }
    } else
    {
        for (i in seq(nfolds))
        {
            which   <- foldid == i
            which_s <- foldid_s == i #which[1:n_s]

            if (length(dim(z))>1)
            {
                z_sub <- z[!which, ]
            } else
            {
                z_sub <- z[!which]
            }

            if (length(dim(s))>1)
            {
                s_sub <- s[!which_s, ]
            } else
            {
                s_sub <- s[!which_s]
            }

            if (is.offset)
            {
                offset_sub <- as.matrix(offset)[!which, ]
            } else
            {
                offset_sub <- NULL
            }

            if (is.offset.s)
            {
                offset_sub_s <- as.matrix(offset_s)[!which_s, ]
            } else
            {
                offset_sub_s <- NULL
            }

            outlist[[i]] <- hd2part(x[!which, , drop = FALSE], z_sub,
                                    x_s[!which_s, , drop = FALSE], s_sub,
                                    lambda    = lambda,
                                    offset    = offset_sub,
                                    offset_s  = offset_sub_s,
                                    weights   = weights[!which],
                                    weights_s = weights_s[!which_s],
                                    ...)
        }
    }

    #fun <- paste("cv", subclass, sep = ".")
    lambda <- hd2part.object$lambda

    cv_res_zero <- cv_function(outlist, lambda, x, z,
                               weights, offset, foldid, "auc",
                               "zero",
                               grouped, keep)

    tm_pos <- type.measure
    if (type.measure == "sep-auc-mse")
    {
        tm_pos <- "mse"
    } else if (type.measure == "sep-auc-mae")
    {
        tm_pos <- "mae"
    }

    cv_res_pos <- cv_function(outlist, lambda, x_s, s,
                              weights_s, offset_s, foldid[1:NROW(x_s)], tm_pos,
                              "positive",
                              grouped, keep)



    cvm_z  <- cv_res_zero$cvm
    cvsd_z <- cv_res_zero$cvsd
    nas    <- is.na(cvsd_z)
    if(any(nas))
    {
        lambda_z <- lambda[!nas]
        cvm_z    <- cvm_z[!nas]
        cvsd_z   <- cvsd_z[!nas]
        nz_z     <- nz[!nas]
    }

    cvm_s  <- cv_res_pos$cvm
    cvsd_s <- cv_res_pos$cvsd
    nas    <- is.na(cvsd_s)
    if(any(nas))
    {
        lambda_s <- lambda[!nas]
        cvm_s    <- cvm_s[!nas]
        cvsd_s   <- cvsd_s[!nas]
        nz_s     <- nz_s[!nas]
    }

    lamin_z <- getmin(lambda,-cvm_z, cvsd_z)  ## negative due to AUC
    lamin_s <- getmin(lambda, cvm_s, cvsd_s)

    names(lamin_z) <- c("lambda.min.z", "lambda.1se.z")
    names(lamin_s) <- c("lambda.min.s", "lambda.1se.s")

    if (type.measure %in% c("mae", "mse"))
    {
        cv_res <- cv_function_all(outlist, lambda,
                                  x,   z,
                                  x_s, s,
                                  weights, weights_s,
                                  offset, offset_s,
                                  foldid, type.measure,
                                  grouped, keep = keep)

        cv_res_sep <- cv_function_all(outlist, lambda,
                                      x,   z,
                                      x_s, s,
                                      weights, weights_s,
                                      offset, offset_s,
                                      foldid, type.measure,
                                      grouped, keep = keep,
                                      lambda_s = lamin_s$lambda.min.s)
    } else
    {
        cv_res <- cv_function_all(outlist, lambda,
                                  x,   z,
                                  x_s, s,
                                  weights, weights_s,
                                  offset, offset_s,
                                  foldid, "mae",
                                  grouped, keep = keep)

        cv_res_sep <- cv_function_all(outlist, lambda,
                                      x,   z,
                                      x_s, s,
                                      weights, weights_s,
                                      offset, offset_s,
                                      foldid, "mae",
                                      grouped, keep = keep,
                                      lambda_s = lamin_s$lambda.min.s)
    }


    cvm  <- cv_res$cvm
    cvsd <- cv_res$cvsd
    nas  <- is.na(cvsd)
    if(any(nas))
    {
        lambda <- lambda[!nas]
        cvm    <- cvm[!nas]
        cvsd   <- cvsd[!nas]
        nz     <- nz[!nas]
    }



    cvm_sep  <- cv_res_sep$cvm
    cvsd_sep <- cv_res_sep$cvsd
    nas  <- is.na(cvsd_sep)
    if(any(nas))
    {
        lambda_sep <- lambda[!nas]
        cvm_sep    <- cvm_sep[!nas]
        cvsd_sep   <- cvsd_sep[!nas]
    }

    lamin_sep <- getmin(lambda_sep, cvm_sep, cvsd_sep)

    names(lamin_sep) <- c("lambda.min.z.sep", "lambda.1se.z")

    lamin_sep <- c(lamin_sep, list(lambda.min.s.sep = lamin_s$lambda.min.s))


    cvname <- type.measure

    out = list(lambda   = lambda,
               cvm      = cvm,
               cvsd     = cvsd,
               cvup     = cvm + cvsd,
               cvlo     = cvm - cvsd,
               cvm_sep  = cvm_sep,
               cvsd_sep = cvsd_sep,
               cvup_sep = cvm_sep + cvsd_sep,
               cvlo_sep = cvm_sep - cvsd_sep,
               cvm_z    = cvm_z,
               cvsd_z   = cvsd_z,
               cvup_z   = cvm_z + cvsd_z,
               cvlo_z   = cvm_z - cvsd_z,
               cvm_s    = cvm_s,
               cvsd_s   = cvsd_s,
               cvup_s   = cvm_s + cvsd_s,
               cvlo_s   = cvm_s - cvsd_s,
               nonzero_z  = nz_z,
               nonzero_s  = nz_s,
               name     = cvname,
               hd2part.fit = hd2part.object)
    if (keep)
        out = c(out, list(fit.preval = cv_res$fit.preval, foldid = foldid))



    lamin   <- getmin(lambda, cvm, cvsd)

    obj = c(out, as.list(lamin),
                 as.list(lamin_sep),
                 as.list(lamin_z),
                 as.list(lamin_s))
    class(obj) = "cv.hd2part"
    obj
}


## from glmnet
cv_function <- function (outlist, lambda, x, y,
                         weights, offset, foldid, type.measure,
                         model = c("zero", "positive"),
                         grouped, keep = FALSE)
{
    prob_min = 1e-05
    prob_max = 1 - prob_min
    model <- match.arg(model)
    nc = dim(y)
    if (is.null(nc))
    {
        if (model == "zero")
        {
            y = as.factor(y)
            ntab = table(y)
            nc = as.integer(length(ntab))
            y = diag(nc)[as.numeric(y), ]
        }
    }
    N = NROW(y)
    nfolds = max(foldid)
    if ((N/nfolds < 10) && type.measure == "auc")
    {
        warning("Too few (< 10) observations per fold for type.measure='auc' in cv.lognet; changed to type.measure='deviance'. Alternatively, use smaller value for nfolds",
                call. = FALSE)
        type.measure = "deviance"
    }
    if ((N/nfolds < 3) && grouped) {
        warning("Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold",
                call. = FALSE)
        grouped = FALSE
    }
    if (!is.null(offset))
    {
        is.offset = TRUE
        offset = drop(offset)
    }
    else is.offset = FALSE
    mlami=max(sapply(outlist,function(obj)min(obj$lambda)))
    which_lam=lambda >= mlami

    predmat = matrix(NA, NROW(y), length(lambda))
    nlams = double(nfolds)
    for (i in seq(nfolds))
    {
        which = foldid == i
        fitobj = outlist[[i]]
        if (is.offset)
        {
            off_sub = offset[which]
        } else
        {
            off_sub <- NULL
        }
        preds = predict(fitobj,x[which, , drop = FALSE],
                        s=lambda[which_lam],
                        newoffset = off_sub,
                        model = model,
                        type = "model_response")
        nlami = sum(which_lam)
        predmat[which, seq(nlami)] = preds
        nlams[i] = nlami
    }
    if (type.measure == "auc")
    {
        cvraw = matrix(NA, nfolds, length(lambda))
        good = matrix(0, nfolds, length(lambda))
        for (i in seq(nfolds))
        {
            good[i, seq(nlams[i])] = 1
            which = foldid == i
            for (j in seq(nlams[i]))
            {
                cvraw[i, j] = auc.mat(y[which, ], predmat[which, j], weights[which])
            }
        }
        N = apply(good, 2, sum)
        weights = tapply(weights, foldid, sum)
    } else
    {
        if (model == "zero")
        {
            ywt = apply(y, 1, sum)
            y = y/ywt
            weights = weights * ywt
        }

        N = NROW(y) - apply(is.na(predmat), 2, sum)
        if (model == "zero")
        {
            cvraw = switch(type.measure,
                           mse = (y[, 1] - (1 - predmat))^2 +
                               (y[, 2] - predmat)^2,
                           mae = abs(y[, 1] - (1 - predmat)) +
                               abs(y[, 2] - predmat),
                           deviance = {
                               predmat = pmin(pmax(predmat, prob_min), prob_max)
                               lp = y[, 1] * log(1 - predmat) + y[, 2] * log(predmat)
                               ly = log(y)
                               ly[y == 0] = 0
                               ly = drop((y * ly) %*% c(1, 1))
                               2 * (ly - lp)
                           },
                           class = y[, 1] * (predmat > 0.5) + y[, 2] * (predmat <=0.5)
            )
        } else
        {
            cvraw = switch(type.measure,
                           mse = (y - predmat)^2,
                           deviance = (y - predmat)^2,
                           mae = abs(y - predmat))
        }
        if (grouped)
        {
            cvob = cvcompute(cvraw, weights, foldid, nlams)
            cvraw = cvob$cvraw
            weights = cvob$weights
            N = cvob$N
        }
    }
    cvm  = apply(cvraw, 2, weighted.mean, w = weights, na.rm = TRUE)
    cvsd = sqrt(apply(scale(cvraw, cvm, FALSE)^2, 2, weighted.mean,
                      w = weights, na.rm = TRUE)/(N - 1))
    out  = list(cvm = cvm,
                cvsd = cvsd,
                type.measure = type.measure)
    if (keep)
    {
        out$fit.preval = predmat
    }
    out
}


cv_function_all <- function (outlist, lambda,
                             x,   z,
                             x_s, s,
                             weights, weights_s,
                             offset, offset_s,
                             foldid, type.measure,
                             grouped, keep = FALSE,
                             lambda_s = NULL)
{
    prob_min = 1e-05
    prob_max = 1 - prob_min
    nc = dim(z)
    if (is.null(nc))
    {
        z = as.factor(z)
        ntab = table(z)
        nc = as.integer(length(ntab))
        z = diag(nc)[as.numeric(z), ]
    }
    N   <- NROW(z)
    N_s <- NROW(s)
    nfolds = max(foldid)
    if ((N_s/nfolds < 3) && grouped)
    {
        warning("Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold",
                call. = FALSE)
        grouped = FALSE
    }
    if (!is.null(offset))
    {
        is.offset = TRUE
        offset = drop(offset)
    } else
    {
        is.offset = FALSE
    }
    if (!is.null(offset_s))
    {
        is.offset.s = TRUE
        offset_s = drop(offset_s)
    } else
    {
        is.offset.s = FALSE
    }
    mlami     = max(sapply(outlist,function(obj)min(obj$lambda)))
    which_lam = lambda >= mlami

    predmat = matrix(NA, NROW(z), length(lambda))
    nlams   = double(nfolds)
    for (i in seq(nfolds))
    {
        which   <- foldid == i
        which_s <- which[1:N_s]
        fitobj  <- outlist[[i]]
        if (is.offset)
        {
            off_sub <- offset[which]
        } else
        {
            off_sub <- NULL
        }
        if (is.offset.s)
        {
            off_sub_s <- offset_s[which_s]
        } else
        {
            off_sub_s <- NULL
        }
        preds_z = predict(fitobj,
                          x[which, , drop = FALSE],
                          s=lambda[which_lam],
                          newoffset = off_sub,
                          model = "zero",
                          type = "model_response")

        if (is.null(lambda_s))
        {
            preds_s = predict(fitobj,
                              x[which, , drop = FALSE],
                              s=lambda[which_lam],
                              #newoffset = off_sub,
                              model = "positive",
                              type = "model_response")

            preds <- preds_z * preds_s
        } else
        {
            #lambda_s[which_lam_s]
            preds_s = predict(fitobj,
                              x[which, , drop = FALSE],
                              s=lambda_s[1],
                              #newoffset = off_sub,
                              model = "positive",
                              type = "model_response")
            preds <- preds_z * array(drop(preds_s), dim = dim(preds_z))
        }

        nlami <- sum(which_lam)
        predmat[which, seq(nlami)] = preds
        nlams[i] = nlami
    }




    zwt = apply(z, 1, sum)
    z   = z/zwt
    #weights = weights * ywt


    y <- ifelse(z[,2] == 0, 0, s)

    N = NROW(y) - apply(is.na(predmat), 2, sum)
    cvraw = switch(type.measure,
                   mse = (y - predmat)^2,
                   deviance = (y - predmat)^2,
                   mae = abs(y - predmat))
    if (grouped)
    {
        cvob = cvcompute(cvraw, weights, foldid, nlams)
        cvraw = cvob$cvraw
        weights = cvob$weights
        N = cvob$N
    }
    cvm  = apply(cvraw, 2, weighted.mean, w = weights, na.rm = TRUE)
    cvsd = sqrt(apply(scale(cvraw, cvm, FALSE)^2, 2, weighted.mean,
                      w = weights, na.rm = TRUE)/(N - 1))
    out  = list(cvm = cvm,
                cvsd = cvsd,
                type.measure = type.measure)
    if (keep)
    {
        out$fit.preval = predmat
    }
    out
}



## from glmnet
cvcompute=function(mat, weights, foldid, nlams)
{
    ###Computes the weighted mean and SD within folds, and hence the se of the mean
    wisum=tapply(weights,foldid,sum)
    nfolds=max(foldid)
    outmat=matrix(NA,nfolds,ncol(mat))
    good=matrix(0,nfolds,ncol(mat))
    mat[is.infinite(mat)]=NA#just in case some infinities crept in
    for(i in seq(nfolds)){
        mati=mat[foldid==i,,drop=FALSE]
        wi=weights[foldid==i]
        outmat[i,]=apply(mati,2,weighted.mean,w=wi,na.rm=TRUE)
        good[i,seq(nlams[i])]=1
    }
    N=apply(good,2,sum)
    list(cvraw=outmat,weights=wisum,N=N)
}

## from glmnet
getmin=function(lambda,cvm,cvsd){
    cvmin=min(cvm,na.rm=TRUE)
    idmin=cvm<=cvmin
    lambda.min=max(lambda[idmin],na.rm=TRUE)
    idmin=match(lambda.min,lambda)
    semin=(cvm+cvsd)[idmin]
    idmin=cvm<=semin
    lambda.1se=max(lambda[idmin],na.rm=TRUE)
    list(lambda.min=lambda.min,lambda.1se=lambda.1se)
}

Try the personalized2part package in your browser

Any scripts or data that you put into this service are public.

personalized2part documentation built on Jan. 13, 2021, 7:51 a.m.