R/genLogi.R
In logisticDx: Diagnostic tests for logistic regression models

Documented in genLogiDf genLogiDt

##' @name genLogi
##' @rdname genLogi
##' @title Generate data for logistic regression
##' @description
##' Generates a \code{data.frame} or \code{data.table}
##' with a binary outcome, and a logistic model to
##' describe it.
##'
##' @param b \dfn{binomial predictors}, the number of predictors which are
##' binary, i.e. limited to \eqn{0} or \eqn{1}
##' @param f \dfn{factors}, the number of predictors which are factors
##' @param c \dfn{continuous predictors}, the number of predictors which are
##' continuous
##' @param n number of observations in the data frame
##' @param nlf the no. of levels in a factor
##' @param pb \dfn{probability for binomnial predictors}:
##' the probability of binomial predictors being \eqn{=1}
##' e.g. if \code{pb=0.3}, \eqn{30\%} will be \eqn{1}s, \eqn{70\%} will be \eqn{0}s
##' @param rc \dfn{ratio for continuous variables} the ratio of levels of
##' continuous variables to the total number of observations \dfn{n} e.g. if
##' \code{rc=0.8} and \code{n=100}, it will be in the range 1-80
##' @param py \dfn{ratio for y} the ratio of 1s to total observations for the
##' binomial predictors e.g. if \code{ry=0.5},
##' 50\% will be \eqn{1}s, \eqn{50\%} will be \eqn{0}s
##' @param asFactor If \code{asFactor=TRUE} (the default),
##' predictors given as \code{factor}s
##' will be converted to \code{factor}s in the data frame before the model
##' is fit
##' @param model If \code{model=TRUE} will also return a model fitted with
##' \code{stats::glm} or \code{speedglm::speedglm}
##' @param timelim function will timeout after \code{timelim} secs.
##' This is present to prevent duplication of rows.
##' @param speedglm If \code{speedglm=TRUE}, return a model fitted with \code{speedglm}
##' instead of \code{glm}
##' @return If \code{model=TRUE}: a list with the following values:
##'  \item{df or dt}{A \code{data.frame} (for \code{genLogiDf}) or \code{data.table}
##' (for \code{genLogiDt}).
##' \cr
##' Predictors are labelled \eqn{x1, x2, ..., xn}.
##' \cr
##' Outcome is \eqn{y}.
##' \cr
##' Rows represent to \eqn{n} observations}
##'  \item{model}{A model fit with \code{stats::glm} or \code{speedglm::speedglm}}
##' If \code{model=FALSE} a \code{data.frame} or \code{data.table} as above.
##'
##' @note  \code{genLogiDt} is faster and more efficient for larger datasets.
##' \cr \cr
##' Using \code{asFactor=TRUE} with factors which have a large number of
##' levels (e.g. \code{nlf >30}) on large datasets (e.g. \eqn{n >1000}) can cause
##' fitting to be excessively slow.
##' @keywords datagen
NULL
##' @rdname genLogi
##' @export genLogiDf
##' @aliases genLogiDf
##' @examples
##' set.seed(1)
##' genLogiDf(speedglm=TRUE)
##'
genLogiDf <- function(b=2L, f=2L, c=1L, n=20L,
                      nlf=3L,
                      pb=0.5, rc=0.8, py=0.5,
                      asFactor=TRUE, model=TRUE, timelim=5, speedglm=FALSE) {
    if ( (pb|rc|py) > 1 ) stop("Ratios should be <1")
    if (nlf <= 2) stop("Factors should have at least 3 levels")
    sbcf1 <- sum(c(b, f, c))
    if (sbcf1 <= 0) stop("Need at least one predictor")
    if (sbcf1 >= n) stop("Need n to be larger for this no. predictors")
### prevent taking more than (timelimit)
    setTimeLimit(elapsed=timelim, transient=TRUE)
### define frame to hold values
    df1 <- as.data.frame(matrix(0L, ncol=(sbcf1 + 1L), nrow=n))
    xnam1 <- paste("x", 1:(ncol(df1)-1), sep="")
    colnames(df1) <- c(xnam1, "y")
### repeat until no dupicated columns
    repeat{
        if(!b==0) df1[, 1L:b] <- sample(x=c(0L, 1L), size=b*n, replace=TRUE, prob=c(pb, 1-pb))
        if(!f==0) {
            if (asFactor) {
                df1[, (b+1L):(b+f)] <- as.factor(sample(x=seq.int(1L, nlf), size=f*n, replace=TRUE))
             } else {
                 df1[, (b+1L):(b+f)] <- sample(x=seq.int(1L, nlf), size=f*n, replace=TRUE)
             }
        }
        if(!c==0) df1[, (b+f+1L):(b+f+c)] <- sample(x=seq.int(1, (rc*n)), size=c*n,replace=TRUE)
        df1[, ncol(df1)] <- sample(x=c(0L, 1L), size=n, replace=TRUE, prob=c(py, 1-py))
### check no duplicate columns (prevent overfitting)
        if (!all(duplicated(df1))){break}
    }
###
    if(!model) return(df1)
### generate formula
    fmla <- as.formula(paste("y ~ ", paste(xnam1, collapse= "+")  ))
### use speeedglm?
    if(speedglm){
        f1 <- speedglm::speedglm(formula=fmla, family=binomial(), data=df1)
    } else {
        f1 <- stats::glm(formula=fmla, family=binomial("logit"), data=df1)
    }
    res <- list(df=df1,
                model=f1)
    return(res)
}
###
###----------------------------------------
###
##' @rdname genLogi
##' @export genLogiDt
##' @aliases genLogiDt
##' @examples
##' genLogiDt(b=0, c=2, n=100, rc=0.7, model=FALSE)
##'
genLogiDt <- function(b=2L, f=2L, c=1L, n=20L,
                      nlf=3L,
                      pb=0.5, rc=0.8, py=0.5,
                      asFactor=TRUE, model=TRUE, timelim=5, speedglm=FALSE) {
    if ( (pb|rc|py) > 1 ) stop("Ratios should be <1")
    if (nlf <= 2) stop("Factors should have at least 3 levels")
    sbcf1 <- sum(c(b, f, c))
    if (sbcf1 <= 0) stop("Need at least one predictor")
    if (sbcf1 >= n) stop("Need n to be larger for this no. predictors")
### prevent taking more than (timelimit)
    setTimeLimit(elapsed=timelim, transient=TRUE)
### define frame to hold values
    dt1 <- data.table::data.table(matrix(0, ncol=(sbcf1+1L), nrow=n))
    xnam1 <- paste("x", 1:(ncol(dt1)-1), sep="")
    data.table::setnames(dt1, c(xnam1, "y"))
    repeat{
        if(!b==0){
            for (i in 1L:b){
                data.table::set(dt1, j=i, value=sample(x=c(0L, 1L),
                                          size=n,
                                          replace=TRUE,
                                          prob=c(pb, 1-pb)
                                          )
                                )
            }
        }
        if(!f==0){
            for (i in (b+1L) : (b+f)){
                if (asFactor){
                    data.table::set(dt1, j=i, value=as.factor(sample(x=seq.int(1L, nlf),
                                              size=n,
                                              replace=TRUE))
                                    )
                } else {
                    data.table::set(dt1, j=i, value=sample(x=seq.int(1L, nlf),
                                              size=n,
                                              replace=TRUE)
                                    )
                }
            }
        }
        if(!c==0){
            for (i in (b+f+1L) : (b+f+c)){
                data.table::set(dt1, j=i, value=sample(x=seq.int(1, (rc*n)),
                                          size=n,
                                          replace=TRUE)
                                )
            }
        }
        data.table::set(dt1, j=as.integer(sbcf1+1L), value= sample(x=c(0L, 1L),
                                         size=n,
                                         replace=TRUE,
                                         prob=c(py, 1-py)
                                         ))
### check no duplicate columns (prevent overfitting)
        if (!all(duplicated(dt1))){break}
    }
###
    if(!model) return(dt1)
### generate model
### generate formula
    fmla <- as.formula(paste("y ~ ", paste(xnam1, collapse= "+")  ))
### use speeedglm?
    if(speedglm){
        f1 <- speedglm::speedglm(formula=fmla, family=binomial(), data=dt1)
    } else {
        f1 <- stats::glm(formula=fmla, family=binomial("logit"), data=dt1)
    }
    res <- list(dt=dt1,
                model=f1)
    return(res)
}