R/main.R

Defines functions simulate_GLM_dataset mfx.misclassGLM df.residual.misclassGLM vcov.misclassGLM print.summary.misclassGLM summary.misclassGLM estimateRobustSE boot.misclassGLM predict.misclassGLM misclassGLM

Documented in boot.misclassGLM mfx.misclassGLM misclassGLM predict.misclassGLM simulate_GLM_dataset

#' GLM estimation under misclassified covariate
#'
#' \code{misclassGLM} computes estimator for a GLM with a misclassified covariate
#' using additional side information on the misclassification process
#'
#' @param Y a vector of integers or numerics. This is the dependent variable.
#' @param X a matrix containing the independent variables.
#' @param P probabilities corresponding to each of the potential pattern conditional on the other
#'        covariates denoted in x.
#' @param na.action how to treat NAs
#' @param family a description of the error distribution and link function to be used in the model.
#'        This can be a character string naming a family function, a family function or the result
#'        of a call to a family function. (See \code{\link{family}} for details of family functions.)
#' @param control options for the optimization procedure (see \code{\link{optim}},
#'        \code{\link{ucminf}} for options and details).
#' @param par (optional) starting parameter vector
#' @param setM (optional) matrix, rows containing potential patterns for a misclassified (latent) covariate M
#'        in any coding for a categorical independent variable, e.g. dummy coding (default: Identity).
#' @param x logical, add covariates matrix to result?
#' @param robust logical, if true the computed asymptotic standard errors are replaced by their
#'        robust counterparts.
#' @examples
#' ## simulate data
#' \donttest{
#' data <- simulate_GLM_dataset()
#' }
#'
#' ## estimate model without misclassification error
#' \donttest{
#' summary(lm(Y ~ X + M2, data))
#' }
#'
#' ## estimate model with misclassification error
#' \donttest{
#' summary(lm(Y ~ X + M, data))
#' }
#'
#' ## estimate misclassification probabilities
#' \donttest{
#' Pmodel <- glm(M2 ~ M + X, data = data, family = binomial("logit"))
#' summary(Pmodel)
#' }
#'
#' ## construct a-posteriori probabilities from Pmodel
#' \donttest{
#' P <- predict(Pmodel, newdata = data, type = "response")
#' P <- cbind(1 - P, P)
#' dimnames(P)[[2]] <- c("M0", "M1") ## speaking names
#' }
#'
#' ## estimate misclassGLM
#' \donttest{
#' est <- misclassGLM(Y = data$Y,
#'                    X = as.matrix(data[, 2, drop = FALSE]),
#'                    setM = matrix(c(0, 1), nrow = 2),
#'                    P = P)
#' summary(est)
#' }
#'
#' ## and bootstrapping the results from dataset
#' \dontrun{
#'   summary(boot.misclassGLM(est,
#'                            Y = data$Y,
#'                            X = data.matrix(data[, 2, drop = FALSE]),
#'                            Pmodel = Pmodel,
#'                            PX = data,
#'                            repetitions = 100))
#' }
#'
#' @export
misclassGLM <- function(Y, X, setM, P,
                        na.action = na.omit,
                        family = gaussian(link = "identity"),
                        control = list(),
                        par = NULL,
                        x = FALSE,
                        robust = FALSE) {
  if (!is.matrix(X)) stop("X is no matrix")
  if (!is.matrix(setM)) stop("setM is no matrix")
  if (!is.vector(Y)) stop("Y must be a vector")
  if (length(Y) != nrow(X)) stop("dimensions of Y and X do not match")
  if (!is.matrix(P)) stop("P is no matrix")
  if (nrow(setM) != ncol(P)) stop("dimensions of P and setM do not match")
  if (nrow(P) != nrow(X)) stop("dimensions of P and X do not match")

  g <- NULL

  control <- do.call("make.control", control)

  # log-likelihood definieren

    # Validation
    if (family$family == "gaussian") {
      f <- fgaussValidation
      if (control$method == "BFGS" || control$method == "BFGS2" || control$method == "CG" ||
          control$method == "nlm") {
        g <- ggaussValidation
      }
    } else if (family$family == "binomial") {
      if (family$link == "logit") {
        f <- flogitValidation
        if (control$method == "BFGS" || control$method == "BFGS2" || control$method == "CG" ||
            control$method == "nlm") {
          g <- glogitValidation
        }
      } else if (family$link == "probit") {
        f <- fprobitValidation
        if (control$method == "BFGS" || control$method == "BFGS2" || control$method == "CG" ||
            control$method == "nlm") {
          g <- gprobitValidation
        }
      } else {
        stop("unknown link")
      }
    } else {
      stop("unknown family")
    }

    # starting parameters
    length.par <- ncol(X) + ncol(setM) + 1
    if (is.null(par)) {
      cat("estimate starting parameters...")
      par <- rep.int(0, length.par)
      if (family$family == "gaussian") {
        par <- c(par, 0.5)
        par[1:(ncol(X) + 1)] <- lm.fit(x = cbind(rep.int(1, nrow(X)), X), y = Y)$coefficients
      } else {
        par[1:(ncol(X) + 1)] <- glm.fit(x = cbind(rep.int(1, nrow(X)), X), y = Y,
                                        family = family)$coefficients
      }
    } else {
      if (family$family == "gaussian") length.par <- length.par + 1
      if (length(par) != length.par) stop("starting parameter par is not valid.")
    }

  # optimise
  cat("optimize...")
  erg <- fit.algorithm(par, f, g, Y, X, P, setM, control)
  cat("ok.\n")

  # fertig
  ret <- list()
  ret$family <- family
  ret$setM <- setM
  p <- ncol(X) + ncol(setM) + 1
  ret$coefficients <- erg$par[1:p]
  names(ret$coefficients) <- c("(Intercept)", dimnames(X)[[2]], dimnames(P)[[2]][-1])

  ret$y <- Y
  ret$na.action <- na.action
  ret$df.residual <- nrow(X) - length(par)
  ret$optim <- erg
  if (!is.null(g)) ret$optim$gradient <- g(erg$par, Y, X, P, setM)
  if (control$method == "BFGS2") {
    ret$iter <- erg$info[4]
    ret$optim$call <- "ucminf"
  } else {
    ret$optim$call <- "optim"
    #ret$optim$hessian <- hessian(f, erg$par, Y=Y, X=X, W=P, SetM=setM)
    ret$iter <- erg$counts
  }
  if (x) ret$x <- cbind(rep.int(1, nrow(X)), X, P[, -1, drop = FALSE])

  ret$SEtype <- "standard"
  ret$setM <- setM
  ret$control <- control

  class(ret) <- c("misclassGLM", "glm", "lm")

  ret$fitted.values <- predict(ret, X, P, type = "response")
  ret$prior.weights <- rep_len(1, nrow(X))

  if (robust) ret <- estimateRobustSE(ret, Y, X, P)

  return(ret)
}


#' Predict Method for \code{misclassGLM} Fits
#'
#' Obtains predictions
#'
#' @usage ## S3 method for class 'misclassGLM'
#'        \method{predict}{misclassGLM}(object, X, P = NULL, type = c("link", "response"),
#'                                      na.action = na.pass, ...)
#' @param object a fitted object of class inheriting from 'misclassGLM'.
#' @param X matrix of fixed covariates
#' @param P a-posteriori probabilities for the true values of the misclassified variable.
#'        If provided, the conditional expectation on X,P is computed, otherwise a set of marginal
#'        predictions is provided, one for each alternative.
#' @param type the type of prediction required. The default is on the scale of the linear predictors;
#'        the alternative "response" is on the scale of the response variable.
#'        Thus for a default binomial model the default predictions are of log-odds
#'        (probabilities on logit scale) and type = "response" gives the predicted probabilities.
#'
#'        The value of this argument can be abbreviated.
#' @param na.action function determining what should be done with missing values in \code{newdata}.
#'        The default is to predict NA.
#' @param \dots additional arguments (not used at the moment)
#' @seealso \code{\link{misclassGLM}}
#' @export
predict.misclassGLM <- function(object, X, P = NULL, type = c("link", "response"),
                                na.action = na.pass, ...) {

  type <- match.arg(type)

    ret <- matrix(rep_len(0.0, nrow(object$setM) * nrow(X)), nrow = nrow(X))
    for (m in nrow(object$setM)) {
      ret[, m] <- cbind(rep_len(1, nrow(X)), X,
                        matrix(rep.int(object$setM[m,], nrow(X)), nrow = nrow(X), byrow = TRUE)) %*%
        object$coefficients
      if (type == "response") family(object)$linkinv(ret[, m])
    }
    if (type == "response" & !is.null(P)) ret <- rowSums(ret * P)

  return(ret)
}


#' Compute Bootstrapped Standard Errors for \code{misclassGLM} Fits
#'
#' Obtain bootstrapped standard errors.
#'
#' @param ret a fitted object of class inheriting from 'misclassGLM'.
#' @param Y a vector of integers or numerics. This is the dependent variable.
#' @param X a matrix containing the independent variables.
#' @param Pmodel a fitted model (e.g. of class 'GLM' or 'mlogit') to implicitly produce variations
#'        of the predicted true values probabilities. (Usually conditional on the observed
#'        misclassified values and additional covariates.)
#' @param PX covariates matrix suitable for predicting probabilities from \code{Pmodel},
#'        usually including the mismeasured covariate.
#' @param boot.fraction fraction of sample to be used for estimating the bootstrapped standard
#'        errors, for speedup.
#' @param repetitions number of bootstrap samples to be drown.
#' @seealso \code{\link{misclassGLM}}
#' @export
boot.misclassGLM <- function(ret, Y, X, Pmodel, PX,
                             boot.fraction = 1, repetitions = 1000) {
  if (!("misclassGLM" %in% class(ret))) stop("object is no misclassGLM result")
  family <- ret$family
  if (boot.fraction > 1) stop("boot.fraction must be <1")
  if (boot.fraction <= 0) stop("boot.fraction must be >0")

  if (!is.matrix(X)) stop("X is no matrix")
  if (!is.vector(Y)) stop("Y must be a vector")
  if (length(Y) != nrow(X)) stop("dimensions of Y and X do not match")

  g <- NULL

  # log-likelihood definieren
  if (family$family == "gaussian") {
    f <- cfgaussValidation
    if (ret$control$method == "BFGS" || ret$control$method == "BFGS2" ||
        ret$control$method == "CG" || ret$control$method == "nlm") {
      g <- cggaussValidation
    }
  } else if (family$family == "binomial") {
    if (family$link == "logit") {
      f <- cflogitValidation
      if (ret$control$method == "BFGS" || ret$control$method == "BFGS2" ||
          ret$control$method == "CG" || ret$control$method == "nlm") {
        g <- cglogitValidation
      }
    } else if (family$link == "probit") {
      f <- fprobitValidation
      if (ret$control$method == "BFGS" || ret$control$method == "BFGS2" ||
          ret$control$method == "CG" || ret$control$method == "nlm") {
        g <- gprobitValidation
      }
    } else {
      stop("unknown link")
    }
  } else {
    stop("unknown family")
  }

  return(bootstrapping(ret, Y, X, Pmodel, PX, f, g, boot.fraction, repetitions))
}


estimateRobustSE <- function(object, Y, X, P) {
  if (!inherits(object, "misclassGLM")) stop("object is not of class misclassGLM.")

  if (object$SEtype != "standard") stop("robust errors require usual computation of SE.")

  ok <- try(covmat <- chol2inv(chol(object$optim$hessian)), silent = TRUE)
  if (inherits(ok, "try-error")) {
    print(ok[1])
    return(NULL)
  }

  if (object$family$family == "gaussian") {
    g <- cggaussValidation
  } else if (object$family$family == "binomial") {
    if (object$family$link == "logit") {
      g <- cglogitValidation
    } else if (object$family$link == "probit") {
      g <- gprobitValidation
    } else {
      stop("unknown link")
    }
  } else {
    stop("unknown family")
  }
  Lstrich <- matrix(rep.int(0, length(object$optim$par)^2), ncol = length(object$optim$par))
  for (i in 1:nrow(X)) {
    if (inherits(Y, "matrix")) {
      Lstrich <- Lstrich + tcrossprod(g(object$optim$par, Y[i,, drop = FALSE], X[i,, drop = FALSE],
                                        P[i,, drop = FALSE], object$setM))
    } else {
      Lstrich <- Lstrich + tcrossprod(g(object$optim$par, Y[i], X[i,, drop = FALSE],
                                        P[i,, drop = FALSE], object$setM))
    }
  }
  object$CoVar <- covmat %*% Lstrich %*% covmat

  return(object)
}

#' @export
summary.misclassGLM <- function(object, ...) {
  coef.p <- object$coefficients
  p <- length(coef.p)

  if (is.null(object$CoVar)) {
    ok <- try(covmat <- chol2inv(chol(object$optim$hessian)), silent = TRUE)
    if (inherits(ok, "try-error")) {
      print(ok[1])
      return(NULL)
    }
  } else {
    covmat <- object$CoVar
  }
  var.cf <- diag(covmat[1:p, 1:p])
  s.err <- sqrt(var.cf)
  tvalue <- coef.p / s.err
  if (object$SEtype == "standard") {
    df.r <- object$df.residual
    pvalue <- 2 * pt(-abs(tvalue), df.r)
  } else if (object$SEtype == "robust") {
    df.r <- object$df.residual
    pvalue <- 2 * pt(-abs(tvalue), df.r)
  } else if (object$SEtype == "bootstrap") {
    pvalue <- rep_len(0.0, p)
    for (j in 1:p) {
      tmp <- object$boot.beta[, j]
      pvalue[j] <- 2 * min(length(which(tmp <= 0)),
                           length(which(tmp >= 0))) / (nrow(object$boot.beta))
    }
  }
  coef.table <- cbind(coef.p, s.err, tvalue, pvalue)
  dimnames(coef.table) <- list(names(coef.p), c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))

  keep <- match(c(#"call",
          "family", #"deviance", "aic",
          #"contrasts",
          "df.residual", #"null.deviance", "df.null",
          "iter", "na.action", "SEtype"), names(object), 0L)
  ret <- c(object[keep],
      list(deviance.resid = residuals(object, type = "deviance"),
          coefficients = coef.table, #aliased = aliased,
          #dispersion = dispersion, df = c(object$rank, df.r, df.f),
          #cov.unscaled = covmat.unscaled,
          cov.scaled = covmat
          ))

  class(ret) <- c("summary.misclassGLM", "summary.glm")
  return(ret)
}

#' @export
print.summary.misclassGLM <- function(x, digits = max(3L, getOption("digits") - 3L), ...) {
  class(x) <- setdiff(class(x), "summary.misclassGLM")
  print(x = x, digits = digits, ...)

  if (x$SEtype == "robust") {
    cat("Results show robust standard errors.\n")
  } else if (x$SEtype == "bootstrap") {
    cat("Results based on ")
    cat(nrow(x$boot.beta))
    cat(" bootstrap samples.\n\n")
  }
}

#' @export
vcov.misclassGLM <- function(object, ...) {
  summary.misclassGLM(object)$cov.scaled
}

#' @export
df.residual.misclassGLM <- function(object, ...) {
  object$df.residuals
}

#' Compute Marginal Effects for \code{misclassGLM} Fits
#'
#' Obtain marginal Effects.
#'
#' @param w a fitted object of class inheriting from 'misclassGLM'.
#' @param x.mean logical, if true computes marginal effects at mean, otherwise average marginal
#'               effects.
#' @param rev.dum logical, if true, computes differential effects for switch from 0 to 1.
#' @param digits number of digits to be presented in output.
#' @param \dots further arguments passed to or from other functions.
#' @seealso \code{\link{misclassGLM}}
#' @export
mfx.misclassGLM <- function(w, x.mean = TRUE, rev.dum = TRUE, digits = 3, ...) {
    if (!("glm" %in% class(w))) {
      stop("Please provide an object from 'glm()' or 'mlogit()'.\n")
    }
    if ("glm" %in% class(w)) link <- w$family$link
    if (link != "probit" & link != "logit") {
      stop("Need a binary probit, a logit or mlogit model'.")}
    if (is.null(dim(w$x))) {
      stop("Please specify 'x = TRUE' in glm().\n")}

    x.bar <- as.matrix(colMeans(w$x))
    b.est <- as.matrix(coef(w))
    K <- nrow(x.bar)

    xb <- t(x.bar) %*% b.est

    if (!x.mean) {
      xb2 <- as.matrix(w$x) %*% b.est
      if (link == "probit") f.xb <- mean(dnorm(xb2))
      if (link == "logit")  f.xb <- mean(dlogis(xb2))
    }
    if (link == "probit") f.xb <- dnorm(xb)
    if (link == "logit" ) f.xb <- dlogis(xb)
    me <- f.xb * coef(w)

    bx <- b.est %*% t(x.bar)

    if (link == "probit") {
      dr <- diag(1, K, K) - as.numeric(xb) * bx
      va <- as.numeric(f.xb)^2 * dr %*% vcov(w) %*% t(dr)
    } else if (link == "logit") {
      pg <- as.numeric(plogis(xb))
      dr <- diag(1, K, K) + (1 - 2 * pg) * bx
      va <- (pg * (1 - pg))^2 * dr %*% vcov(w) %*% t(dr)
    }
    se <- sqrt(diag(va))

    if (rev.dum) {
      for (i in 1:ncol(w$x)) {
        if (identical(sort(unique(w$x[,i])), c(0, 1)) ||
            ("misclassGLM" %in% class(w) && i >= ncol(w$x) - dim(w$setM)[2]) ) {
          x.d1 <- x.bar; x.d1[i, 1] <- 1
          x.d0 <- x.bar; x.d0[i, 1] <- 0
          if (link == "probit") {
            me[i] <- pnorm(t(x.d1) %*% b.est) -
              pnorm(t(x.d0) %*% b.est)
            dr2 <- dnorm(t(x.d1) %*% b.est) %*%  t(x.d1) -
              dnorm(t(x.d0) %*% b.est) %*%  t(x.d0)
            va2 <- dr2 %*% vcov(w) %*% t(dr2)
          } else if (link == "logit") {
            me[i] <- plogis(t(x.d1) %*% b.est) -
              plogis(t(x.d0) %*% b.est)
            dr2 <- dlogis(t(x.d1) %*% b.est) %*%  t(x.d1) -
              dlogis(t(x.d0) %*% b.est) %*%  t(x.d0)
            va2 <- dr2 %*% vcov(w) %*% t(dr2)
          }
          se[i] <- sqrt(as.numeric(va2))
        }
      }
    }
    out <- data.frame(effect = me, error = se)
    out$t.value <- out$effect / out$error
    out$p.value <- 2 * (1 - pt(abs(out[, 3]), w$df.residual))
    out <- round(out, digits = digits)
    result <- list(link = link, f.xb = f.xb, w = w, out = out)
    class(result) <- "mfx"
    return(result)

}


#' Simulate a Data Set to Use With \code{misclassGLM}
#'
#' simulates a data set with
#' - one continuous variable X drawn from a Gaussian distribution,
#' - a binary or trinary variable M with misclassification (M2)
#' - a dependent variable either with added Gaussian noise or drawn from a logit distribution
#'
#' This can be used to demonstrate the abilities of \code{\link{misclassGLM}}. For an example
#' see \code{\link{misclassGLM}}.
#'
#' @param n number observations
#' @param const constant
#' @param alpha parameter for X
#' @param beta parameter for M(1)
#' @param beta2 parameter for M2, if NULL, M is a binary covariate, otherwise a three-valued
#'              categorical
#' @param logit logical, if true logit regression, otherwise Gaussian regression
#' @seealso \code{\link{misclassGLM}}
#' @export
simulate_GLM_dataset <- function(n = 50000, const = 0, alpha = 1, beta = -2, beta2 = NULL,
                                 logit = FALSE) {
  set.seed(30)

  X <- rnorm(n, 0, 2)

  if (is.null(beta2)) {
    Mmisclass <- rbinom(n, 1, 0.5)

    M <- rep.int(0, n)
    for (i in 1:n) {
      temp <- exp(X[i] + Mmisclass[i])
      M[i] <- rbinom(1, 1, temp / (1 + temp))
    }

    Y <- (alpha * X) + beta * M + const
  } else {
    M <- matrix(rep.int(0, 2 * n), ncol = 2)
    for (i in 1:n) {
      tmp1 <- exp(Mmisclass[i, 1] - Mmisclass[i, 2] - (1 - Mmisclass[i, 1]) * (1 - Mmisclass[i, 2]))
      tmp2 <- exp(Mmisclass[i, 2] - Mmisclass[i, 1] - (1 - Mmisclass[i, 1]) * (1 - Mmisclass[i, 2]))
      s <- tmp1 + tmp2 + 1
      M[i,] <- rmultinom(1, 1, c(tmp1, tmp2, 1) / s)[-3]
    }

    Y <- alpha * X + beta * M[, 1] + beta2 * M[, 2] + const
  }

  if (logit) {
    Y <- exp(Y)
    for (i in 1:n) {
      Y[i] <- rbinom(1, 1, Y[i] / (1 + Y[i]))
    }
  } else {
    E <- rnorm(n, 0, 0.5)
    Y <- Y + E
  }

  return(data.frame(Y = Y, X = X, M = Mmisclass, M2 = M))
}

Try the misclassGLM package in your browser

Any scripts or data that you put into this service are public.

misclassGLM documentation built on Nov. 19, 2023, 9:06 a.m.