R/bayesmlogit.R

Defines functions bayesmlogit.default bayesmlogit

Documented in bayesmlogit

#' @title Multistate Life Table Method
#' @description  A Bayesian Multistate Life Table Method for survey data, developed by Lynch and Zang (2022), allowing for large state spaces with quasi-absorbing states (i.e., structural zeros in a transition matrix). 
#' @details This function came from the deprecated \href{https://github.com/jwindle/BayesLogit}{bayeslogit} package, which conducts Bayesian multinomial logistic regressions using Polya-Gamma latent variables (Polson et al. 2013). It should be jointly used with the mlifetable() function, which will generate life tables based on the estimates from regressions.
#' @param y A vector of state transitions, which can be created either manually or with \code{CreateTrans()}. See more details using \code{?lifedata}.
#' @param X A matrix of covariates. Note that \code{X} must include age as a covariate.
#' @param samp Number of posterior samples. For efficiency purposes, if you need a large sample (e.g., \eqn{\ge}5000), we recommend parallel computing in a cluster.
#' @param burn 'burn-in' period. Default is 500.  
#' @param verbose Progress report. Default is 10, which means this function will report the current progress for every 10 posterior samples.
#' @param thin The thinning strategy to reduce autocorrelation. For example, if \code{thin = 5}, this function will select 1 from every 5 posterior samples and generate a new dataset named \code{outwstepwidth.txt}. Default is 5.
#' @param trace.plot If TRUE, this function will create a new directory under given \code{file_path} and output corresponding trace plots using samples after burn-in.
#' @param file_path The file path for outputs. If a path is specified, the result will also be saved in the given file path. You can find two result files in the specified file: \code{result.txt} and \code{resultwstep.txt}. The former contains all posterior samples generated after burn-in. The latter is sampled from the former one with a specified sampling interval. 
#' @seealso \code{\link{mlifeTable}}, \code{\link{lifedata}}, \code{\link{CreateTrans}}
#' @import grDevices
#' @importFrom stats model.matrix pnorm runif rexp rnorm
#' @export
#' @return  
#' A list that contains two arrays:
#' \itemize{
#'   \item \strong{out}: An array that contains all posterior samples generated.
#'   \item \strong{outwstepwidth}: An array generated by selecting one sample from every \emph{thin} samples in \strong{out}.
#' }
#' The number of columns in both arrays is determined by the number of covariates in X and the number of unique transition status in y. For example, if we have 12 covariates in X and 36 unique transitions in y, our result will contain (12+1)*(36-1)= 455 columns in total. 
#' @examples
#' \donttest{
#' data <- lifedata
#' y <- data[,1]
#' X <- data[,-1]
#' 
#' # This example will take about 30 mins.
#' out <- bayesmlogit(y, X ,samp=1000, burn=500,verbose=10)
#' 
#' 
#' }

bayesmlogit <- function(y, X,
                   file_path=NA,
                   samp=1000, burn=500, verbose=100,
                   thin = 5,
                   trace.plot=FALSE){
  UseMethod("bayesmlogit")
}
#' @export
bayesmlogit.default <- function(y, X,file_path=NA,
                   samp=1000, burn=500, verbose=1000,
                   thin = 5,
                   trace.plot=FALSE)

{
  if (any(is.na(y)) | any(is.na(X)))
    stop("Missing values need to be elimnated before using this function",
         call. = FALSE)
  ##Pre-process
  y.1 <- sort(unique(y))
  y <- match(y,y.1)
  data_f <- as.data.frame(cbind(y,X))
  names(data_f)[1] <- 'y'

  data_f$y <- as.factor(data_f$y)

  J.1 <- nlevels(data_f$y)

  X <- stats::model.matrix(y ~ ., data=data_f)
  col_names <- colnames(X)
  y.all <- stats::model.matrix(~ y - 1, data=data_f)
  y <- y.all[,-J.1]

  ##Construct sub-functions. These codes are all copied from the deprecated bayeslogit package.
  TRUNC = 0.64

  mass.texpon <- function(Z)
  {
    x = TRUNC;
    fz = pi^2 / 8 + Z^2 / 2;
    b = sqrt(1.0 / x) * (x * Z - 1);
    a = -1.0 * sqrt(1.0 / x) * (x * Z + 1);

    x0 = log(fz) + fz * TRUNC;
    xb = x0 - Z + stats::pnorm(b, log.p=TRUE);
    xa = x0 + Z + stats::pnorm(a, log.p=TRUE);

    qdivp = 4 / pi * ( exp(xb) + exp(xa) );

    1.0 / (1.0 + qdivp);
  }

  rtigauss <- function(Z, R=TRUNC)
  {
    Z = abs(Z);
    mu = 1/Z;
    X = R + 1;
    if (mu > R) {
      alpha = 0.0;
      while (stats::runif(1) > alpha) {
        ## X = R + 1
        ## while (X > R) {
        ##     X = 1.0 / rgamma(1, 0.5, rate=0.5);
        ## }
        E = stats::rexp(2)
        while ( E[1]^2 > 2 * E[2] / R) {
          E = stats::rexp(2)
        }
        X = R / (1 + R*E[1])^2
        alpha = exp(-0.5 * Z^2 * X);
      }
    }
    else {
      while (X > R) {
        lambda = 1.0;
        Y = stats::rnorm(1)^2;
        X = mu + 0.5 * mu^2 / lambda * Y -
          0.5 * mu / lambda * sqrt(4 * mu * lambda * Y + (mu * Y)^2);
        if ( stats::runif(1) > mu / (mu + X) ) {
          X = mu^2 / X;
        }
      }
    }
    X;
  }

  a.coef <- function(n,x)
  {
    if ( x>TRUNC )
      pi * (n+0.5) * exp( -(n+0.5)^2*pi^2*x/2 )
    else
      (2/pi/x)^1.5 * pi * (n+0.5) * exp( -2*(n+0.5)^2/x )
  }

  rpg.devroye.1 <- function(Z)
  {
    Z = abs(Z) * 0.5;

    ## PG(1,z) = 1/4 J*(1,Z/2)
    fz = pi^2 / 8 + Z^2 / 2;
    ## p = (0.5 * pi) * exp( -1.0 * fz * TRUNC) / fz;
    ## q = 2 * exp(-1.0 * Z) * pigauss(TRUNC, 1.0/Z, 1.0);

    num.trials = 0;
    total.iter = 0;

    while (TRUE)
    {
      num.trials = num.trials + 1;

      if ( stats::runif(1) < mass.texpon(Z) ) {
        ## Truncated Exponential
        X = TRUNC + stats::rexp(1) / fz
      }
      else {
        ## Truncated Inverse Normal
        X = rtigauss(Z)
      }

      ## C = cosh(Z) * exp( -0.5 * Z^2 * X )

      ## Don't need to multiply everything by C, since it cancels in inequality.
      S = a.coef(0,X)
      Y = stats::runif(1)*S
      n = 0

      while (TRUE)
      {
        n = n + 1
        total.iter = total.iter + 1;
        if ( n %% 2 == 1 )
        {
          S = S - a.coef(n,X)
          if ( Y<=S ) break
        }
        else
        {
          S = S + a.coef(n,X)
          if ( Y>S ) break
        }
      }

      if ( Y<=S ) break
    }

    ## 0.25 * X
    list("x"=0.25 * X, "n"=num.trials, "total.iter"=total.iter)
  }

  ## N - number of trials
  ## J - number of categories
  ## P - number of covariates
  ## y - N x J-1 matrix.  y_{ij} = fraction of outcomes in jth category on trial i.
  ## X - N x P design matrix
  ## n - N x 1 matrix of rolls
  ## Assume beta_J = 0 for identification.
  m.0=array(0, dim=c(ncol(X), ncol(y)))
  P.0=array(0, dim=c(ncol(X), ncol(X), ncol(y)))
  
  N = nrow(X);
  P = ncol(X);
  J = ncol(y) + 1;

  out = list(
    beta = array(0, dim=c(samp, P, J-1))
  )

  beta = matrix(0, P, J);
  w    = matrix(0, N, J);

  ## Precompute. (Initialize). These codes are all copied from the deprecated bayeslogit package.
  n=rep(1,nrow(as.matrix(y)))
  kappa = (y - 0.5)*n;

  b.0 = matrix(0, P, J-1);
  for (j in 1:(J-1)) b.0[,j] = P.0[,,j] %*% m.0[,j];

  ## A = rowSums( exp(X %*% beta[,-1]) );
  for (i in 1:(samp+burn)) {
    for (j in 1:(J-1)) {

      ## For now recompute at each iteration.  Try taking out later.
      A = rowSums( exp(X %*% beta[,-j]) );

      c.j   = log(A);
      eta.j = X %*% beta[,j] - c.j;

      ## omega.j
      for (q in 1:N){
        #print(eta.j[q])
        w[q,j] = rpg.devroye.1(eta.j[q])$x;
      }
        
        

      ## beta.j
      PL.j = t(X) %*% (X * w[,j]);
      bL.j = t(X) %*% (kappa[,j] + c.j * w[,j]);
      P1.j = PL.j + P.0[,,j];
      V1.j = chol2inv(chol(P1.j));
      m1.j = V1.j %*% (bL.j + b.0[,j]);

      sqrtV1.j = t(chol(V1.j));
      beta[,j] = m1.j + sqrtV1.j %*% stats::rnorm(P);

      ## A += exp(X %*% beta[,j]) - exp(X %*% beta[,(j+1) %% (J-1)])
      ## Store
      if (i > burn) {
        out$beta[i-burn,,j] = beta[,j];
      }
    }
    if (i %% verbose == 0) {
      cat("Finished", i,"/",burn+samp,"\n")
      
    }
  }

  ##Process the result
  dim(out$beta) <- c(samp,P*(J-1))
  out <- as.matrix(out$beta)
  indx <- seq(thin, samp, thin)
  out2 <- out[indx, ]
  if(!is.na(file_path)){
    utils::write.csv(out, file=paste(file_path,"/result.csv",sep=''), row.names=FALSE)
    utils::write.csv(out2, file=paste(file_path,"/resultwstep.csv",sep=''), row.names=FALSE)
  }
  if(trace.plot == TRUE & !is.na(file_path)){
    output.path <- paste(file_path,"/TracePlots",sep ='')
    dir.create(output.path)
    for(i in 1:dim(out)[2]){
      grDevices::png(file=paste(output.path,"/TracePlot",
                                "_transition=", y.1[ceiling(i/P)],
                                "_covariate=", col_names[(i-1) %% P +1],
                                ".png",
                                sep=''))
      p <- plot(x=1:dim(out)[1],y=out[,i],type="l",main="Trace plot")
      grDevices::dev.off()
    }
  }
  
  result <- list(out = data.frame(out), outwstepwidth = data.frame(out2))
  
  return(result)
}

Try the bayesmlogit package in your browser

Any scripts or data that you put into this service are public.

bayesmlogit documentation built on Nov. 2, 2023, 6:05 p.m.