UsingSAMBA.R
In SAMBA: Selection and Misclassification Bias Adjustment for Logistic Regression Models

## ---- include = FALSE---------------------------------------------------------
# Chunk defaults for the whole vignette: collapse = TRUE, output prefix "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ---- echo=FALSE, fig.cap="Figure 1: Model Structure", out.width = '80%'------
# Embed the image at images/ModelDiagram.png; the chunk header above captions
# it "Figure 1: Model Structure" and renders it at 80% width.
knitr::include_graphics("images/ModelDiagram.png")

## ---- echo = TRUE, eval = TRUE,  fig.width = 7, fig.height = 4----------------

library(SAMBA)
library(MASS)
# Inverse-logit (logistic) function: maps real x to a probability in [0, 1].
# Delegates to stats::plogis(), which stays finite for large |x|. The naive
# exp(x) / (1 + exp(x)) form returns NaN (Inf / Inf) once exp(x) overflows,
# i.e. for x above roughly 709. For moderate x the two forms agree up to
# floating-point rounding.
expit <- function(x) plogis(x)
# Log-odds transform: logit(p) = log(p / (1 - p)), the inverse of expit().
logit <- function(x) {
  odds <- x / (1 - x)
  log(odds)
}

nobs <- 5000

### Generate Predictors and Follow-up Information
# Covariance of (Z, X, W): unit variances, Cov(Z, W) = 0.4, and X uncorrelated
# with both Z and W.
covar_mat <- rbind(c(1,   0, 0.4),
                   c(0,   1,   0),
                   c(0.4, 0,   1))
set.seed(1234)
cov <- mvrnorm(n = nobs, mu = rep(0, 3), Sigma = covar_mat)

data <- data.frame(Z = cov[, 1], X = cov[, 2], W = cov[, 3])

# Generate random uniforms: three vectors of length nobs, each drawn right
# after resetting the RNG to its own seed. They are compared against model
# probabilities further down to produce binary indicators.
seeded_runif <- function(seed) {
  set.seed(seed)
  runif(nobs)
}
U1 <- seeded_runif(5678)
U2 <- seeded_runif(4321)
U3 <- seeded_runif(8765)

# Generate Disease Status
# D = 1 with probability expit(-2 + 0.5 * Z), realised by comparing that
# probability with the uniform draw U1.
DISEASE <- expit(-2 + 0.5 * data$Z)
data$D <- as.numeric(DISEASE > U1)

# Relate W and D: shift W up by 1 for subjects with D = 1
data$W <- data$W + 1 * data$D

# Generate Misclassification
# Among true cases the observed flag D* is 1 with probability
# expit(-0.4 + X); for non-cases that probability is set to 0, so D* can only
# be 1 when D is 1.
SENS <- expit(-0.4 + 1 * data$X)
SENS[data$D == 0] <- 0
data$Dstar <- as.numeric(SENS > U2)

# Generate Sampling Status
# Each subject is selected with probability expit(-0.6 + 1 * D + 0.5 * W),
# realised by comparing that probability with the uniform draw U3.
SELECT <- expit(-0.6 + 1 * data$D + 0.5 * data$W)
# The comparison is already a logical vector. Wrapping it in
# ifelse(..., T, F) added nothing and depended on T / F, which are ordinary
# variables that can be reassigned.
S <- SELECT > U3

# Observed Data: only the selected rows
data.samp <- data[S, ]

# True marginal sampling ratio
# Ratio of the mean selection probability among subjects with D = 1 to the
# mean selection probability among subjects with D = 0, using the same
# selection model that generated the sample.
prob1 <- expit(-0.6 + 1 * 1 + 0.5 * data$W)
prob0 <- expit(-0.6 + 1 * 0 + 0.5 * data$W)
is_case <- data$D == 1
is_noncase <- data$D == 0
r.marg.true <- mean(prob1[is_case]) / mean(prob0[is_noncase])

# True inverse probability of sampling weights
# Inverse selection probabilities for the sampled rows, rescaled so that the
# weights sum to the number of sampled rows.
prob.WD <- expit(-0.6 + 1 * data.samp$D + 0.5 * data.samp$W)
inv_prob <- 1 / prob.WD
weights <- nrow(data.samp) * inv_prob / sum(inv_prob)

# True associations with D in population
# Logistic regressions of the true status D on X and on Z, fitted to the full
# simulated data set.
trueX <- glm(D ~ X, family = binomial(), data = data)
trueZ <- glm(D ~ Z, family = binomial(), data = data)

# Initial Parameter Values
# Logistic regressions of the observed status D* on X and on Z, fitted to the
# selected sample only.
fitBeta  <- glm(Dstar ~ X, family = binomial(), data = data.samp)
fitTheta <- glm(Dstar ~ Z, family = binomial(), data = data.samp)

## ---- results='hide', message=F-----------------------------------------------
# Prevalence inputs shared by the calls below: the marginal P(D=1) from the
# full simulated data, and P(D=1|X) predicted by trueX for the sampled rows.
prev_marg <- mean(data$D)
prev <- predict(trueX, newdata = data.samp, type = "response")

# Using marginal sampling ratio r and P(D=1)
sens1 <- sensitivity(data.samp$Dstar, data.samp$X, prev = prev_marg,
                     r = r.marg.true)

# Using inverse probability of selection weights and P(D=1)
sens2 <- sensitivity(data.samp$Dstar, data.samp$X, prev = prev_marg,
                     weights = weights)

# Using marginal sampling ratio r and P(D=1|X)
sens3 <- sensitivity(data.samp$Dstar, data.samp$X, prev = prev,
                     r = r.marg.true)

# Using inverse probability of selection weights and P(D=1|X)
sens4 <- sensitivity(data.samp$Dstar, data.samp$X, prev = prev,
                     weights = weights)

## ---- results='hide'----------------------------------------------------------
# Approximation of D*|Z, using the marginal sensitivity estimate from sens1
approx1 <- approxdist(data.samp$Dstar, data.samp$Z, sens1$c_marg,
                      weights = weights)

# Non-logistic link function method, using the per-subject sensitivities
# from sens3
nonlog1 <- nonlogistic(data.samp$Dstar, data.samp$Z, c_X = sens3$c_X,
                       weights = weights)

# Logit of the estimated marginal sensitivity: used both as a starting value
# and as the fixed intercept in the "fixed intercept" fits below.
beta0_marg <- logit(sens1$c_marg)

# Starting values: coefficients of the naive D* ~ Z fit, then the intercept
# above, then the X slope of the naive D* ~ X fit.
start <- c(coef(fitTheta), beta0_marg, coef(fitBeta)[2])

# Keep a fit's `param` element together with diag() of its `variance` element.
extract_fit <- function(fit) {
  list(param = fit$param, variance = diag(fit$variance))
}

# Direct observed data likelihood maximization without fixed intercept
fit1 <- obsloglik(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                  weights = weights)
obsloglik1 <- extract_fit(fit1)

# Direct observed data likelihood maximization with fixed intercept
fit2 <- obsloglik(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                  beta0_fixed = beta0_marg, weights = weights)
obsloglik2 <- extract_fit(fit2)

# Expectation-maximization algorithm without fixed intercept
fit3 <- obsloglikEM(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                    weights = weights)
obsloglik3 <- extract_fit(fit3)

# Expectation-maximization algorithm with fixed intercept
fit4 <- obsloglikEM(data.samp$Dstar, data.samp$Z, data.samp$X, start = start,
                    beta0_fixed = beta0_marg, weights = weights)
obsloglik4 <- extract_fit(fit4)

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5----------------
# Figure 2: sorted per-subject sensitivity curves from each method, plus the
# curve from the model that generated the data.

# Sorted sensitivities expit(b0 + b1 * X) over the sampled subjects.
sens_curve <- function(b0, b1) sort(expit(b0 + b1 * data.samp$X))

plot(sort(sens3$c_X), type = "l", col = "red", lwd = 2,
     xlab = "Patients", ylab = "Sensitivity",
     main = "Figure 2: Sensitivity Estimates")
# Elements 3 and 4 of `param` are used as sensitivity intercept and X slope
lines(sens_curve(obsloglik1$param[3], obsloglik1$param[4]),
      col = "blue", lwd = 2)
lines(sens_curve(obsloglik2$param[3], obsloglik2$param[4]),
      col = "green", lwd = 2)
abline(h = sens1$c_marg, col = "purple", lwd = 2)
# Truth: the coefficients (-0.4, 1) used when simulating misclassification
lines(sens_curve(-0.4, 1), col = "black", lwd = 2)
legend(x = "topleft", cex = 0.7,
       fill = c("purple", "red", "blue", "green", "black"),
       legend = c("Estimated marginal sensitivity",
                  "Using non-logistic link method",
                  "Using obs. data log-lik",
                  "Using obs. data log-lik (fixed intercept)",
                  "Truth"))

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5,  results='hide', message=F----
# Figure 3: sensitivity curves re-estimated under several assumed marginal
# sampling ratios, drawn over the curve from the data-generating model.
rvals <- c(1, 1.5, 2, 2.5, 5, 10)
COL <- c("red", "orange", "yellow", "green", "blue", "purple")
true_prevs <- predict(trueX, newdata = data.samp, type = "response")
plot(sort(expit(-0.4 + 1 * data.samp$X)),
     xlab = "Patients", ylab = "Sensitivity",
     main = "Figure 3: Estimated sensitivity across \n marginal sampling ratios",
     type = "l", col = "black", lwd = 2, ylim = c(0, 1))

# seq_along() instead of 1:length(): if rvals were ever empty the loop is
# skipped rather than iterating over c(1, 0).
for (i in seq_along(rvals)) {
  TEMP <- sensitivity(X = data.samp$X, Dstar = data.samp$Dstar,
                      r = rvals[i], prev = true_prevs)
  lines(sort(TEMP$c_X), col = COL[i])
}
legend(x = "topleft", legend = c(rvals, "Truth"), title = "Sampling Ratio",
       fill = c(COL, "black"), cex = 0.8)

## ---- echo = FALSE, eval = TRUE,  fig.width = 5, fig.height= 5----------------
# Figure 4: log-likelihood values from the direct-maximization fits plotted
# against beta_0.
plot(fit1$beta0_fixed, fit1$loglik.seq, pch = 16,
     xlab = "Beta_0", ylab = "Log-likelihood",
     main = "Figure 4: Profile Log-Likelihood Values \n for Direct Maximization")
# Vertical line at -0.4, the intercept used when simulating misclassification
abline(v = -0.4, col = "blue", lwd = 2)
# Fixed-intercept fit, placed at the logit of the marginal sensitivity
points(logit(sens1$c_marg), fit2$loglik.seq, pch = 17, col = "red")
legend(x = "topright", legend = c("No fixed beta_0", "Fixed beta_0"),
       col = c("black", "red"), pch = c(16, 17))
# Rotated label drawn at x = -0.1, to the right of the vertical line
text(x = -0.1, y = mean(fit1$loglik.seq), labels = "True beta_0", srt = 90)

# Figure 5: log-likelihood across EM iterations for the two EM fits; the
# first element of each sequence is dropped before plotting.
plot(fit3$loglik.seq[-1], pch = 16,
     xlab = "EM algorithm iteration", ylab = "Log-likelihood",
     main = "Figure 5: Log-Likelihood Values \n Across EM Iterations")
points(fit4$loglik.seq[-1], pch = 17, col = "red")
legend(x = "bottomright", legend = c("No fixed beta_0", "Fixed beta_0"),
       pch = c(16, 17), col = c("black", "red"))

## ---- echo = FALSE, eval = TRUE,  fig.width = 7, fig.height= 4, message=FALSE----
library(ggplot2)
library(scales)

## ---- echo = FALSE, eval = TRUE,  fig.width = 7, fig.height= 4----------------
# Figure 6: estimated log-odds ratio (second coefficient of each fit) with
# +/- 1.96 * SE intervals for each method; the horizontal line marks the
# estimate from the fit on the true disease status.
METHODS <- c("True", "Uncorrected", "Approx D*|Z + IPW",
             "Non-logistic Link + IPW", "Obs. log-lik + IPW",
             "Fixed intercept obs. log-lik + IPW")
PARAM <- c(coef(trueZ)[2], coef(fitTheta)[2], approx1$param,
           nonlog1$param[2], obsloglik1$param[2], obsloglik2$param[2])
VARIANCE <- c(diag(summary(trueZ)$cov.scaled)[2],
              diag(summary(fitTheta)$cov.scaled)[2],
              approx1$variance, nonlog1$variance[2],
              obsloglik1$variance[2], obsloglik2$variance[2])
a <- ggplot(data = data.frame(METHODS = METHODS, PARAM = PARAM,
                              VARIANCE = VARIANCE),
            aes(xmin = METHODS, xmax = METHODS,
                ymin = PARAM - 1.96 * sqrt(VARIANCE),
                ymax = PARAM + 1.96 * sqrt(VARIANCE),
                col = METHODS, x = METHODS, y = PARAM)) +
  geom_point(position = position_dodge(.7), size = 2) +
  geom_linerange(position = position_dodge(.7), size = 1.2) +
  xlab("") + ylab("logOR") +
  ggtitle("Figure 6: Estimated Log-Odds Ratio Across Methods") +
  # Fix the x-axis order to the order of METHODS
  scale_x_discrete(limits = METHODS) +
  geom_hline(yintercept = PARAM[1], linetype = 1, color = "black") +
  # The plot maps `col`, not `fill`, so the two-row legend layout has to be
  # attached to the colour guide; a `fill` guide matches no mapped aesthetic.
  guides(colour = guide_legend(nrow = 2, byrow = TRUE)) +
  theme(legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.text = element_text(size = 8),
        legend.title = element_blank(),
        axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1),
        text = element_text(size = 12))
print(a)

Any scripts or data that you put into this service are public.

SAMBA documentation built on Feb. 20, 2020, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

SAMBA
Selection and Misclassification Bias Adjustment for Logistic Regression Models

inst/doc/UsingSAMBA.R
In SAMBA: Selection and Misclassification Bias Adjustment for Logistic Regression Models

Try the SAMBA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SAMBA Selection and Misclassification Bias Adjustment for Logistic Regression Models

inst/doc/UsingSAMBA.R In SAMBA: Selection and Misclassification Bias Adjustment for Logistic Regression Models

Try the SAMBA package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SAMBA
Selection and Misclassification Bias Adjustment for Logistic Regression Models

inst/doc/UsingSAMBA.R
In SAMBA: Selection and Misclassification Bias Adjustment for Logistic Regression Models