knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
TO DO: Resolve the stratum-specific relative risk rates. I think it has to do with collapsibility. Find some way to explain this prior to reviewing with the group.
Now we extend the OR-to-RR conversion to include covariates. We can do this with categorical or continuous covariates. There is an R package with a function (epitools::probratio()) that will do this for you, but I want to do it by hand so we all understand what is actually happening.
Also note that I recently had to compute the adjusted RR for an ordinal regression model. I couldn't use an R library; I had to do it by hand. Once you understand the code below, you'll be able to extend it to the ordinal context.
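As a preview, the core identity we will rely on below is the standard OR-to-RR conversion at a given baseline risk (the formula behind Stata's logittorisk, commonly attributed to Zhang & Yu, 1998). Here is a minimal sketch; the helper name is ours, not from any package:

# Hypothetical helper (ours): convert an OR to an approximate RR
# given the baseline risk p0, via RR = OR/((1 - p0) + p0*OR).
or_to_rr <- function(or, p0) {
  or / ((1 - p0) + p0 * or)
}
or_to_rr(or = 2, p0 = 0.10)  # e.g., OR = 2 at a 10% baseline risk gives RR of about 1.82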
library(SPR)
library(MASS)

N <- 1e5  # this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(9122021)

dat <- data.frame(
  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')),
                  length.out = N*number.timepoints),
  'Group' = rep(paste0('Group_', 1:number.groups),
                length.out = N*number.timepoints),
  stringsAsFactors = FALSE)

# Generate a covariate
# Option 1: Continuous, unbalanced:
# dat$Covariate <- rlnorm(n = N, mean = dat$Group == 'Group_2', sd = 0.25)
# range(dat$Covariate)
# aggregate(Covariate ~ Group, FUN = function(x) round(mean(x, na.rm = TRUE), 2),
#           data = dat, na.action = na.pass)
# Covariate is unbalanced across groups, could serve as a confound

# Option 2: Categorical, balanced:
# dat$Z <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(0.5, 0.5))

# Option 3: Categorical, unbalanced:
dat$Z <- rbinom(n = N, size = 1, prob = 0.4 + 0.4*(dat$Group == 'Group_1'))
table(dat$Z)
xtabs(~ Group + Z, data = dat)

# Create Beta parameters for the design matrix:
X <- model.matrix(~ Group + Z, data = dat)

# Create Beta
Beta <- matrix(0, nrow = ncol(X), dimnames = list(colnames(X), 'param'))
Beta[] <- c(-0.25, -1, -0.5)

# Matrix multiply:
XB <- X %*% Beta
p <- exp(XB)  # prevalence - log link
round(range(p), 2)
# p <- exp(XB)/(1 + exp(XB))  # this would be the logistic link

# Dichotomize
dat$Y_binom <- as.vector(1*(p > runif(n = N)))

# Plot
aggregate(Y_binom ~ Group, FUN = function(x) round(mean(x, na.rm = TRUE), 2),
          data = dat, na.action = na.pass)
barplot(100*table(dat$Y_binom)/sum(table(dat$Y_binom)),
        ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Binomial')

library(ggplot2)
ggplot(data = dat, aes(x = as.factor(Y_binom), fill = Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           color = "#e9ecef", alpha = 0.6, position = "dodge", stat = "count") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(name = 'Groups', values = c("blue2", "red2")) +
  theme_minimal() +
  theme(legend.position = 'bottom') +
  labs(x = 'Score', y = 'Percentage',
       title = 'Scores with a Binomial Distribution',
       caption = 'Note: Simulated data')

# Observed data:
xtabs(~ Y_binom + Group, data = dat)
ptab <- prop.table(xtabs(~ Y_binom + Group, data = dat), 2)
The log-binomial model (binomial family with a log link) models the relative risk directly: its exponentiated coefficients are risk ratios rather than odds ratios.
# Compare to model output:
# Unadjusted log-binomial model:
mod <- glm(Y_binom ~ Group, data = dat, family = binomial(link = 'log'))

# Adjusted relative risk - log-binomial model with covariate (confound);
# starting values (the generating Beta) help the log link converge:
mod2 <- glm(Y_binom ~ Group + Z, start = as.vector(Beta),
            data = dat, family = binomial(link = 'log'))
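The fitted coefficients are on the log-RR scale, so exponentiating gives the risk ratios. A quick way to inspect them (standard glm extraction; confint.default() gives Wald intervals):

exp(coef(mod))              # unadjusted RR for Group_2 vs Group_1
exp(coef(mod2))             # adjusted RR
exp(confint.default(mod2))  # Wald confidence intervals on the RR scale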
#--------------------------------
# Logistic Regression:
mod.lr <- glm(Y_binom ~ Group + Z, data = dat, family = binomial(link = 'logit'))

# Now that we have the odds ratio from the model, we can compute
# the relative risk using the baseline prevalence.
#
# Several ways of getting the baseline prevalence:
#
# 1. Observed rate in the control group:
p0_v1 <- ptab['1', 'Group_1']

# 2. Intercept from the logistic regression model
#    (pulled from the Stata function: http://fmwww.bc.edu/repec/bocode/l/logittorisk.ado):
odds0 <- exp(coef(mod.lr)['(Intercept)'])
p0_v2 <- odds0/(1 + odds0)  # but this assumes the covariates are at reference values

# 3. Prediction aligned with the intercept:
p0_v3 <- predict(mod.lr, type = 'response',
                 newdata = data.frame('Group' = 'Group_1', 'Z' = 0))
p0_v2
p0_v3

# So if you use this as your base rate, you are assuming that the other covariate
# is set at the reference level (Z = 0). This may or may not be what you want;
# it all depends on the intended interpretation.
OR <- exp(coef(mod.lr)['GroupGroup_2'])
P0 <- p0_v2
RR <- OR/((1 - P0) + (P0 * OR))
RR

# The RR computed above assumes Z = 0; to confirm, we compute it directly:
p1 <- predict(mod.lr, type = 'response',
              newdata = data.frame('Group' = 'Group_2', 'Z' = 0))
p0 <- predict(mod.lr, type = 'response',
              newdata = data.frame('Group' = 'Group_1', 'Z' = 0))
p1/p0
# Same value. This is the correct RR if you want to assume the base rate is when Z = 0.

# Note that it does NOT line up with the generating parameter:
exp(Beta['GroupGroup_2', ])
# Why? Because the generating parameter doesn't assume the base rate is when Z = 0.
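Where does that conversion come from? It is one line of algebra. The baseline odds are P0/(1 - P0), and the model says the Group_2 odds are OR * P0/(1 - P0). Converting the odds back to a probability gives

P1 = OR * P0 / ((1 - P0) + OR * P0),

and dividing by P0 yields the formula used above:

RR = P1/P0 = OR / ((1 - P0) + P0 * OR).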
Can we recover the generating parameter? We want the probability of Y given Group membership, averaged over the covariate. That means computing the RR while marginalizing over the covariates, so this is the RR specific to the sample characteristics: you've selected a base rate standardized to the observed covariate distribution rather than to Z = 0.
p1i <- predict(mod.lr, type = 'response',
               newdata = data.frame('Group' = 'Group_2', 'Z' = dat$Z))
p0i <- predict(mod.lr, type = 'response',
               newdata = data.frame('Group' = 'Group_1', 'Z' = dat$Z))
mean(p1i)/mean(p0i)

# This aligns with the generating value:
exp(Beta['GroupGroup_2', ])
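A detail worth flagging (our note, not in the original code): the standardized RR is the ratio of the averaged risks, not the average of the subject-level risk ratios. By Jensen's inequality (see the quote below), the two generally differ:

mean(p1i)/mean(p0i)  # ratio of averages: the standardized (marginal) RR
mean(p1i/p0i)        # average of subject-specific ratios -- generally different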
The divergence between conditional and standardized (marginalized) measures is discussed in this paper on collapsibility: https://escholarship.org/content/qt3ng2r0sm/qt3ng2r0sm.pdf
"Standardized measures are constructed by taking averages over C before comparisons (e.g., ratios or differences) across X...Recalling Jensen’s inequality (an average of a nonlinear function does not equal the function applied to the averages), it should not be surprising to find divergences between collapsibility conditions depending on the step at which averaging is done (Samuels, 1981, sec. 3)."
The epitools package automates this marginalization:
library(epitools)
# The function is probratio(); see its documentation --
# it marginalizes over the observed covariates.
pr <- epitools::probratio(mod.lr, method = 'delta', scale = 'linear')
pr
Let's put it all together and compare all of the estimates:
# Generating parameter (true value):
exp(Beta['GroupGroup_2', ])

# Observed RR (just using observed proportions):
ptab
# Group 1 & Group 2
ptab['1', 'Group_2']/ptab['1', 'Group_1']

# RR from the unadjusted log-binomial model:
exp(coef(mod)['GroupGroup_2'])

# RR from the adjusted log-binomial model:
exp(coef(mod2)['GroupGroup_2'])

# RR from the adjusted logistic regression, marginalized over covariates:
mean(p1i)/mean(p0i)

# Same, but letting `epitools::probratio()` do all the work:
pr

# Logistic regression (assuming the base rate is when Z = 0):
RR
p1/p0
# Remember, this isn't wrong *if the research question is: tell me the RR
# when covariate Z = 0!*
Now let's extend this to an ordinal outcome. I'm just going to dump code here; note that the unadjusted ordinal model lines up very nicely with the observed category-specific ratios.
#-------------
rm(list = ls())
gc()

N <- 1e5  # this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(9132021)

dat <- data.frame(
  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')),
                  length.out = N*number.timepoints),
  'Group' = rep(paste0('Group_', 1:number.groups),
                length.out = N*number.timepoints),
  stringsAsFactors = FALSE)

# Create Beta parameters for the design matrix:
X <- model.matrix(~ Group, data = dat)

# Create Beta
Beta <- matrix(0, nrow = ncol(X), dimnames = list(colnames(X), 'param'))
Beta[] <- c(0, -0.25)

# Matrix multiply:
XB <- X %*% Beta

# Thresholds:
thr <- c(-2.5, -1.5, -0.5)
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p <- exp(eta)/(1 + exp(eta))
# p <- exp(eta)  # I don't know how to do it this way
range(p)
dat$p <- p
dat$Y <- apply(runif(n = N) > p, 1, sum)

# Quick barplot, base R plot functions
aggregate(Y ~ Group, FUN = function(x) round(mean(x, na.rm = TRUE), 2),
          data = dat, na.action = na.pass)
xtabs(~ Y + Group, data = dat, na.action = na.pass)
barplot(100*table(dat$Y)/sum(table(dat$Y)),
        ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Ordinal')

library(ggplot2)
ggplot(data = dat, aes(x = as.factor(Y), fill = Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           color = "#e9ecef", alpha = 0.6, position = "dodge", stat = "count") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(name = 'Groups', values = c("blue2", "red2")) +
  theme_minimal() +
  theme(legend.position = 'bottom') +
  labs(x = 'Score', y = 'Percentage',
       title = 'Scores with an Ordinal Distribution',
       caption = 'Note: Simulated data')

#----
# Key here is that this data is ORDERED
# Observed data:
xtabs(~ Y + Group, data = dat)
ptab <- prop.table(xtabs(~ Y + Group, data = dat), 2)
# Percentage achieving each category across Groups:
ptab

# Fit an ordinal regression and use the predicted probabilities
# to compute the relative risk:
library(ordinal)
mod.clm <- clm(as.factor(Y) ~ Group, data = dat)
summary(mod.clm)
beta <- summary(mod.clm)$beta
alpha <- summary(mod.clm)$alpha

# Group 2:
XB <- matrix(summary(mod.clm)$beta)
thr <- summary(mod.clm)$alpha
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p1 <- as.numeric(exp(eta)/(1 + exp(eta)))

# Group 1:
XB <- matrix(0)  # obviously this is more complicated with more predictors!
thr <- summary(mod.clm)$alpha
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p0 <- exp(eta)/(1 + exp(eta))

# Convert cumulative probabilities to category probabilities:
p0 <- as.numeric(c(0, p0, 1))
p1 <- as.numeric(c(0, p1, 1))
p0 <- diff(p0)
p1 <- diff(p1)
cbind(p0, p1)

ptab
ptab[, 'Group_1']/ptab[, 'Group_2']
p0/p1
#
# > ptab[, 'Group_1']/ptab[, 'Group_2']
#         0         1         2         3
# 0.7987500 0.8373752 0.9105826 1.1056235
# > p0/p1
# [1] 0.7987899 0.8394065 0.9088316 1.1058414
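Rather than rebuilding the cumulative probabilities by hand, you can also ask the model for the category probabilities directly. This is a cross-check of ours; it assumes ordinal's predict.clm(type = 'prob') behaves as documented when the response is omitted from newdata:

p1.chk <- predict(mod.clm, newdata = data.frame('Group' = 'Group_2'), type = 'prob')$fit
p0.chk <- predict(mod.clm, newdata = data.frame('Group' = 'Group_1'), type = 'prob')$fit
p0.chk/p1.chk  # should match the hand-computed p0/p1 above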
#-------------
rm(list = ls())
gc()

N <- 1e5  # this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(9152021)

dat <- data.frame(
  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')),
                  length.out = N*number.timepoints),
  'Group' = rep(paste0('Group_', 1:number.groups),
                length.out = N*number.timepoints),
  stringsAsFactors = FALSE)

# Unbalanced covariate:
dat$Z <- rbinom(n = N, size = 1, prob = 0.4 + 0.4*(dat$Group == 'Group_1'))
table(dat$Z)
xtabs(~ Group + Z, data = dat)

# Create Beta parameters for the design matrix:
X <- model.matrix(~ Group + Z, data = dat)

# Create Beta
Beta <- matrix(0, nrow = ncol(X), dimnames = list(colnames(X), 'param'))
Beta[] <- c(0, -0.25, -0.25)

# Matrix multiply:
XB <- X %*% Beta

# Thresholds:
thr <- c(-2, -1, 0)
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p <- exp(eta)/(1 + exp(eta))
# p <- exp(eta)
dat$p <- p
dat$Y <- apply(runif(n = N) > p, 1, sum)

# Quick barplot, base R plot functions
aggregate(Y ~ Group, FUN = function(x) round(mean(x, na.rm = TRUE), 2),
          data = dat, na.action = na.pass)
xtabs(~ Y + Group, data = dat, na.action = na.pass)
barplot(100*table(dat$Y)/sum(table(dat$Y)),
        ylim = c(0, 100), ylab = 'Percentage', col = 'grey', main = 'Ordinal')

library(ggplot2)
ggplot(data = dat, aes(x = as.factor(Y), fill = Group)) +
  geom_bar(aes(y = (..count..)/sum(..count..)),
           color = "#e9ecef", alpha = 0.6, position = "dodge", stat = "count") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(name = 'Groups', values = c("blue2", "red2")) +
  theme_minimal() +
  theme(legend.position = 'bottom') +
  labs(x = 'Score', y = 'Percentage',
       title = 'Scores with an Ordinal Distribution',
       caption = 'Note: Simulated data')

#----
# Key here is that this data is ORDERED
# Observed data:
xtabs(~ Y + Group, data = dat)
ptab <- prop.table(xtabs(~ Y + Group, data = dat), 2)
# Percentage achieving each category across Groups:
ptab

# Fit an ordinal regression with the covariate and use the predicted
# probabilities to compute the relative risk:
library(ordinal)
mod.clm <- clm(as.factor(Y) ~ Group + Z, data = dat)
summary(mod.clm)
beta <- summary(mod.clm)$beta
alpha <- summary(mod.clm)$alpha

# Group 2 (Group = 1), at each subject's observed Z:
X1 <- cbind('Group' = rep(1, N), 'Z' = dat$Z)
BB <- matrix(summary(mod.clm)$beta)
XB <- X1 %*% BB
# XB <- matrix(summary(mod.clm)$beta)
thr <- summary(mod.clm)$alpha
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p1 <- exp(eta)/(1 + exp(eta))

# Group 1 (Group = 0), at each subject's observed Z:
X0 <- cbind('Group' = rep(0, N), 'Z' = dat$Z)
BB <- matrix(summary(mod.clm)$beta)
XB <- X0 %*% BB
# XB <- matrix(0)  # obviously this is more complicated with more predictors!
thr <- summary(mod.clm)$alpha
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = TRUE) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = FALSE)
p0 <- exp(eta)/(1 + exp(eta))

# Convert cumulative probabilities to category probabilities, then
# average (standardize) over the observed covariate distribution:
p0 <- cbind(0, p0, 1)
p1 <- cbind(0, p1, 1)
p0 <- t(apply(p0, 1, diff))
p1 <- t(apply(p1, 1, diff))
p0 <- colMeans(p0)
p1 <- colMeans(p1)

ptab
cbind(p0, p1)
ptab[, 'Group_1']/ptab[, 'Group_2']
p0/p1
#------------------------------------
# > ptab[, 'Group_1']/ptab[, 'Group_2']
#         0         1         2
# 0.8732260 0.8552308 1.1510231
# > p0/p1
# [1] 0.6966564 0.8638131 1.2358708
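This divergence between the crude ratios and the covariate-standardized ratios is what the TO DO at the top is about: Z is unbalanced across groups, so the crude ratios are confounded, and (per the collapsibility quote above) averaging before versus after taking ratios need not agree. A quick way of ours to see the confounding is to look at the observed ratios within each stratum of Z:

# Observed category proportions within strata of Z (our addition):
ptab.z0 <- prop.table(xtabs(~ Y + Group, data = dat, subset = Z == 0), 2)
ptab.z1 <- prop.table(xtabs(~ Y + Group, data = dat, subset = Z == 1), 2)
ptab.z0[, 'Group_1']/ptab.z0[, 'Group_2']
ptab.z1[, 'Group_1']/ptab.z1[, 'Group_2']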