In CJangelo/SPR: Tools to Learn Statistical Programming in R

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Generate Data, Visualize Data, Fit Models

N = 5000 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
set.seed(6152021)

dat <- data.frame(
                  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
                  'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
                  stringsAsFactors=F)


# Create Beta parameters for these design matrix:
X <- model.matrix( ~ Group  , data = dat)


# Create Beta
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.0, 2)

# Matrix multiply:
XB <- X %*% Beta

# Thresholds:
thr <- c(0, 1, 2, 3)
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = T) - 
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = F)

p <- exp(eta)/(1 + exp(eta))
dat$p <- p
dat$Y <- apply(runif(n = N)  > p, 1, sum)


# Quick barplot, base R plot functions
barplot(100*table(dat$Y)/sum(table(dat$Y)),
        ylim = c(0, 100), ylab = 'Percentage',
        col = 'grey', main = 'Ordinal')

#----
# Key here is that this data is ORDERED


# Fit Models:
library(MASS)
mod <- MASS:::polr(as.factor(Y) ~ Group , data= dat, method = 'logistic', Hess = T)
summary(mod)
mod$coefficients # can't estimate an intercept here
Beta
mod$zeta
thr

library(ordinal)
mod.clm <- clm(as.factor(Y) ~ Group, data = dat)
summary(mod.clm)
mod.clm$beta
# Compute odds ratios by hand
# Examine the proportional odds assumption
# Notice that it's not exact
tab <- addmargins(xtabs(~ Y + Group, data = dat), 1)
#
# >0
G1 <- (tab['Sum', 'Group_1'] - tab['0','Group_1'])/tab['0', 'Group_1']
G2 <- (tab['Sum', 'Group_2'] - tab['0','Group_2'])/tab['0', 'Group_2']
G2/G1
exp(mod.clm$beta)
#
# > 1 (includes 0 and 1 categories)
G1 <- (tab['Sum', 'Group_1'] - sum(tab[c('0', '1'), 'Group_1']))/
  sum(tab[c('0', '1'), 'Group_1'])
G2 <- (tab['Sum', 'Group_2'] - sum(tab[c('0', '1'), 'Group_2']))/
  sum(tab[c('0', '1'), 'Group_2'])
G2/G1
exp(mod.clm$beta)
#
# > 2 (includes 0, 1, 2)
G1 <- (tab['Sum', 'Group_1'] - sum(tab[c('0', '1', '2'), 'Group_1']))/
  sum(tab[c('0', '1', '2'), 'Group_1'])
G2 <- (tab['Sum', 'Group_2'] - sum(tab[c('0', '1', '2'), 'Group_2']))/
  sum(tab[c('0', '1', '2'), 'Group_2'])
G2/G1
exp(mod.clm$beta)
#
# > 3 (includes 0, 1, 2, 3)
G1 <- (tab['Sum', 'Group_1'] - sum(tab[c('0', '1', '2', '3'), 'Group_1']))/
  sum(tab[c('0', '1', '2', '3'), 'Group_1'])
G2 <- (tab['Sum', 'Group_2'] - sum(tab[c('0', '1', '2', '3'), 'Group_2']))/
  sum(tab[c('0', '1', '2', '3'), 'Group_2'])
G2/G1
exp(mod.clm$beta)
#

Custom Estimation R code

Ordinal Logistic Estimation

# Custom written estimation routine:
ord_loglike <- function(vP, X, Y, k){

  N <- nrow(X)
  Y <- Y + 1 # Must start at 1!
  XX <- X[ , -1, drop = F] # drop intercept
  XB <- XX %*% vP[1] # Not sure why polr parameterizes it this way
  XB <- XB %*% matrix(1, nrow = ncol(XB), ncol = k)
  eta <- matrix(vP[-1], nrow = nrow(XB), ncol = k, byrow = T) - XB
  p <- exp(eta)/(1 + exp(eta))
  p <- cbind(0, p, 1)

  like <- p[cbind(1:N, Y + 1)] - p[cbind(1:N, Y)]  
  loglike <- log(like)
  loglike <- -1*sum(loglike, na.rm = F)
  return(loglike)

}


# optimize
# Set starting values:
Y <- dat$Y + 1
intercepts.hat = matrix(NA, nrow = 1, ncol = length(unique(Y)) - 1)
  for (score in 2:length(unique(Y))){
    intercepts.hat[ , score -1] = log(mean(Y >= score, na.rm =T))
  }

# vP is the vector of parameters passed to the optim() function
vP <- c(1, -1*intercepts.hat)
# optim() function optimizes the objective function
out <- optim(par = vP, 
             hessian = T,
             fn = ord_loglike, 
             method = 'BFGS', 
             X = X, 
             Y = dat$Y,  
             k = length(intercepts.hat))

# Compare Generating Parameter to R package estimate and the custom code estimate
cbind('polr() estimate' = c(mod$coefficients, mod$zeta), 
      'clm() estimate' = c(mod.clm$beta, mod.clm$alpha), 
      'custom code' = out$par) 

# Standard errors:
SE <- sqrt(diag(solve(out$hessian)))
zval <- out$par/SE
2*(1-pnorm(zval[1]))
summary(mod.clm)

Simulation Study, Check Type I error

The ordinal R package uses Z-tests, compare the Type I error control of that to the t-distribution.

rm(list = ls())
gc()

library(ordinal)


N = 100 #this should be divisible by however many groups you use!
number.groups <- 2
number.timepoints <- 1
number.repl <- 1000
out <- vector()

for (repl in 1:number.repl){

  set.seed(6242021 + repl)

dat <- data.frame(
                  'USUBJID' = rep(paste0('Subject_', formatC(1:N, width = 4, flag = '0')), length.out= N*number.timepoints),
                  'Group' = rep(paste0('Group_', 1:number.groups), length.out = N*number.timepoints),
                  stringsAsFactors=F)


X <- model.matrix( ~ Group  , data = dat)
Beta <- matrix(0, nrow = ncol(X), dimnames=list(colnames(X), 'param'))
Beta[] <- c(0.0, 0)
XB <- X %*% Beta
thr <- c(0, 1, 2, 3)
eta <- matrix(thr, nrow = nrow(XB), ncol = length(thr), byrow = T) -
  matrix(XB, nrow = nrow(XB), ncol = length(thr), byrow = F)

p <- exp(eta)/(1 + exp(eta))
dat$p <- p
dat$Y <- apply(runif(n = N)  > p, 1, sum)


library(ordinal)
mod.clm <- clm(as.factor(Y) ~ Group, data = dat)
mod.out <- summary(mod.clm)
pval.z <- mod.out$coef['GroupGroup_2', 'Pr(>|z|)']

tval <- mod.out$coef['GroupGroup_2', 'Estimate']/
  mod.out$coef['GroupGroup_2', 'Std. Error']
#tval #yes, same as the zvalue reported out
#2*(1-pnorm(tval)) # This is model reported out, use t-distribution:
pval.t <- 2*(1-pt(q = tval, df = N - length(coef(mod.clm))))
out <- rbind(out, c(pval.z, pval.t))
cat('Replication: ', repl, '\n')

}


colMeans(out < 0.05)