In tagteam/heaven: Data Preparation Routines for Medical Registry Data

This document provides some hopefully useful examples of programs used for typical epidemiological studies. The examples all use a public version of Framingham data. The data in this version of Framingham has been adjusted to ensure confidentiality of individuals and are not useful for science.

The examples below are meant only to show use of R for studies, not to provide useful results. There is no attempt to explain statistical analysis, only show procedure.

Always start a project by defining a working directory and load necessary libraries. Some libraries may need installation.

For "real" programming is is very wise to separate data management from result production in separate files or in separate chunks og program. This ensures that a program run from start to end is reproducible. That rule has been violated in this vignette to keep relevant sections together.

# Typical studies
library(heaven)
# if heaven is not installed use: "devtools::install_github('tagteam/heaven)"
library(data.table)
library(Publish)
library(survival)
# library(survminer) # for plotting Cox assumptions
library(rms) # for splines
library(riskRegression) # prediction
library(gtools)
library(knitr)

knitr::opts_chunk$set(fig.width=5, fig.height=5) 
# The following lines used because I cannot make knitr load data using "data()"

Useful householding functions

# Housholding functions for data
data(Framingham,package="heaven")
str(Framingham) # lists variables, types and first values
names(Framingham) # String of names
head(Framingham) # First 5 records
head(Framingham,25) # First 25 records
tail(Framingham) # last 5 records
class(Framingham) # Type
dim(Framingham) # dimensions

setDT(Framingham) # Change to data.table
class(Framingham) # Now data.table AND data.frame
# setDF(Framingham) will return to simple data.frame

Data management

Analysis of survival based on initial blood pressure and covariates

First find first observation each individual

setkey(Framingham,randid) # Creates an index and sorts the database
Fram1 <- Framingham[,.SD[1],by=randid] # first record for each randid
dim(Fram1) # Now 4434 records
lazyFactorCoding(Fram1) # provides the folloiwng list of factor conversions that has been 
  # copied to the program - some are left as comments
# Factor coding
Fram1[,sex:=factor(sex,levels=c("1","2"),labels=c("Male","Female"))]
Fram1[,cursmoke:=factor(cursmoke,levels=c("0","1"),labels=c("0","1"))]
Fram1[,diabetes:=factor(diabetes,levels=c("0","1"),labels=c("0","1"))]
Fram1[,bpmeds:=factor(bpmeds,levels=c("0","1"),labels=c("0","1"))]
Fram1[,educ:=factor(educ,levels=c("1","2","3","4"),labels=c("1","2","3","4"))]
Fram1[,prevchd:=factor(prevchd,levels=c("0","1"),labels=c("?????0","1"))]
Fram1[,prevap:=factor(prevap,levels=c("0","1"),labels=c("0","1"))]
Fram1[,prevmi:=factor(prevmi,levels=c("0","1"),labels=c("0","1"))]
Fram1[,prevstrk:=factor(prevstrk,levels=c("0","1"),labels=c("0","1"))]
Fram1[,prevhyp:=factor(prevhyp,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,time:=factor(time,levels=c("0"),labels=c("0"))]
#Fram1[,period:=factor(period,levels=c("1"),labels=c("1"))]
#Fram1[,hdlc:=factor(hdlc,levels=c(""),labels=c(""))]
#Fram1[,ldlc:=factor(ldlc,levels=c(""),labels=c(""))]
#Fram1[,death:=factor(death,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,angina:=factor(angina,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,hospmi:=factor(hospmi,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,MI_FCHD:=factor(MI_FCHD,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,anychd:=factor(anychd,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,stroke:=factor(stroke,levels=c("0","1"),labels=c("0","1"))]
#Fram1[,cvd:=factor(cvd,levels=c("0","1"),labels=c("0","1"))]
Fram1[,hyperten:=factor(hyperten,levels=c("0","1"),labels=c("0","1"))]
# Recode sysbp and age to obtain readable estimates in model
Fram1[,':='(sysbp=sysbp/10,age=age/10)]

Make variable for systolic blood pressure above/below median

Fram1[,sysbp2:=factor(sysbp>median(sysbp),levels=c(TRUE,FALSE),labels=c("above","below"))]
table(Fram1[,sysbp2])
Fram1 <- Fram1[,.SD,.SDcols=c("sysbp2","sysbp","sex","age","diabetes","heartrte","educ",
                              "totchol","prevmi","timedth","death","timecvd","cvd")]

Demography

tab1 <- utable(sysbp2~sysbp+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1)
publish(tab1)
# to export to excell, use the following:
write.csv2(summary(tab1),file="tab1.csv")

Basic Cox model with focus on Blood pressure

# Missing values not good for analysis
Fram1 <- Fram1[complete.cases(Fram1),]
fit <- coxph(Surv(timedth,death)~sysbp+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1)
summary(fit) # Fast and primitive

Assumptions for Cox

Model assumptions (proportional hazards, absence of interaction, linearity) are difficult to assess with a formal statistical test because the order of assessment matters and because in large data sets even small deviations are significant whereas in small data sets even groft deviations can be insignificant.

Useful investigation of the proportional hazard assumptions of the model:

Stop time by artificially censoring and compare beta coefficients with full followup
Stratify the baseline hazard for categorical variables
Use timereg package's martingal residual plots

Test of linearity for sysbp: Does it increase information to add a restricted cubic spline of sysbp?

fit2 <- coxph(Surv(timedth,death)~sysbp+rcs(sysbp)+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1)
anova(fit, fit2, test = "Chisq")

Nice table output

reg <- summary(regressionTable(fit))$rawTable
reg <- summary(regressionTable(fit,factor.reference='inline'))$rawTable
# Add a column of relevant variable explanations
c1 <- c("Blood pressure systolic","Sex","Age","Diabetes","Heart rate","Education","","","Cholesterol",
       "Prior MI")
reg <- cbind(c1,reg)
# Nice format of p
reg$Pvalue <- format.pval(reg$Pvalue,eps=0.001,digits = 3)
plotConfidence(x=reg[,c(4:6)],labels=reg[,c(1,3,7)])
dev.off() # Maybe not necessary, but resets graphic parameters

Competing risk

Analysis to time to cvd - with and without competing risk of death from other causes

fit2 <- prodlim(Hist(timecvd,cvd)~sysbp2,data=Fram1)
plot(fit2,type="cuminc",timeconverter = "days2years",atrisk=FALSE,cex=0.5)
# Problem: Cumulative incidence is not a probability

#Recode cvd to have competing risk of death=2
table(Fram1$cvd,Fram1$death) # Cross table prior to change
Fram1[death==1 & cvd !=1, ':='(cvd=2, timecvd=timedth) ]
table(Fram1$cvd,Fram1$death) # Cross table after change
fit3 <- prodlim(Hist(timecvd,cvd)~sysbp2,data=Fram1) #with competing risk
plot(fit3,type="cuminc",timeconverter = "days2years",atrisk=FALSE,cex=0.5)
dev.off() # Maybe not necessary, but resets graphic parameters

Cox for cvd

fit4 <- coxph(Surv(timecvd,cvd==1)~sysbp+sex+age+diabetes+heartrte+
                    educ+totchol+prevmi,data=Fram1)
fit4 <- regressionTable(fit4)
plot(fit4)
dev.off() # Maybe not necessary, but resets graphic parameters

Prediction

So what is sysbp worth for prediction?

Models for cvd with and without sysbp

fit_sbp <- CSC(Hist(timecvd,cvd)~sysbp+sex+age+diabetes+heartrte+
                educ+totchol+prevmi,data=Fram1)
fit_nsbp <- CSC(Hist(timecvd,cvd)~sex+age+diabetes+heartrte+
                 educ+totchol+prevmi,data=Fram1)
# Score for difference
fit_dif <- Score(list("+sbp"=fit_sbp,"-sbp"=fit_nsbp),
          formula=Hist(timecvd,cvd)~sysbp+sex+age+diabetes+heartrte+
            educ+totchol+prevmi, data=Fram1,times=10*365,nullModel = FALSE,
          plots = "ROC",summary="riskQuantile")
fit_dif
boxplot(fit_dif) # Gennemsnitlig forskel for grupper
plotROC(fit_dif) # ROC kurve

# Plot of individual prediction with and without sysbp
Pred_with_sysbp <- predict(fit_sbp, times=10*365,newdata=Fram1,cause=1)
Pred_without_sysbp <- predict(fit_nsbp, times=10*365,newdata=Fram1,cause=1)

plot(Pred_with_sysbp$absRisk,Pred_without_sysbp$absRisk,
     main="10 year risk of cvd",
     xlab="Predicted risk (clinical variables + sysbp)",
     ylab="Predicted risk (clinical variables)")
abline(0,1,col="red",lwd=2)
dev.off() # Maybe not necessary, but resets graphic parameters

Matching

Prepare to match on diabetes

#Find date of first occurrence of diabetes
diab <- Framingham[diabetes==1,c("randid","time","diabetes")]
diab <- diab[,.SD[1],by=randid]
setnames(diab,c("time","diabetes"),c("timediab","diabetestime"))

Fram1 <- Framingham[,.SD,.SDcols=c("randid","time","sysbp","sex","age","diabetes","heartrte","educ",
                              "totchol","prevmi","timedth","death","timecvd","cvd")]
Fram1 <- Fram1[,.SD[1],by="randid"]
Fram1 <- merge(Fram1,diab,by="randid",all=TRUE)
Fram1[is.na(diabetestime),diabetestime:=0]
# for exact matching continuous variables needs to ba sutiably rounded
Fram1[,age_kat:=round(age/5)]

Matching on sex, age (5 year) and prior cvd Combines matching on fixe parameters with requirement of controls to be alive at time of matching. Further, it is required to match on the time dependent variable cvd

# Risk set matching with reuse of cases (control prior to case) and reuse of 
# controls - more cases get controls
Fram1 <- Fram1[complete.cases(Fram1)]
Match <- riskSetMatch("randid","diabetestime",c("age_kat","sex"),Fram1,2,caseIndex=
                       "timediab",controlIndex="timedth",dateterms = "timecvd",
                     ,reuseCases=TRUE,reuseControls=TRUE)
matchReport(Match,"randid","diabetestime","caseid")  
# Cox model 
fit <- coxph(Surv(timediab,timedth,death)~diabetestime+strata(caseid),data=Match)
summary(regressionTable(fit))

Time dependent analysis

Prepare data for time dependent analysis

# Limited basic dataset fraom Framinham with firt record for each randid - and just some variable
setkeyv(Framingham,c("randid","time"))
Fram2 <- Framingham[,.SD[1],by="randid",
  .SDcols=c("time","sex","age","educ","timedth","death")]
Fram2[,':='(inn=time,out=timedth)] # new variables for intervals - to avoid confusion
# Comorbidities
# Diabetes as time dependent variable:
diab <- Framingham[diabetes==1,c("randid","time")]
diab <- diab[,.SD[1],by="randid"]
setnames(diab,"time","diabetes") # changes name of time to "diabetes"
# cvd as time dependent variable
cvd <- Framingham[cvd==1,c("randid","timecvd")]
cvd <- cvd[,.SD[1],by="randid"]
setnames(cvd,"timecvd","cvd")
# Combine the chosen comorbidites in a matrix like data.table where the first column
# is ID and following columns dates of time dependent comorbidites
comorb <- merge(diab,cvd,by="randid",all=TRUE)
# if there are multiple dataset, the following allows multiple merges in one step:
#comorb <-Reduce(function(x, y) merge(x, y,by="randid", all=TRUE), list(diab,cvd))

# Periodic variables
# Create a data.table with columns defining ID, start-period, end-period, a value for period, 
# and name of the period variable.
Periods <- read.csv(text="randid,start,end,value,name") #Make empty data.frame
setDT(Periods)
for (var in c("sysbp","heartrte","totchol")){
  temp <- Framingham[,.SD,.SDcols=c("randid","time",var,"timedth")]
  temp[,end:=shift(time,type="lead"),by=randid]
  temp[is.na(end),end:=timedth]
  temp[,timedth:=NULL]
  setnames(temp,c("time",var),c("start","value"))
  temp[,name:=var]
  Periods <- rbind(Periods,temp) # Add the next period sequence to the output data
}
Periods[,end:=as.integer(end)]

With the data prepared a time dependent analysis can be performed. Both Cox and Poisson are possible, but the Cox-regression is very slow and therefore left as a comment.

#tFram2 <- Fram2[randid==2448]
#Periods[,value:=as.character(value)]
# Split by comorbidities
Fram2 <- lexisTwo(Fram2,comorb,c("randid","inn","out","death")
                  ,c("diabetes","cvd"))
F#Split on variables with periods 
Fram2 <- lexisFromTo(Fram2,Periods,c("randid","inn","out","death"),
                     c("randid","start","end","value","name"))
#Split on age
Fram2[,birth:=-age*365.25]
Fram2 <- lexisSeq(Fram2,c("randid","inn","out","death"),varname = 'birth',
                  splitvector = c(0,100*365,10*365),format="seq",value = "age_time")
#Split on calender time
Fram2 <- lexisSeq(Fram2,c("randid","inn","out","death"),format="seq",varname = NULL,
                  splitvector = c(0,100*365,10*365),value = "kal_time")
head(Fram2,20)
Fram2[,age_time:=age_time*10] # Adjust age to be close to real age by multiplying with "by"

# Time dependent Cox regression - with age as underlying time scale
# Define age for start/end intervals
# Section turned to comment, because it takes a LONG time
# Fram2[,':='(age_in=age+(inn/365),age_out=age+(end/365))]
#fit_cox <- coxph(Surv(age_inn,age_out,death)~kal_time+age_time+totchol+sysbp+heartrte+cvd+diabetes+
#                   sex,data=Fram2)
#plot(regressionTable(fit_cox))

#Time dependent Poisson regression
# Step 1 calculate duration of each time interval
Fram2[,duration:=out-inn]
Fram2 <- Fram2[complete.cases(Fram2)] # In real analysis the problem should rather be solved
#Change some variables to quartiles - poisson easiets to understand with categorial variables
Fram2[,':='(sysbp=quantcut(as.numeric(sysbp)),
            totchol=quantcut(as.numeric(totchol)),
            heartrte=quantcut(as.numeric(heartrte)),
            kal_time=factor(kal_time),
            age_time=quantcut(age_time))]
Fram2[is.na(heartrte),heartrte:="(69,75]"] #Fraud!! - but the model has to work!
Fram2[is.na(totchol),totchol:="[112,211]"] # More Fraud
# Collapse on covariates of interest, that sum death and duration by all relevant variables
Fram3 <- Fram2[,list(death_s=sum(death,na.rm=T),duration_s=sum(duration,na.rm=T)),
               by=list(kal_time,age_time,totchol,sysbp,heartrte,cvd,diabetes,sex)]
Fram3 <- Fram3[duration_s==0,duration_s:=1]
dim(Fram3)               
head(Fram3)
table(Fram3[,death_s])
fit_p <- glm(death_s~kal_time+age_time+totchol+sysbp+heartrte+cvd+diabetes+sex+offset(log(duration_s)), 
             family = poisson(link = "log"),data=Fram3)
regressionTable(fit_p)
plot(regressionTable(fit_p),plot.log="x")