This document provides some hopefully useful examples of programs used for typical epidemiological studies. The examples all use a public version of Framingham data. The data in this version of Framingham has been adjusted to ensure confidentiality of individuals and are not useful for science.
The examples below are meant only to show use of R for studies, not to provide useful results. There is no attempt to explain statistical analysis, only show procedure.
Always start a project by defining a working directory and load necessary libraries. Some libraries may need installation.
For "real" programming is is very wise to separate data management from result production in separate files or in separate chunks og program. This ensures that a program run from start to end is reproducible. That rule has been violated in this vignette to keep relevant sections together.
# Typical studies library(heaven) # if heaven is not installed use: "devtools::install_github('tagteam/heaven)" library(data.table) library(Publish) library(survival) # library(survminer) # for plotting Cox assumptions library(rms) # for splines library(riskRegression) # prediction library(gtools) library(knitr)
knitr::opts_chunk$set(fig.width=5, fig.height=5) # The following lines used because I cannot make knitr load data using "data()"
Useful householding functions
# Housholding functions for data data(Framingham,package="heaven") str(Framingham) # lists variables, types and first values names(Framingham) # String of names head(Framingham) # First 5 records head(Framingham,25) # First 25 records tail(Framingham) # last 5 records class(Framingham) # Type dim(Framingham) # dimensions setDT(Framingham) # Change to data.table class(Framingham) # Now data.table AND data.frame # setDF(Framingham) will return to simple data.frame
Analysis of survival based on initial blood pressure and covariates
First find first observation each individual
setkey(Framingham,randid) # Creates an index and sorts the database Fram1 <- Framingham[,.SD[1],by=randid] # first record for each randid dim(Fram1) # Now 4434 records lazyFactorCoding(Fram1) # provides the folloiwng list of factor conversions that has been # copied to the program - some are left as comments # Factor coding Fram1[,sex:=factor(sex,levels=c("1","2"),labels=c("Male","Female"))] Fram1[,cursmoke:=factor(cursmoke,levels=c("0","1"),labels=c("0","1"))] Fram1[,diabetes:=factor(diabetes,levels=c("0","1"),labels=c("0","1"))] Fram1[,bpmeds:=factor(bpmeds,levels=c("0","1"),labels=c("0","1"))] Fram1[,educ:=factor(educ,levels=c("1","2","3","4"),labels=c("1","2","3","4"))] Fram1[,prevchd:=factor(prevchd,levels=c("0","1"),labels=c("?????0","1"))] Fram1[,prevap:=factor(prevap,levels=c("0","1"),labels=c("0","1"))] Fram1[,prevmi:=factor(prevmi,levels=c("0","1"),labels=c("0","1"))] Fram1[,prevstrk:=factor(prevstrk,levels=c("0","1"),labels=c("0","1"))] Fram1[,prevhyp:=factor(prevhyp,levels=c("0","1"),labels=c("0","1"))] #Fram1[,time:=factor(time,levels=c("0"),labels=c("0"))] #Fram1[,period:=factor(period,levels=c("1"),labels=c("1"))] #Fram1[,hdlc:=factor(hdlc,levels=c(""),labels=c(""))] #Fram1[,ldlc:=factor(ldlc,levels=c(""),labels=c(""))] #Fram1[,death:=factor(death,levels=c("0","1"),labels=c("0","1"))] #Fram1[,angina:=factor(angina,levels=c("0","1"),labels=c("0","1"))] #Fram1[,hospmi:=factor(hospmi,levels=c("0","1"),labels=c("0","1"))] #Fram1[,MI_FCHD:=factor(MI_FCHD,levels=c("0","1"),labels=c("0","1"))] #Fram1[,anychd:=factor(anychd,levels=c("0","1"),labels=c("0","1"))] #Fram1[,stroke:=factor(stroke,levels=c("0","1"),labels=c("0","1"))] #Fram1[,cvd:=factor(cvd,levels=c("0","1"),labels=c("0","1"))] Fram1[,hyperten:=factor(hyperten,levels=c("0","1"),labels=c("0","1"))] # Recode sysbp and age to obtain readable estimates in model Fram1[,':='(sysbp=sysbp/10,age=age/10)]
Make variable for systolic blood pressure above/below median
Fram1[,sysbp2:=factor(sysbp>median(sysbp),levels=c(TRUE,FALSE),labels=c("above","below"))] table(Fram1[,sysbp2]) Fram1 <- Fram1[,.SD,.SDcols=c("sysbp2","sysbp","sex","age","diabetes","heartrte","educ", "totchol","prevmi","timedth","death","timecvd","cvd")]
tab1 <- utable(sysbp2~sysbp+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1) publish(tab1) # to export to excell, use the following: write.csv2(summary(tab1),file="tab1.csv")
# Missing values not good for analysis Fram1 <- Fram1[complete.cases(Fram1),] fit <- coxph(Surv(timedth,death)~sysbp+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1) summary(fit) # Fast and primitive
Model assumptions (proportional hazards, absence of interaction, linearity) are difficult to assess with a formal statistical test because the order of assessment matters and because in large data sets even small deviations are significant whereas in small data sets even groft deviations can be insignificant.
Useful investigation of the proportional hazard assumptions of the model:
Test of linearity for sysbp: Does it increase information to add a restricted cubic spline of sysbp?
fit2 <- coxph(Surv(timedth,death)~sysbp+rcs(sysbp)+sex+age+diabetes+heartrte+educ+totchol+prevmi,data=Fram1) anova(fit, fit2, test = "Chisq")
reg <- summary(regressionTable(fit))$rawTable reg <- summary(regressionTable(fit,factor.reference='inline'))$rawTable # Add a column of relevant variable explanations c1 <- c("Blood pressure systolic","Sex","Age","Diabetes","Heart rate","Education","","","Cholesterol", "Prior MI") reg <- cbind(c1,reg) # Nice format of p reg$Pvalue <- format.pval(reg$Pvalue,eps=0.001,digits = 3) plotConfidence(x=reg[,c(4:6)],labels=reg[,c(1,3,7)]) dev.off() # Maybe not necessary, but resets graphic parameters
Analysis to time to cvd - with and without competing risk of death from other causes
fit2 <- prodlim(Hist(timecvd,cvd)~sysbp2,data=Fram1) plot(fit2,type="cuminc",timeconverter = "days2years",atrisk=FALSE,cex=0.5) # Problem: Cumulative incidence is not a probability #Recode cvd to have competing risk of death=2 table(Fram1$cvd,Fram1$death) # Cross table prior to change Fram1[death==1 & cvd !=1, ':='(cvd=2, timecvd=timedth) ] table(Fram1$cvd,Fram1$death) # Cross table after change fit3 <- prodlim(Hist(timecvd,cvd)~sysbp2,data=Fram1) #with competing risk plot(fit3,type="cuminc",timeconverter = "days2years",atrisk=FALSE,cex=0.5) dev.off() # Maybe not necessary, but resets graphic parameters
fit4 <- coxph(Surv(timecvd,cvd==1)~sysbp+sex+age+diabetes+heartrte+ educ+totchol+prevmi,data=Fram1) fit4 <- regressionTable(fit4) plot(fit4) dev.off() # Maybe not necessary, but resets graphic parameters
So what is sysbp worth for prediction?
fit_sbp <- CSC(Hist(timecvd,cvd)~sysbp+sex+age+diabetes+heartrte+ educ+totchol+prevmi,data=Fram1) fit_nsbp <- CSC(Hist(timecvd,cvd)~sex+age+diabetes+heartrte+ educ+totchol+prevmi,data=Fram1) # Score for difference fit_dif <- Score(list("+sbp"=fit_sbp,"-sbp"=fit_nsbp), formula=Hist(timecvd,cvd)~sysbp+sex+age+diabetes+heartrte+ educ+totchol+prevmi, data=Fram1,times=10*365,nullModel = FALSE, plots = "ROC",summary="riskQuantile") fit_dif boxplot(fit_dif) # Gennemsnitlig forskel for grupper plotROC(fit_dif) # ROC kurve # Plot of individual prediction with and without sysbp Pred_with_sysbp <- predict(fit_sbp, times=10*365,newdata=Fram1,cause=1) Pred_without_sysbp <- predict(fit_nsbp, times=10*365,newdata=Fram1,cause=1) plot(Pred_with_sysbp$absRisk,Pred_without_sysbp$absRisk, main="10 year risk of cvd", xlab="Predicted risk (clinical variables + sysbp)", ylab="Predicted risk (clinical variables)") abline(0,1,col="red",lwd=2) dev.off() # Maybe not necessary, but resets graphic parameters
Prepare to match on diabetes
#Find date of first occurrence of diabetes diab <- Framingham[diabetes==1,c("randid","time","diabetes")] diab <- diab[,.SD[1],by=randid] setnames(diab,c("time","diabetes"),c("timediab","diabetestime")) Fram1 <- Framingham[,.SD,.SDcols=c("randid","time","sysbp","sex","age","diabetes","heartrte","educ", "totchol","prevmi","timedth","death","timecvd","cvd")] Fram1 <- Fram1[,.SD[1],by="randid"] Fram1 <- merge(Fram1,diab,by="randid",all=TRUE) Fram1[is.na(diabetestime),diabetestime:=0] # for exact matching continuous variables needs to ba sutiably rounded Fram1[,age_kat:=round(age/5)]
Matching on sex, age (5 year) and prior cvd Combines matching on fixe parameters with requirement of controls to be alive at time of matching. Further, it is required to match on the time dependent variable cvd
# Risk set matching with reuse of cases (control prior to case) and reuse of # controls - more cases get controls Fram1 <- Fram1[complete.cases(Fram1)] Match <- riskSetMatch("randid","diabetestime",c("age_kat","sex"),Fram1,2,caseIndex= "timediab",controlIndex="timedth",dateterms = "timecvd", ,reuseCases=TRUE,reuseControls=TRUE) matchReport(Match,"randid","diabetestime","caseid") # Cox model fit <- coxph(Surv(timediab,timedth,death)~diabetestime+strata(caseid),data=Match) summary(regressionTable(fit))
Prepare data for time dependent analysis
# Limited basic dataset fraom Framinham with firt record for each randid - and just some variable setkeyv(Framingham,c("randid","time")) Fram2 <- Framingham[,.SD[1],by="randid", .SDcols=c("time","sex","age","educ","timedth","death")] Fram2[,':='(inn=time,out=timedth)] # new variables for intervals - to avoid confusion # Comorbidities # Diabetes as time dependent variable: diab <- Framingham[diabetes==1,c("randid","time")] diab <- diab[,.SD[1],by="randid"] setnames(diab,"time","diabetes") # changes name of time to "diabetes" # cvd as time dependent variable cvd <- Framingham[cvd==1,c("randid","timecvd")] cvd <- cvd[,.SD[1],by="randid"] setnames(cvd,"timecvd","cvd") # Combine the chosen comorbidites in a matrix like data.table where the first column # is ID and following columns dates of time dependent comorbidites comorb <- merge(diab,cvd,by="randid",all=TRUE) # if there are multiple dataset, the following allows multiple merges in one step: #comorb <-Reduce(function(x, y) merge(x, y,by="randid", all=TRUE), list(diab,cvd)) # Periodic variables # Create a data.table with columns defining ID, start-period, end-period, a value for period, # and name of the period variable. Periods <- read.csv(text="randid,start,end,value,name") #Make empty data.frame setDT(Periods) for (var in c("sysbp","heartrte","totchol")){ temp <- Framingham[,.SD,.SDcols=c("randid","time",var,"timedth")] temp[,end:=shift(time,type="lead"),by=randid] temp[is.na(end),end:=timedth] temp[,timedth:=NULL] setnames(temp,c("time",var),c("start","value")) temp[,name:=var] Periods <- rbind(Periods,temp) # Add the next period sequence to the output data } Periods[,end:=as.integer(end)]
With the data prepared a time dependent analysis can be performed. Both Cox and Poisson are possible, but the Cox-regression is very slow and therefore left as a comment.
#tFram2 <- Fram2[randid==2448] #Periods[,value:=as.character(value)] # Split by comorbidities Fram2 <- lexisTwo(Fram2,comorb,c("randid","inn","out","death") ,c("diabetes","cvd")) F#Split on variables with periods Fram2 <- lexisFromTo(Fram2,Periods,c("randid","inn","out","death"), c("randid","start","end","value","name")) #Split on age Fram2[,birth:=-age*365.25] Fram2 <- lexisSeq(Fram2,c("randid","inn","out","death"),varname = 'birth', splitvector = c(0,100*365,10*365),format="seq",value = "age_time") #Split on calender time Fram2 <- lexisSeq(Fram2,c("randid","inn","out","death"),format="seq",varname = NULL, splitvector = c(0,100*365,10*365),value = "kal_time") head(Fram2,20) Fram2[,age_time:=age_time*10] # Adjust age to be close to real age by multiplying with "by" # Time dependent Cox regression - with age as underlying time scale # Define age for start/end intervals # Section turned to comment, because it takes a LONG time # Fram2[,':='(age_in=age+(inn/365),age_out=age+(end/365))] #fit_cox <- coxph(Surv(age_inn,age_out,death)~kal_time+age_time+totchol+sysbp+heartrte+cvd+diabetes+ # sex,data=Fram2) #plot(regressionTable(fit_cox)) #Time dependent Poisson regression # Step 1 calculate duration of each time interval Fram2[,duration:=out-inn] Fram2 <- Fram2[complete.cases(Fram2)] # In real analysis the problem should rather be solved #Change some variables to quartiles - poisson easiets to understand with categorial variables Fram2[,':='(sysbp=quantcut(as.numeric(sysbp)), totchol=quantcut(as.numeric(totchol)), heartrte=quantcut(as.numeric(heartrte)), kal_time=factor(kal_time), age_time=quantcut(age_time))] Fram2[is.na(heartrte),heartrte:="(69,75]"] #Fraud!! - but the model has to work! Fram2[is.na(totchol),totchol:="[112,211]"] # More Fraud # Collapse on covariates of interest, that sum death and duration by all relevant variables Fram3 <- Fram2[,list(death_s=sum(death,na.rm=T),duration_s=sum(duration,na.rm=T)), by=list(kal_time,age_time,totchol,sysbp,heartrte,cvd,diabetes,sex)] Fram3 <- Fram3[duration_s==0,duration_s:=1] dim(Fram3) head(Fram3) table(Fram3[,death_s]) fit_p <- glm(death_s~kal_time+age_time+totchol+sysbp+heartrte+cvd+diabetes+sex+offset(log(duration_s)), family = poisson(link = "log"),data=Fram3) regressionTable(fit_p) plot(regressionTable(fit_p),plot.log="x")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.