SupplementSEER.R
In SEERaBomb: SEER and Atomic Bomb Survivor Data Analysis Tools

# NOTE: This script assumes that the SEER data is in /data/SEER.  Please change
# this line to match your data location if it differs
.seerHome="~/data/SEER" 
rm(list=ls()) # note: dot variables defined above persist through such cleanings
# install.packages("RSQLite")
# install.packages("bbmle")
# install.packages("ggplot2")
# install.packages("plyr")
library(RSQLite)
m=dbDriver("SQLite")
library(plyr)
library(bbmle)
library(ggplot2)
graphics.off()
if(length(grep("linux",R.Version()$os))) windows <- function( ... ) X11( ... )
if(length(grep("darwin",R.Version()$os))) windows <- function( ... ) quartz( ... )
windows(width=8,height=7)
# windows(height=7,width=8,xpos=-100,ypos=-100) #need to control the device size to
                                    # make things work across computers/screen sizes
# Note: Figure 1 was drawn using Adobe Illustrator so it is not reproduced here.
######################   
# Code 9863 versus 205.1 trends over the decades: Figure 2
con=dbConnect(m,dbname=file.path(.seerHome,"73/all.db"))
# con=dbConnect(m,dbname=file.path(.seerHome,"mrgd/cancDef.db"))
# dbListFields(con,"canc")
#######################################################################
dbListTables(con)
dbListFields(con,"pops")
dbListFields(con,"lymyleuk")
pop=dbGetQuery(con,"SELECT * from pops where popage>5 and popage<19")
head(pop)
(pop<-ddply(pop, .(popage,popsex,popyear), summarise,py=sum(population)))
pop$dec= cut(pop$popyear,breaks=c(1972,1984,1996,2013),labels=c("73to84","85to96","97to11"))
(pop<-ddply(pop, .(popage,popsex,dec), summarise,py=sum(py)))
head(pop,20)

d=dbGetQuery(con, 
#      "SELECT * from lymyleuk where histo2=9863 and seqnum<2 and agerec>5 and agerec<19")
    "SELECT * from lymyleuk where cancer='CML' and seqnum<2")
head(d)
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>5&d$agerec<19,]
d=d[!is.na(d$agerec),]

(d<-ddply(d, .(agerec,sex,yrdx), summarise,cases=length(agerec))) 
d$decade= cut(d$yrdx,breaks=c(1972,1984,1996,2013),labels=c("73to84","85to96","97to11"))
head(d)
(d<-ddply(d, .(agerec,sex,decade), summarise,cases=sum(cases)))
head(cbind(d,pop)) # just to see that they match up
d=cbind(d,py=pop[,"py"]) # only take the non-redundant py column
head(d)

d9=dbGetQuery(con, 
        "SELECT * from lymyleuk where ICD9=2051 and seqnum<2")
d9$agerec=as.numeric(cut(d9$agedx,breaks=c(0,1,seq(5,85,5),130)))
d9=d9[d9$agerec>5&d9$agerec<19,]
d9=d9[!is.na(d9$agerec),]
d9<-ddply(d9, .(sex,agerec,yrdx), summarise,cases=length(agerec))
d9$decade= cut(d9$yrdx,breaks=c(1972,1984,1996,2013),labels=c("73to84","85to96","97to11"))
d9<-ddply(d9, .(agerec,sex,decade), summarise,cases=sum(cases))
d9=cbind(d9,py=pop[,"py"])
head(d9)
(d=cbind(rbind(d,d9),code=gl(2,dim(d9)[1],labels=c("9863","205.1"))) ) 
head(d)
d=transform(d,incid=1e6*cases/py)
d$sex=factor(d$sex,labels=c("Male","Female"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)
names(d)[3]="Decade" # make legend start with capital
(p <- ggplot(d,aes(x=age,y=incid,shape=Decade,col=Decade))+geom_point(size=5)
 + labs(title="CML Incidence",x="Age (years)",
        y=expression(paste("Cases per ",10^6," Person-Years")))    
 + scale_y_log10(limits=c(3,200)) )
(p=p + facet_grid(code ~ sex))
mythem=theme( 
  plot.title = element_text(size = rel(1.5)),
  axis.title = element_text(size = rel(1.4)),
  axis.text = element_text(size = rel(1.2)),
  strip.text = element_text(size = rel(1.2))  )
p=p+mythem
(p=p+theme(legend.position = c(.60, .86),  #           legend.direction = 'vertical',
         legend.title = element_text(size = rel(1.4)) ,
  legend.text = element_text(size = rel(1.4))  ) )  

# ggsave(p,file="/users/radivot/igv/trends.wmf")
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/trends.png")

#######################################################################
## CMML Incidence: Figure 3
#######################################################################
con=dbConnect(m,dbname=file.path(.seerHome,"00/all.db"))
d=dbGetQuery(con, 
  "SELECT * from lymyleuk where histo3==9945 and seqnum<2")
  # "SELECT * from lymyleuk where histo3==9945 and seqnum<2 and agerec>7 and agerec<19")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>7&d$agerec<19,]
d=d[!is.na(d$agerec),]

(d<-ddply(d, .(sex,agerec), summarise,cases=length(agerec))) 
pops=dbGetQuery(con, "SELECT * from pops where popyear>2000 and popage>7 and popage<19")
(pop<-ddply(pops, .(popage,popsex), summarise,py=sum(population)))
d=cbind(d,py=pop[order(pop$popsex),"py"])
d=transform(d,incid=1e6*cases/py) # make it cases per million person years
d$Sex=factor(d$sex,labels=c("Male","Female"))
d$age=age[d$agerec+1]
head(d,15)

summary(mm1<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                  parameters=list(c~Sex),
                  method="Nelder-Mead",
                  start=list(c=-12,k=0.04),data=d)) 
d$EI=1e6*predict(mm1)/d$py
head(d,15)
(CI2=round(cbind(coef(mm1),confint(mm1)),4)  )
exp(-CI2["c.SexFemale",c(1,3,2)])
mkCIk=function(v) sprintf("k = %4.3f (%4.3f, %4.3f)",v[1],v[2],v[3])
(lab=mkCIk(CI2["k",]))

library(ggplot2)
(p <- ggplot(d,aes(x=age,y=incid,shape=Sex))+geom_point(size=5)
 + labs(title="CMML Incidence: SEER 2000-2011",x="Age (years)",
        y=expression(paste("Cases per ",10^6," Person-Years")))  #  y=expression(frac(Cases,paste(10^6," Person-Years")))) 
 + scale_y_log10(limits=c(0.1,70)) )

(p=p+theme(plot.title = element_text(size = rel(2.4)),
           axis.title = element_text(size = rel(2.4)),
           axis.text = element_text(size = rel(2)))  )
(p=p+theme(legend.position = c(0.8, .2), 
           legend.title = element_text(size = rel(2)) ,
           legend.text = element_text(size = rel(2))  ) )  
(p=p+geom_line(aes(y=EI))) # update just y component of aes

(p=p+annotate("text",x=35,y=70, hjust=0, label = paste("y ~ exp(C+k*age)  C~sex"),size=8 )   )  
(p=p+annotate("text",x=35,y=40,hjust=0,label=lab, size=8))

# ggsave(p,file="/users/Tomas/downloads/CMML.wmf")
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/CMML.png")



###########################################################################################
# SEER CML versus Race Figure 4
########################################
pops=dbGetQuery(con, "SELECT * from pops where popyear>2000 and popage>5 and popage<19")
head(pops,20)
summary(as.factor(pops$poprace))
pops$poprace[pops$poprace>2]=3
(pop<-ddply(pops, .(popage,popsex,poprace), summarise,py=sum(population)))
head(pop)
d=dbGetQuery(con, 
# ICDO3 CML codes 9875 = bcr-abl+ and 9876 =bcr-abl neg CML are not in full use yet, i.e.
# many ICD-O3 CML codes are still 9863 carried over from ICD-O2
# "SELECT * from lymyleuk where histo3=9876 and seqnum<2 and race<98 and agerec>5 and agerec<19")
"SELECT * from lymyleuk where cancer='CML' and seqnum<2 and race<98")
# "SELECT * from lymyleuk where histo3=9863 and seqnum<2 and race<98 and agerec>5 and agerec<19")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>5&d$agerec<19,]
d=d[!is.na(d$agerec),]

d$race[d$race>2]=3
head(d)
(ddply(d, .(agerec,sex,race), summarise,cases=length(agerec))) #only 5 of the 16s 
count(d, vars=c("agerec","sex","race"))  # same here, and no options to fix it
(d=ddply(d, .(agerec,sex,race), summarise,cases=length(agerec),.drop=F))# this gets them all
head(cbind(d,pop)) # see alignment
d=cbind(d,py=pop[,"py"])
d=transform(d,incid=1e6*cases/py)
head(d,15)
d$Sex=factor(d$sex,labels=c("Male","Female"))
d$race=factor(d$race,labels=c("White","Black","Asian"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)
# ## playing around with stuff like this first
# (s1=summary(m1<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
#                      parameters=list(c~-1+sex:race,k~-1+race),   
#                      method="L-BFGS-B",
#                      lower=c(rep(-15,6),rep(0.01,3)),
#                      upper=c(rep(-10,6),rep(0.05,3)),
#                      start=list(c=-13,k=0.02),
#                      data=d,
#                      control=list(maxit=500,trace=1,parscale=c(rep(13,6),rep(0.03,3)) )
# ) )  )
# I finally found good initial parameter estimates
(s1=summary(m1<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                     parameters=list(c~-1+Sex:race,k~-1+race),   
                     start=list(c=-13,k=0.02),
                     data=d
) )  )


# This is fine if all we want is k estimates and absolute amplitude params
# But for M/F ratios we need deltaC difference parameters, obtainable as
(sA=summary(mA<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                     parameters=list(c~-1+race+Sex:race,k~-1+race), # = work to figure out  
                     start=list(c=-4,k=0.02),
                     data=d
) )  )

(cA=confint(mA)) #good  
d$EI=1e6*predict(mA)/d$py
head(d,20) #incidence predictions look reasonable
# d$EI=1e6*as.numeric(predict(mA))/d$py 
# here as.numeric clears .Dimnames in the matrix output of predict()  
# If ggplot gives warnings about rownames being the same, this may help


# And for Tf shifts we need 
(sT=summary(mT<-mle2(cases~dpois(lambda=exp(c+k*(age-Tf))*py),
                     parameters=list(c~-1+race,k~-1+race,Tf~-1+Sex:race),   
                    fixed=list("Tf.SexMale:raceWhite"=0,
                               "Tf.SexMale:raceBlack"=0,
                               "Tf.SexMale:raceAsian"=0),
                     start=list(c=-12,k=0.02,Tf=10),
               control=list(maxit=100,trace=0,parscale=c(rep(-12,3),rep(0.03,3),rep(10,3))),
                     data=d
) )  )
(cT=confint(mT))#but unfortunately, we cannot get Tf CI for Asians or Blacks
d$EIT=1e6*as.numeric(predict(mT))/d$py
head(d,20) #predictions of T model are close to the A model 

# # the next block attempts to reparameterize the T model to get Tf CI 
# # This can be collapsed for now, since I'm not sure if anything will come of it
# #######################
# # try moving it all into Tf
# (sT=summary(mT<-mle2(cases~dpois(lambda=exp(k*(age-Tf))*py),
#                      parameters=list(k~-1+race,Tf~-1+race+sex:race),   
#                     start=list(k=0.02,Tf=100),
#              control=list(maxit=100,trace=0,parscale=c(rep(.03,3),rep(300,3),rep(10,3))),
#                      data=d
# ) )  ) # bad fit, want around 470
# 
# # try controlling IC
# (IC=c("k.raceWhite"=0.04,"k.raceBlack"=0.03,"k.raceAsian"=0.02,
# "Tf.raceWhite"=13/0.04,"Tf.raceBlack"=13/0.04,"Tf.raceAsian"=13/0.02,
# "Tf.raceWhite:sexF"=10,"Tf.raceBlack:sexF"=10,"Tf.raceAsian:sexF"=20) )
# 
# (sT=summary(mT<-mle2(cases~dpois(lambda=exp(k*(age-Tf))*py),
#                      parameters=list(k~-1+race,Tf~-1+race+sex:race),   
#                      start=as.list(IC),
# #                      control=list(maxit=100,trace=0,parscale=IC),
#                      data=d
# ) )  )   
# 
# # => An error messsage that I cannot understand. Time is up, I surrender
# ##################

# Back to the basics: reduce the dimensionality by doing the races separately.

dw=subset(d,race=="White")

# first show no significant difference in k values
summary(mle2(cases~dpois(lambda=exp(c+k*age)*py),method="Nelder-Mead",
                       parameters=list(c~Sex,k~sex),
                       start=list(c=-12,k=0.04),data=dw))

(sw2=summary(mw2<-mle2(cases~dpois(lambda=exp(c+k*(age-Tf))*py),
                       method="Nelder-Mead",
                       parameters=list(Tf~Sex),
                       fixed=list("Tf.(Intercept)"=0),
                       start=list(c=-12,k=0.04,Tf=10),data=dw))     )

(cwT=confint(mw2))
cT    # good to see same answer for white Tf CI



db=subset(d,race=="Black")
summary(mle2(cases~dpois(lambda=exp(c+k*age)*py),method="Nelder-Mead",
             parameters=list(c~Sex,k~sex),
             start=list(c=-12,k=0.04),data=db))


(sb2=summary(mb2<-mle2(cases~dpois(lambda=exp(c+k*(age-Tf))*py),
                       method="Nelder-Mead",
                       parameters=list(Tf~Sex),
                       fixed=list("Tf.(Intercept)"=0),
                       start=list(c=-12,k=0.04,Tf=10),data=db))     )
(cbT=confint(mb2))
cT    # adding all those whites didn't tighten black k and c estimates.
# This is because in Poisson regression, the mean = the variance, so no error 
# estimate improvements result from pooling across races or sexes. Joint fits 
# here thus do more harm than good, as they increase the dimensionality of the
# optimization. The only advantage is that it compacts your codes. 
da=subset(d,race=="Asian")
summary(mle2(cases~dpois(lambda=exp(c+k*age)*py),method="Nelder-Mead",
             parameters=list(c~Sex,k~sex),
             start=list(c=-12,k=0.04),data=da))
# => removal of k dependence on sex throughout was OK

(sa2=summary(ma2<-mle2(cases~dpois(lambda=exp(c+k*(age-Tf))*py),
                       method="Nelder-Mead",
                       parameters=list(Tf~Sex),
                       fixed=list("Tf.(Intercept)"=0),
                       start=list(c=-12,k=0.04,Tf=10),data=da))     )
(caT=confint(ma2))
cT # same finding
# Conclusion: try going to lower dimensions when confint() fails on you

mkCIA=function(v) {v=exp(-v); sprintf("M/F = %4.2f (%4.2f, %4.2f)",v[1],v[3],v[2])}
mkCIT=function(v) sprintf("Tf = %3.1f (%3.1f, %3.1f)",v[1],v[2],v[3])
mkCIk=function(v) sprintf("k = %4.3f (%4.3f, %4.3f)",v[1],v[2],v[3])
(CIA=round(cbind(coef(mA),cA),3))
(AS=apply(CIA[4:6,],1,mkCIA))
(kS=apply(CIA[7:9,],1,mkCIk))
(CIT=cbind(c(coef(mw2)[4],coef(mb2)[4],coef(ma2)[4]),rbind(cwT,cbT,caT)[c(3,6,9),]) )
(TS=apply(CIT,1,mkCIT))

(cnts=ddply(d, .(race), summarise,cases=sum(cases)))
mkCases=function(n) sprintf("%d cases",n)
(cS=sapply(cnts[,2],mkCases))
(labS=c(kS,AS,TS,cS))
# note that capital S at the end of variable name => strings in my codes
(tdf=data.frame(race=rep(c("White","Black","Asian"),4),  #panel
           Sex=rep("Male",12),  # dummy, it wants to know sex of the "points"
           age=c(rep(30,9),rep(40,3)),      #x-coordinate of label
           incid=rep(c(2.5,1.67,1.1,90),each=3),     #y-coordinate of label
           Text=labS,stringsAsFactors = F) )           #label text


graphics.off()
# windows(height=6,width=12,xpos=-100,ypos=-100) #need to control the device size to
if(length(grep("linux",R.Version()$os))) windows <- function( ... ) X11( ... )
if(length(grep("darwin",R.Version()$os))) windows <- function( ... ) quartz( ... )
windows(width=12,height=6)
library(ggplot2)
require(grid) # to avoid "cannot find unit()"
(p <- ggplot(d,aes(x=age,y=incid,col=Sex,shape=Sex)) + geom_point(size=7) 
 + labs(title="CML Incidence by Race, Sex and Age",x="Age",
        y=expression(paste("Cases per ",10^6," Person-Years")))    
 + scale_y_log10(limits=c(1,100)) +xlim(20,90) )
(p=p + facet_grid(. ~ race))
# (p=p+scale_shape_discrete(name = "") )

(p=p+theme(plot.title = element_text(size = rel(2)),
           strip.text = element_text(size = rel(2)),
           axis.title = element_text(size = rel(2)),
           axis.text = element_text(size = rel(2)))  )
(p=p+theme(legend.position = c(0.07, .75),    
            legend.direction = 'vertical',
#            legend.background = element_rect(size=4,color="red"),
#            legend.margin = unit(0, "cm"),
           legend.title = element_text(size = rel(1.5)) ,
           legend.text = element_text(size = rel(1.5))  ) )  
(p=p+geom_line(aes(y=EI))) 
(p=p + geom_text(aes(label=Text), size=6,data=tdf, hjust=0))

# ggsave(p,file="~/race.eps")
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/race.png") 
#####################   END Figure 4

######################   
# 9863 k trends sampled every 5 years: Figure 5
con=dbConnect(m,dbname=file.path(.seerHome,"73/all.db"))
#######################################################################
dbListTables(con)
dbListFields(con,"pops")
dbListFields(con,"lymyleuk")
pop=dbGetQuery(con,"SELECT * from pops where popage>5 and popage<19 and poprace=1")
head(pop)
(pop<-ddply(pop, .(popage,popsex,popyear), summarise,py=sum(population)))
yearS=c("73to77","78to82","83to87","88to92","93to99","00to04","05to09","10to13")
yrs=c(1972,1977,1982,1987,1992,1999,2004,2009,2014)
pop$Years=cut(pop$popyear,breaks=yrs,labels=yearS)
(pop<-ddply(pop,.(popage,popsex,Years),summarise,py=sum(py)))
head(pop,20)

d=dbGetQuery(con, 
"SELECT * from lymyleuk where race=1 and cancer='CML' and seqnum<2")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>5&d$agerec<19,]
d=d[!is.na(d$agerec),]

# "SELECT * from lymyleuk where race=1 and histo3=9863 and seqnum<2 and agerec>5 and agerec<19")
# "SELECT * from lymyleuk where race=1 and ICD9=2051 and seqnum<2 and agerec>5 and agerec<19")
head(d)
d<-ddply(d, .(agerec,sex,yrdx), summarise,cases=length(agerec))
d$Years=cut(d$yrdx,breaks=yrs,labels=yearS)
head(d)
d<-ddply(d, .(agerec,sex,Years), summarise,cases=sum(cases))
head(cbind(d,pop)) # just to see that they match up
d=cbind(d,py=pop[,"py"]) # only take the non-redundant py column
head(d)
d$sex=factor(d$sex,labels=c("Male","Female"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)

# summary(mk<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
#                   parameters=list(c~sex,k~-1+Years),
#                   method="Nelder-Mead",
#                   start=list(c=-12,k=rep(0.05,7)),data=d)) 

years=c(1975,1980,1985,1990,1996,2002,2007,2011)
kic=0.05
k=NULL
kL=NULL
kU=NULL

for (Y in yearS) {
  summary(mk<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                   parameters=list(c~sex),
                   method="Nelder-Mead",
                   start=list(c=-12,k=kic),data=subset(d,Years==Y))) 
  k=c(k,coef(mk)[3])
  (CIk=round(confint(mk)[3,],4))
  kL=c(kL,CIk[1])
  kU=c(kU,CIk[2])
}
dfk9=data.frame(years,k,kL,kU,Registries="SEER9")

con=dbConnect(m,dbname=file.path(.seerHome,"00/all.db"))
pop=dbGetQuery(con, 
"SELECT * from pops where poprace=1 and popyear>1999 and popage>5 and popage<19")
pop<-ddply(pop, .(popage,popsex,popyear), summarise,py=sum(population))
head(pop,20)
yearS=c("00to01","02to03","04to05","06to07","08to09","10to11","12to13")
yrs=c(1999,2001,2003,2005,2007,2009,2011,2014)
pop$Years=cut(pop$popyear,breaks=yrs,labels=yearS)
pop<-ddply(pop,.(popage,popsex,Years),summarise,py=sum(py))
head(pop,20)

d=dbGetQuery(con, 
"SELECT * from lymyleuk where race=1 and cancer='CML' and seqnum<2")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>5&d$agerec<19,]
d=d[!is.na(d$agerec),]

# "SELECT * from lymyleuk where race=1 and histo3=9863 and seqnum<2 and agerec>5 and agerec<19")
# "SELECT * from lymyleuk where race=1 and ICD9=2051 and seqnum<2 and agerec>5 and agerec<19")
d<-ddply(d, .(agerec,sex,yrdx), summarise,cases=length(agerec))
d$Years=cut(d$yrdx,breaks=yrs,labels=yearS)
d<-ddply(d, .(agerec,sex,Years), summarise,cases=sum(cases))
head(cbind(d,pop)) # just to see that they match up
d=cbind(d,py=pop[,"py"])
head(d,15)
d$Sex=factor(d$sex,labels=c("Male","Female"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)
yrs
years=c(2001,2003,2005,2007,2009,2011,2013)
kic=0.05
k=NULL
kL=NULL
kU=NULL

for (Y in yearS) {
  summary(mk<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                   parameters=list(c~sex),
                   method="Nelder-Mead",
                   start=list(c=-12,k=kic),data=subset(d,Years==Y))) 
  k=c(k,coef(mk)[3])
  (CIk=round(confint(mk)[3,],4))
  kL=c(kL,CIk[1])
  kU=c(kU,CIk[2])
}

dfk18=data.frame(years,k,kL,kU,Registries="SEER18")
dfk=rbind(dfk9,dfk18)

if(length(grep("linux",R.Version()$os))) windows <- function( ... ) X11( ... )
if(length(grep("darwin",R.Version()$os))) windows <- function( ... ) quartz( ... )
windows(width=9,height=7)
library(ggplot2)
pd <- position_dodge(1) 
(p=ggplot(dfk, aes(x=years, y=k,shape=Registries)) + 
   geom_point(size=6,position=pd))
(p=p+labs(title="SEER9 1973-2013 and SEER18 2000-2013",x="Year", y="k") )     
(p=p+geom_errorbar(aes(ymin=kL, ymax=kU),width=.01,position=pd))
(p=p+theme(plot.title = element_text(size = rel(2)),
           axis.title = element_text(size = rel(2)),
           axis.text = element_text(size = rel(2)))  )
(p=p+theme(legend.position = c(0.2, .2), 
           legend.title = element_text(size = rel(1.5)) ,
           legend.text = element_text(size = rel(1.5))  ) )  
# ggsave(p,file="/users/radivot/igv/ktrends.wmf")
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/ktrends.png")

###################### End Fig 5 on k trends



#######################################################################
## AML Incidence: Figure 11
#######################################################################
con=dbConnect(m,dbname=file.path(.seerHome,"00/all.db"))
d=dbGetQuery(con, 
       "SELECT * from lymyleuk where icd9==2050 and seqnum<2")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>3&d$agerec<19,]
d=d[!is.na(d$agerec),]

(d<-ddply(d, .(sex,agerec), summarise,cases=length(agerec))) 
pops=dbGetQuery(con, "SELECT * from pops where popyear>2000 and popage>3 and popage<19")
(pop<-ddply(pops, .(popage,popsex), summarise,py=sum(population)))
d=cbind(d,py=pop[order(pop$popsex),"py"])
d=transform(d,incid=1e5*cases/py) # make it cases per million person years
d$Sex=factor(d$sex,labels=c("Male","Female"))
d$age=age[d$agerec+1]
head(d,15)

library(ggplot2)
(p <- ggplot(d,aes(x=age,y=incid,shape=Sex))+geom_point(size=5)
 + labs(title="AML Incidence: SEER 2000-2013",x="Age (years)",
        y=expression(paste("Cases per ",10^5," Person-Years")))  #  y=expression(frac(Cases,paste(10^6," Person-Years")))) 
 + scale_y_log10(limits=c(0.7,30)) )

(p=p+theme(plot.title = element_text(size = rel(2.4)),
           axis.title = element_text(size = rel(2.4)),
           axis.text = element_text(size = rel(2)))  )
(p=p+theme(legend.position = c(0.8, .2), 
           legend.title = element_text(size = rel(2)) ,
           legend.text = element_text(size = rel(2))  ) )  

# ggsave(p,file="/users/radivot/igv/AML.wmf")
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/AML.png")



####################################### End paper
# The following figure was taken out because it was redundant with the race figure
#############################
# CML Incidence in SEER 2000-2009 (parallel lines) 
##############################
con=dbConnect(m,dbname=file.path(.seerHome,"00/all.db"))
d=dbGetQuery(con, 
"SELECT * from lymyleuk where cancer='CML' and seqnum<2")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>5&d$agerec<19,]
d=d[!is.na(d$agerec),]

# "SELECT * from lymyleuk where histo3=9863 and seqnum<2 and agerec>5 and agerec<19")
(d<-ddply(d, .(agerec,sex), summarise,cases=length(agerec))) 
pops=dbGetQuery(con, "SELECT * from pops where popyear>2000 and popage>5 and popage<19")
(pop<-ddply(pops, .(popage,popsex), summarise,py=sum(population)))
d=cbind(d,py=pop[,"py"])
d=transform(d,incid=1e6*cases/py)
head(d,15)
d$sex=factor(d$sex,labels=c("M","F"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)

summary(m0<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                 parameters=list(c~sex,k~sex),   # see that slopes do not depend on sex
                 method="Nelder-Mead",
                 start=list(c=-12,k=0.04),data=d))

summary(m1<-mle2(cases~dpois(lambda=exp(c+k*age)*py),
                 parameters=list(c~sex),
                 method="Nelder-Mead",
                 start=list(c=-12,k=0.04),data=d)) 
anova(m0,m1)  # => reject null that extra param carries more than noise
d$EI=1e6*predict(m1)/d$py
head(d,15)
(CIA=round(cbind(coef(m1),confint(m1)),4)  )

# reparameterizing the model to get the delay time Tf for females
(s2=summary(m2<-mle2(cases~dpois(lambda=exp(c+k*(age-Tf))*py),
                     method="Nelder-Mead",
                     parameters=list(Tf~sex),
                     fixed=list("Tf.(Intercept)"=0),
                     start=list(c=-12,k=0.04,Tf=10),data=d))     )
coef(s2)
coef(m2)


AICtab(m0,m1,m2)  # should be no difference with m1, it's just a reparameterization
(CIT=round(cbind(coef(s2)[,1],confint(m2)),4)  )

mkCIA=function(v) {v=exp(-v); sprintf("M/F = %4.2f (%4.2f, %4.2f)",v[1],v[3],v[2])}
mkCIT=function(v) sprintf("Tf = %3.1f (%3.1f, %3.1f)",v[1],v[2],v[3])

(p <- ggplot(d,aes(x=age,y=incid,shape=sex))+geom_point(size=5)
 + labs(title="CML Incidence: SEER 2000-2013",x="Age (years)",
        y=expression(frac(Cases,paste(10^6," Person-Years")))) 
 + scale_y_log10(limits=c(3,100)) )
(p=p+theme(title = element_text(size = rel(2)),axis.text = element_text(size = rel(2)))  )
(p=p+geom_line(aes(y=EI))) # update just y component of aes
(p=p+annotate("text",x=30,y=100, hjust=0, label = paste("y ~ exp(c+k*age)  c~sex"),size=7 )   )  
(p=p+annotate("text",x=30,y=80,hjust=0,label=mkCIk(CIA["k",]), size=6))
(p=p+annotate("text",x=30,y=65,hjust=0,label=mkCIA(CIA["c.sexF",]), size=6 )   )  
(p=p+annotate("text",x=30,y=47, hjust=0, label = paste("y ~ exp(c+k*(age-Tf))"),size=7)   )  
(p=p+annotate("text",x=30,y=37,hjust=0, label=mkCIT(CIT["Tf.sexF",]), size=6))
# ggsave(p,file="/users/radivot/case/grants/sachs/HSCepi/figs/CML.wmf")

###########################################################
# This section explores CML ICD-O3 codes in whites as 9863, 9875 (ba+) and 9876 (ba-)
con=dbConnect(m,dbname=file.path(.seerHome,"00/all.db"))
pop=dbGetQuery(con, 
     "SELECT * from pops where poprace=1 and popyear>1999 and popage>9 and popage<19")
pop<-ddply(pop, .(popage,popsex,popyear), summarise,py=sum(population),.drop=F)
head(pop,20)
yearS=c("01to03","04to05","06to07","08to13")
pop$Years=cut(pop$popyear,breaks=c(1999,2003,2005,2007,2014),labels=yearS)
pop<-ddply(pop,.(popage,popsex,Years),summarise,py=sum(py),.drop=F)
head(pop,20)
dim(pop)

d=dbGetQuery(con,"SELECT * from lymyleuk where race=1 and ICD9=2051 and seqnum<2")
d$agerec=as.numeric(cut(d$agedx,breaks=c(0,1,seq(5,85,5),130)))
d=d[d$agerec>9&d$agerec<19,]
d=d[!is.na(d$agerec),]

# "SELECT * from lymyleuk where race=1 and histo2=9863 and
#               seqnum<2 and agerec>9 and agerec<19")
# "SELECT * from lymyleuk where race=1 and 
#              ((histo3=9863) or (histo3=9876) or (histo3=9875)) and
#              seqnum<2 and agerec>9 and agerec<19")
# "SELECT * from lymyleuk where race=1 and 
#              (histo3=9863 or histo3=9875) and
#              seqnum<2 and agerec>9 and agerec<19")
# "SELECT * from lymyleuk where race=1 and histo3=9875 and
# seqnum<2 and agerec>9 and agerec<19")
# "SELECT * from lymyleuk where race=1 and histo3=9945 and
# seqnum<2 and agerec>9 and agerec<19")
head(d)
d<-ddply(d, .(agerec,sex,yrdx), summarise,cases=length(agerec),.drop=F)
sum(d$cases)  #2970 icdo2 9863 cases is same as icdO3 9863+9875+9876
# of the latter, 2425 are 9863, 507 are 9875 and only 38 are 9876  
d$Years=cut(d$yrdx,breaks=c(1999,2003,2005,2007,2014),labels=yearS)
d<-ddply(d, .(agerec,sex,Years), summarise,cases=sum(cases),.drop=F)
head(cbind(d,pop)) # just to see that they match up
d=cbind(d,py=pop[,"py"])
head(d,15)
d$Sex=factor(d$sex,labels=c("Male","Female"))
age=c(0.5,3,seq(7.5,87.5,5))
d$age=age[d$agerec+1]
head(d,15)
years=c(2002,2005,2007,2009)
kic=0.05
c=k=NULL
cL=kL=NULL
cU=kU=NULL
for (Y in yearS) {
  summary(mk<-mle2(cases~dpois(lambda=exp(c+k*(age-65))*py),
                   parameters=list(c~sex),
                   method="Nelder-Mead",
                   start=list(c=-12,k=kic),data=subset(d,Years==Y))) 
  k=c(k,coef(mk)[3])
  c=c(c,coef(mk)[1])
  CI=confint(mk)
  (CIk=round(CI[3,],4))
  (CIc=round(CI[1,],4))
  kL=c(kL,CIk[1])
  kU=c(kU,CIk[2])
  cL=c(cL,CIc[1])
  cU=c(cU,CIc[2])
}
data.frame(years,c,cL,cU)
data.frame(years,k,kL,kU)

# 9876 = ba neg
# > data.frame(years,k,kL,kU)
# years          k      kL     kU
# 1  2002 0.04739624  0.0042 0.0918
# 2  2005 0.11644800  0.0455 0.2098
# 3  2007 0.06525722  0.0131 0.1207
# 4  2009 0.04218057 -0.0077 0.0926

# 9875 = ba + 
# years           k      kL     kU
# 1  2002 0.012260690 -0.0035 0.0276
# 2  2005 0.009064363 -0.0086 0.0234
# 3  2007 0.007569936 -0.0078 0.0245
# 4  2009 0.004961402 -0.0077 0.0165

# so bcr-abl negative CML's do have a larger k/aging rate constant, 
# but since we used icdo2 9863, this cannot explain the ongoing drops in k
Any scripts or data that you put into this service are public.
SEERaBomb documentation built on Dec. 16, 2019, 1:21 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
SEERaBomb
SEER and Atomic Bomb Survivor Data Analysis Tools

inst/docs/papers/attic/REB2014/SupplementSEER.R
In SEERaBomb: SEER and Atomic Bomb Survivor Data Analysis Tools

Try the SEERaBomb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SEERaBomb SEER and Atomic Bomb Survivor Data Analysis Tools

inst/docs/papers/attic/REB2014/SupplementSEER.R In SEERaBomb: SEER and Atomic Bomb Survivor Data Analysis Tools

Try the SEERaBomb package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

SEERaBomb
SEER and Atomic Bomb Survivor Data Analysis Tools

inst/docs/papers/attic/REB2014/SupplementSEER.R
In SEERaBomb: SEER and Atomic Bomb Survivor Data Analysis Tools