Nothing
mkSEER<-function(df,seerHome="~/data/SEER",outDir="mrgd",outFile="cancDef",
indices = list(c("sex","race"), c("histo3","seqnum"), "ICD9"),
writePops=TRUE,writeRData=TRUE,writeDB=FALSE){
# require(dplyr); require(LaF); seerHome="~/data/SEER"
# outDir="mrgd";outFile="cancDef";writePops=T;writeRData=TRUE;writeDB=TRUE # for debugging
# indices = list(c("sex","race"), "histo2", "histo3", "ICD9")
# gimic to get rid of unwanted notes in R CMD check
db=reg=race=sex=age=agerec=year=py=agedx=age19=age86=histo3=yrdx=NULL
#
# mkPopsae=function(popsa) {
# # data(stdUS) #load us 2000 standard population up to 99+ and use it for the extrapolation
# props=SEERaBomb::stdUS$prop[86:100] #get rid of check Note by directly getting stdUS
# props=props/sum(props)
# elders=popsa%>%filter(age86==90)%>%group_by(db,reg,race,sex,year)
# # head(elders,2)
# # head(popsa,2)
# grow=function(x) data.frame(x,age=85.5:99.5,PY=x$py*props)
# elders=elders%>%do_(~grow(.))%>%select(-age86,-py) # no note on check about .
# # elders=elders%>%do(grow(.))%>%select(-age86,-py) # => Note on check about .
# nms=names(elders)
# names(elders)[which(nms=="PY")]<-c("py")
# nelders=popsa%>%filter(age86<90)
# nms=names(nelders)
# names(nelders)[which(nms=="age86")]<-c("age")
# elders=elders[,names(nelders)] # reorder columns to match
# rbind(nelders,elders) # this now becomes popsa extended (i.e. popsae)
# }
tMDSrepair=function(canc) {
yearEnd=range(canc$yrdx)[2]
sc=canc%>%filter(histo3==9920|histo3==9987) #tAML and tMDS subset of cancs (sc)
d=sc%>%filter(yrdx>2000)%>%group_by(sex,yrdx,histo3)%>%summarize(cnt=n()) #count rows in sex-year-ICDO3 groups
m=lm(cnt~yrdx,subset(d,histo3==9920&yrdx<2010&sex=="Male"))
f=lm(cnt~yrdx,subset(d,histo3==9920&yrdx<2010&sex=="Female"))
nd=list(yrdx=2010:yearEnd)
dm=d%>%filter(yrdx>=2010&histo3==9920,sex=="Male")
df=d%>%filter(yrdx>=2010&histo3==9920,sex=="Female")
(xm=dm$cnt-round(predict(m,nd))) # numbers of male tAMLs that need to become tMDSs in 2010 and higher
(xf=df$cnt-round(predict(f,nd))) # same for females
canc$id=1:dim(canc)[1] # set up an id column to use to pull the random sample
set.seed(8675309) # Call Jenny to make it reproducible.
rws=NULL
for (i in 1:length(xm)){
rws=c(rws,sample((canc%>%filter(histo3==9920&sex=="Male"&yrdx==(2009+i)))$id,xm[i]))
rws=c(rws,sample((canc%>%filter(histo3==9920&sex=="Female"&yrdx==(2009+i)))$id,xf[i]))
}
canc[rws,"cancer"]="MDS" # note that we don't need tMDS and tAML as separate cancer types
canc[rws,"histo3"]=9987
canc$histo3=as.integer(canc$histo3)
canc$id=NULL
canc
}
mkPopsae=function(popsa) { # this replaces the version above it
rates=SEERaBomb::nvsr01[86:100,] #get rid of check Note by directly getting nvsr
props=cumprod(1-rates[,c("pm","pf")])
tots=cumsum(props)["100",]
for (i in 1:dim(props)[2]) props[,i]=props[,i]/tots[,i]
elders=popsa%>%filter(age86==90)%>%group_by(db,reg,race,sex,year)
grow=function(x) {if (x$sex=="male") y=data.frame(x,age=85.5:99.5,PY=x$py*props$pm) else
y=data.frame(x,age=85.5:99.5,PY=x$py*props$pf)
y
}
elders=elders%>%do_(~grow(.))%>%select(-age86,-py) # no note on check about .
nms=names(elders)
names(elders)[which(nms=="PY")]<-c("py")
nelders=popsa%>%filter(age86<90)
nms=names(nelders)
names(nelders)[which(nms=="age86")]<-c("age")
elders=elders[,names(nelders)] # reorder columns to match
# rbind(nelders,elders) # this now becomes popsa extended (i.e. popsae)
bind_rows(nelders,elders) # this now becomes popsa extended (i.e. popsae)
}
# mkPopsae(popsa) # for testing
seerHome=path.expand(seerHome)
outD=file.path(seerHome,outDir)
outF=file.path(seerHome,outDir,paste0(outFile,".RData"))
outDB=file.path(seerHome,outDir,paste0(outFile,".db"))
dir.create(outD,showWarnings = FALSE) # OK if already exists, so suppress warning
if(writePops|writeDB) {
cat("Making population file data.tables\n")
dirs=list.dirs(file.path(seerHome,"populations"))
dirs=dirs[grep("yr",dirs)]
dirs=dirs[-grep("yr2005",dirs)]
colTypes=c("integer","string",rep("integer",5),"double") # double, integer, categorical and string
colWidths=c(4,7,2,1,1,1,2, 10)
# colNames1 = c('year','X0','reg','race','origin','sex', 'age19','py')
colNames2 = c('year','X0','reg','race','origin','sex', 'age86','py')
ptm <- proc.time()
# popgaL=vector(mode="list",3)
popsaL=vector(mode="list",3)
ii=1
for (i in dirs) {
# f=file.path(i,"19agegroups.txt")
# laf<-laf_open_fwf(f,column_types=colTypes,column_widths=colWidths,column_names = colNames1)
# popgaL[[ii]]=tbl_df(laf[,colNames1[-c(2,5)]] )
f=file.path(i,"singleages.txt")
laf<-laf_open_fwf(f,column_types=colTypes,column_widths=colWidths,column_names = colNames2)
popsaL[[ii]]=tbl_df(laf[,colNames2[-c(2,5)]] )
if (grepl("plus",i)) {#5/21/14 fixed systematic lower 73 incidence due to 73 PY being also in 92
# popgaL[[ii]]=popgaL[[ii]]%>%filter(reg%in%c(29,31,35,37))
popsaL[[ii]]=popsaL[[ii]]%>%filter(reg%in%c(29,31,35,37))
cat("Removing SEER 9 person years from:\n",i,"\nbefore pooling into one file.\n")
}
ii=ii+1
}
# popga=rbind_all(popgaL)
# popsa=rbind_all(popsaL)
popsa=bind_rows(popsaL)
# popga$reg=as.integer(popga$reg+1500)
popsa$reg=as.integer(popsa$reg+1500)
# popga=popga%>%
# mutate(race=cut(race,labels=c("white","black","other"),breaks=c(1,2,3,100), right=F)) %>%
# mutate(db=cut(reg,labels=c("73","92","00"),breaks=c(1500,1528,1540,1550), right=F)) %>%
# mutate(reg=mapRegs(reg)) %>%
# mutate(age19=c(0.5,3,seq(7.5,82.5,5),90)[age19+1]) %>%
# mutate(sex=factor(sex,labels=c("male","female"))) %>%
# group_by(db,reg,race,sex,age19,year) %>%
# # summarise(py=sum(py))%>% #summing here over counties and hispanic origin or not, which are in popga as rows but not as columns
# # group_by(add=F) # clear grouping
# summarise(py=sum(py))
# popga=as.data.frame(popga) # clears everything, down to the data.frame
# class(popga)
popsa=popsa%>% # note: here age groups indices 0-18 are replaced by actual ages 1-85, so no need to remap
mutate(race=cut(race,labels=c("White","Black","Other"),breaks=c(1,2,3,100), right=F)) %>%
mutate(db=cut(reg,labels=c("73","92","00"),breaks=c(1500,1528,1540,1550), right=F)) %>%
mutate(reg=mapRegs(reg)) %>%
mutate(age86=age86+0.5) %>%
mutate(sex=factor(sex,labels=c("Male","Female"))) %>%
group_by(db,reg,race,sex,age86,year) %>%
# summarise(py=sum(py))%>% #summing here over counties and hispanic origin or not, which are in popga as rows but not as columns
# group_by(add=F) # clear grouping
summarise(py=sum(py))
popsa=as.data.frame(popsa) # clears everything down to the data.frame
popsa[popsa$age86==85.5,"age86"]=90 # touch up to better guess of average age of >85 group
popsae=mkPopsae(popsa) # extend popsa to fill in ages 85-99 based on the US STD 2000 population
delT=proc.time() - ptm
cat("The population files of SEER were processed in ",delT[3]," seconds.\n",sep="")
} #if writePops or writeDB
if(writeRData|writeDB) {
dirs=list.dirs(file.path(seerHome,"incidence"))
dirs=dirs[grep("yr",dirs)]
dirs=dirs[-grep("yr2005",dirs)]
y=df[which(df$names!=" "),"names"]
cat("Cancer Data:\nThe following fields will be written:\n"); print(y)
cancers=c('breast','digothr','malegen','femgen','other','respir','colrect','lymyleuk','urinary')
files=paste0(cancers,".txt")
DFL=vector(mode="list",length(dirs)*length(files))
ptm <- proc.time()
ii=1
for (i in dirs)
for (j in files) {
f=file.path(i,toupper(j))
print(f)
laf<-laf_open_fwf(f,column_types=df$type, column_widths=df$width)
DFL[[ii]]=tbl_df(laf[,which(df$names!=" ")])
ii=ii+1
}
print("using bind_rows() to make DF canc")
# canc=rbind_all(DFL)
canc=bind_rows(DFL)
colnames(canc)<-y
canc=canc%>%
filter(agedx<200)%>%
mutate(race=cut(race,labels=c("White","Black","Other"),breaks=c(1,2,3,100), right=F)) %>%
mutate(db=cut(reg,labels=c("73","92","00"),breaks=c(1500,1528,1540,1550), right=F)) %>%
mutate(reg=mapRegs(reg)) %>%
# mutate(age19=c(0.5,3,seq(7.5,82.5,5),90)[agerec+1]) %>%
mutate(age86=as.numeric(as.character(cut(agedx,c(0:85,150),right=F,labels=c(0.5:85,90)))))%>%
# select(-agerec)%>%
mutate(sex=factor(sex,labels=c("Male","Female")))
canc=mapCancs(canc)
canc=mapCODs(canc)
# if ("radiatn"%in%names(canc)) canc=mapTrts(canc)
canc=mapTrts(canc)
canc=tMDSrepair(canc)
canc[canc$surv==9999,"surv"]=NA #set missing to NA to simplify survival codes
delT=proc.time() - ptm
cat("Cancer files were processed in ",delT[3]," seconds.\n")
} #if writeRData or writeDB
if (writeDB) {
print("Deleting old version of this SQLite database file.")
unlink(outDB)
print("Creating new SQLite database file.")
ptm <- proc.time()
# m=dbDriver("SQLite")
# con <- dbConnect(m, dbname = outDB)
my_db <- src_sqlite(outDB, create = T)
# copy_to(my_db,DF, name="canc",temporary = FALSE, indexes = indices,overwrite=TRUE)
copy_to(my_db,canc, temporary = FALSE, indexes = indices,overwrite=TRUE)
# copy_to(my_db,popga, temporary = FALSE,overwrite=TRUE)
copy_to(my_db,popsa, temporary = FALSE,overwrite=TRUE)
copy_to(my_db,popsae, temporary = FALSE,overwrite=TRUE)
# dbWriteTable(con, "canc", DF,overwrite=TRUE)
# dbWriteTable(con, "popga", popga,overwrite=TRUE)
# dbWriteTable(con, "popsa", popsa,overwrite=TRUE)
delT=proc.time() - ptm
cat("\ndata.tables were written to ",outDB," in ",delT[3]," seconds.\n")
cat("tables in this SQLite file are:\n")
# print(dbListTables(con))
# dbDisconnect(con)
}
if (writeRData) {
print("save()-ing DF canc to disk")
canc=tbl_df(canc)
L=as.list(df$desc)
names(L)=df$names
L=L[df$names!=" "]
labelled::var_label(canc) <- L
save(canc,file=outF)
cat("Cancer data has been written to: ",outF,"\n")
}
if (writePops) {
# popga=tbl_df(popga)
popsa=tbl_df(popsa)
popsae=tbl_df(popsae)
labelled::var_label(popsa)[2:7] <- c("SEER Registry","Race/ethnicity","Sex","Age at diagnosis","Year of diagnosis","Person years")
labelled::var_label(popsae)[2:7] <- c("SEER Registry","Race/ethnicity","Sex","Age at diagnosis","Year of diagnosis","Person years")
# "db" "reg" "race" "sex" "age" "year" "py"
# save(popga,file=file.path(outD,"popga.RData"))
save(popsa,file=file.path(outD,"popsa.RData"))
save(popsae,file=file.path(outD,"popsae.RData"))
} #only if writePops
#if(.Platform$OS.type=="unix") {
s=dir(outD,full.names=T)
d=file.info(s)[,c("size","mtime")]
d$size=paste0(round(d$size/1e6)," M")
print(d)
#}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.