#'Make NCDB R Binary
#'
#'Uses output of \code{pickFields()} to produce an R binary of the NCDB data.
#'@param df Tibble returned by pickFields
#'@param ncdbHome Location of NCDB fixed width format ASCII files.
#'@param outFile Name of output file, which will placed in \code{ncdbHome}.
#'@return Tibble (big data file) saved as canc in \code{outFile}.
#'@note Inspired by \code{mkSEER()} in \pkg{SEERaBomb}.
#'@examples
#'\dontrun{
#' library(NCDBR)
#' d=getFields()
#' d=pickFields(d)
#' canc=mkNCDB(d) }
#'@name mkNCDB
#'@export
#'@import tibble
#'@import stringi
#'@import LaF
#'@import dplyr
#'@import labelled
mkNCDB<-function(df,ncdbHome="~/data/NCDB",outFile="cancDef") {
# library(dplyr)
# library(NCDBR)
# d=getFields()
# df=pickFields(d)
# ncdbHome="~/data/NCDB"
# outFile="cancDef"
race=sex=age=casenum=histo3=nms=type=Levs=NULL
outF=file.path(ncdbHome,paste0(outFile,".RData"))
files=dir(pattern="*.txt",path=ncdbHome,full.names=TRUE)
DFL=vector(mode="list",length(files))
cancer=sapply(strsplit(sapply(strsplit(files,split=".",fixed=T),function(x) x[1]),split="_"),function(x) x[2])
names(DFL)=cancer
# setwd(ncdbHome)
ii=1
# df%>%print(n=100)
df=df%>%mutate(type=ifelse(nms%in%c("crow","surv"),"double",type))
# require(LaF)
# require(dplyr)
for (f in files) {
print(f)
laf=LaF::laf_open_fwf(f,column_types=df$type, column_widths=df$width)
DFL[[ii]]=tbl_df(laf[,which(df$names!=" ")])
DFL[[ii]]$cancer=cancer[[ii]]
ii=ii+1
}
DFL
canc=bind_rows(DFL)
y=df$nms[which(df$nms!=" ")]
colnames(canc)[1:length(y)]<-y[1:length(y)]
canc
# canc$surv=as.numeric(canc$surv)
# canc$seqnum=as.numeric(canc$seqnum)
table(canc$race)
canc
# snms=names(canc)
# x=df%>%filter(nms%in%snms)
# L=as.list(x$labs)
# names(L)=x$nms
# labelled::var_label(canc) <- L
# # canc
# #
# (facts=which(sapply(df$Levs,function(x) class(x)[1])=="tbl_df"))
# fnms=df$nms[facts]
# (fnms=setdiff(fnms,c("alive")))
# (fnmsL=paste0(fnms,"L"))
# (x=df%>%filter(nms%in%fnms)%>%select(nms,Levs))
# L=x$Levs
# names(L)=x$nms
# dcanc=as.data.frame(canc)
# factor(dcanc[,fnms[1]],levels=L[[1]]$n,labels=L[[1]]$val)
# for (i in 1:length(fnms)) canc[,fnmsL[i]]=factor(dcanc[,fnms[i]],levels=L[[i]]$n,labels=L[[i]]$val)
#
sum(duplicated(canc$casenum)) # all unique
canc=canc%>%select(-casenum) # so remove casenum to save space
canc=canc%>%
# mutate(race=cut(race,labels=c("White","Black","Other"),breaks=c(1,2,3,100), right=F)) %>%
# mutate(Race=cut(race,labels=c("White","African American","Other","Unknown"),breaks=c(1,2,3,99,100), right=F)) %>%
mutate(race=cut(race,labels=c("White","Black","Other"),breaks=c(1,2,3,100), right=F)) %>%
mutate(sex=factor(sex,labels=c("Male","Female")))
# mutate(Sex=factor(sex,labels=c("Male","Female")))
# sex and race assumed always in, so take out sexL and raceL if they are in
# canc=canc%>%select(-sexL,-raceL)
if(length(unique(canc$cancer))==1) canc=canc%>%select(-cancer)
save(canc,file=outF)
cat("NCDB data has been written to: ",outF,"\n")
# canc%>%filter(histo3==9673) #check 18,252 MCL
# d=canc%>%filter(!is.na(surv))%>%select(-casenum) # rid of ~10% of cases due to missing survival time
# head(d) #drop missing survivals later to see if they correlate with e.g. crow
# d=d%>%mutate(facLoc=ifelse(facLoc==1,"New England",
# ifelse(facLoc==2,"Mid Altantic",
# ifelse(facLoc==3,"South Altantic",
# ifelse(facLoc==4,"East North Central",
# ifelse(facLoc==5,"East South Central",
# ifelse(facLoc==6,"West North Central",
# ifelse(facLoc==7,"West South Central",
# ifelse(facLoc==8,"Mountain",
# ifelse(facLoc==9,"Pacific",NA))))))))))
#
# d$facLoc=factor(d$facLoc,levels=c("New England","Mid Altantic","South Altantic","East North Central","East South Central",
# "West North Central","West South Central","Mountain","Pacific"))
invisible(canc)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.