#' Read data from 'NHANES' database in local PC
#'
#' @param ... one or more data file path, or variable names
#' @param varLabel logical, whether to add varLabel for variable
#' @param codebook logical, whether to decode variable
#' @param nrows The maximum number of rows to read.
#' @param lower_cd logical. whether to ignore case in codebook
#' @param cat logical. whether to show progress information
#' @param Year logical. whether to keep Year column
#' @param join join method. One of full, inner, left, right, semi, anti, nest
#' @return a list contains dataframe or one dataframe
#' @export
#'
nhs_read <- function(...,
varLabel=FALSE,
codebook=TRUE,
lower_cd = FALSE,
Year=TRUE,
nrows=Inf,
cat=TRUE,
join=c('full','inner','left','right','semi','anti','nest')){
join <- match.arg(join)
t1 <- Sys.time()
hold <- list(...)
hold <<- lapply(hold, function(i) if(length(i)==0) character() else do::Trim(i))
holdname <- do::get_names(...)
holdname <<- lapply(holdname, function(i) if(length(i)==0) character() else do::Trim(i))
for (i in 1:length(hold)) {
if (i<length(hold) & length(hold[[i]])==0){
if (!any(grepl(get_config_path(),hold[i+1]))) hold[[i+1]] <- character()
}else if (i==length(hold) & length(hold[[i]])==0){
hold[[i]] <- character()
}
}
(ck <- sapply(hold, length)>0)
(hold <- hold[ck])
(holdname <- holdname[ck])
(ck <- sapply(hold, function(i) any(grepl(get_config_path(),i))))
# check duplicated
if (sum(ck)>1){
(hold_dup <- hold[ck])
hold_ckname <- holdname[ck]
for (i in 1:(length(hold_dup)-1)) {
for (j in (i+1):length(hold_dup)){
if (any(hold_dup[[i]] %in% hold_dup[[j]])){
duptsv <- set::and(hold_dup[[i]],hold_dup[[j]])
if (!do::cnOS()) msg <- paste0(hold_ckname[i],'and',hold_ckname[j],'have duplicated tsv file(',length(duptsv),')\n',
paste0(duptsv,collapse = '\n'))
if (do::cnOS()) msg <- paste0(hold_ckname[i],tmcn::toUTF8("\u548C"),hold_ckname[j],tmcn::toUTF8("\u6709\u91CD\u590D\u7684tsv\u6587\u4EF6("),length(duptsv),')\n',
paste0(set::and(hold_dup[[i]],hold_dup[[j]]),collapse = '\n'))
stop(msg)
}
}
}
}
if (sum(ck)>=1) (holdname <- holdname[ck])
for (i in 1:length(hold)){
if (any(grepl(get_config_path(),hold[i]))){
j <- i+1
p=0
}else{
p=1+p
if (p>1){
hold[[j]] <- c(hold[[j]],hold[[i]])
hold[[i]] <- NA
}
}
}
(hold <- hold[!sapply(hold,function(i) all(is.na(i)))])
names(hold)[sapply(hold, function(i) any(grepl(get_config_path(),i)))] <- holdname
for (i in 1:length(hold)){
if (i==1){
tsv <- list()
k <- 1
}
if (length(hold)==1){
for (j in hold[[i]]) {
tsv <- c(tsv,list(j))
attr(tsv[[k]],'variable') <- 'allvariableallvariable'
attr(tsv[[k]],'holdname') <- names(hold)[i]
k <- k+1
}
}else if (any(grepl(get_config_path(),hold[[i]]))){
if (i+1 <= length(hold)){
if (any(grepl(get_config_path(),hold[[i+1]]))){
for (j in hold[[i]]) {
tsv <- c(tsv,list(j))
attr(tsv[[k]],'variable') <- 'allvariableallvariable'
attr(tsv[[k]],'holdname') <- names(hold)[i]
k <- k+1
}
}else{
for (j in 1:length(hold[[i]])) {
(tsv <- c(tsv,list(hold[[i]][j])))
if (j==1){
holdi1 <- hold[[i+1]]
(ck <- do::right(holdi1,2)=='-u')
if (any(ck)) holdi1[ck] <- do::knife_right(holdi1[ck],2)
hold[[i+1]] <- holdi1
}
attr(tsv[[k]],'uncodebook') <- holdi1[ck]
attr(tsv[[k]],'variable') <- holdi1
attr(tsv[[k]],'holdname') <- names(hold)[i]
k <- k+1
}
}
}else if(i==length(hold)){
if (any(grepl(get_config_path(),hold[[i]]))){
for (j in hold[[i]]) {
tsv <- c(tsv,list(j))
attr(tsv[[k]],'variable') <- 'allvariableallvariable'
attr(tsv[[k]],'holdname') <- names(hold)[i]
k <- k+1
}
}
}
}
}
tsv
(varlist <- lapply(tsv, function(i) attr(i,'variable')))
(holdname <- sapply(tsv, function(i) attr(i,'holdname')))
holdmaxn <- max(nchar(holdname))+1
holdname <- sapply(holdname, function(i) paste0(i,do::rep_n(' ',holdmaxn-nchar(i))))
(holdnameu <- unique(holdname))
(uncodebook <- lapply(tsv, function(i) attr(i,'uncodebook')))
(files <- unlist(tsv))
filemaxn <- max(nchar(do::file.name(files)))-4
variablemaxn <- nchar(max(sapply(varlist, length))+1)
if (do::cnOS()){
tsv <- tmcn::toUTF8("\u5FC5\u987B\u662Ftsv\u6587\u4EF6\n ")
}else{
tsv <- 'must be tsv file\n '
}
if (any(tools::file_ext(files) != 'tsv')){
files <- paste0(files[tools::file_ext(files) != 'tsv'],collapse = '\n ')
tsv <- paste0(tsv,files)
stop(tsv)
}
for (i in 1:length(holdnameu)){
if (i==1){
years <- prepare_years(files)
(yearu <- years |> unique() |> do::increase())
data <- lapply(1:length(yearu),function(i) NULL)
names(data) <- yearu
# file
items <- prepare_items(files)
itemmaxn <- max(nchar(items))
varLabeldata <- NULL
variableorder <- c()
all_joint <- c()
}
(filesi <- files[holdname %in% holdnameu[i]])
(filesi <- filesi[order(paste0(prepare_items(filesi),prepare_years(filesi)))])
if (cat) cat(paste0(ifelse(i==1,'\n','\n\n'),crayon::red(do::Replace0(holdnameu[i],'.*/')),'(',length(filesi),ifelse(length(filesi)<10,') ',')')))
for (j in 1:length(filesi)) {
(filej <- do::file.name(filesi[j]))
(itemsj <- prepare_items(filesi[j]))
if (j==1){
catviriablen <- 1
if (cat) cat(paste0(itemsj,do::rep_n(' ',itemmaxn-nchar(itemsj))))
}else{
if (itemsj != prepare_items(filesi[j-1])) if (cat) cat('\n ', paste0(itemsj,do::rep_n(' ',itemmaxn-nchar(itemsj))))
}
(noext <- do::Replace0(filej,paste0('\\.',tools::file_ext(filej))))
dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=1)
(variable <-varlist[[which(files==filesi[j])]])
excludename <- c('seqn',
'drxiline','dr1iline','dr2iline',
'drdifdcd','dr1ifdcd','dr2ifdcd','drxfdcd',
'dr1mc','dr2mc','drxmc',
'rxddrgid')
if (all(variable=='allvariableallvariable')){
variable <- colnames(dfj)
dfj <- tryCatch(data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows),warning=function(w) 'w')
if (is.character(dfj)) dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows,fill = TRUE)
if (cat){
filemsg <- paste0(' ',crayon::blue(paste0(do::equal_length(noext,nchar = filemaxn),
'(',do::equal_length(ncol(dfj),nchar = max(nchar(ncol(dfj)),variablemaxn)),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
cat(filemsg)
}
if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+4))
catviriablen <- catviriablen+1
variableorder <- unique(c('Year','seqn',variableorder,colnames(dfj)))
variableorder[variableorder %in% c('drxiline','dr1iline','dr2iline')] <- 'line'
all_joint <- unique(c('seqn',all_joint))
}else{
(variable <- do::Replace0(variable,' '))
(variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)]))
variable <- paste0(tolower(do::Replace0(variable,':.*')),':',do::Replace0(variable,'.*:'))
variableorder <- unique(c('Year','seqn',variableorder,do::Replace0(variable,'.*:')))
(ck <- sapply(tolower(variable), function(ii) any(unique(unlist(strsplit(do::Replace0(ii,':.*'),','))) %in% colnames(dfj))))
if (!any(ck)){
(holdnamej <- holdname[files==filesi[j]])
files[files==filesi[j]] <- 'novariable'
# if (cat) cat(paste0(' ',crayon::red(paste0(noext,'(0)'))))
if (cat){
filemsg <- paste0(' ',crayon::red(paste0(do::equal_length(noext,nchar = filemaxn),'(',do::equal_length('0',nchar = max(nchar(ncol(dfj)),variablemaxn) ),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
cat(filemsg)
}
if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+4))
catviriablen <- catviriablen+1
next(j)
}
(names <- unique(variable[ck]))
(allfrom <- tolower(do::Replace0(names,':.*')))
(keyvar <- paste0(excludename,':',excludename))
(keyvar <- keyvar[sapply(keyvar, function(keyi) do::Replace0(keyi,':.*') %in% colnames(dfj))])
(names <- names[!sapply(allfrom, function(l) any(excludename %in% unique(strsplit(l,',')[[1]])))])
(names <- c(keyvar,names))
dfj <- tryCatch(data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows),warning=function(w) 'w')
if (is.character(dfj)) dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows,fill = TRUE)
for (k in 1:length(names)){
if (k==1) keepnames <- rep(TRUE,length(names))
(fromk <- do::Replace0(tolower(names[k]),':.*') |> strsplit(',') |> unlist() |> unique())
(tok <- do::Replace0(names[k],'.*:'))
if (any(fromk %in% colnames(dfj))){
(fromkj <- fromk[fromk %in% colnames(dfj)])
colnames(dfj)[colnames(dfj) %in% fromk] <- tok
# build variable for new and old
varlabelk <- nhs_varLabel(filesi[j])
varlabelk=varlabelk[varlabelk$variable==fromkj,c('file','label')]
labelkj <- varlabelk[1,2]
names(labelkj) <- varlabelk[1,1]
if (is.null(varLabeldata)){
varLabeldata <- data.frame(rename=tok,
'NHANES name'=I(list(fromkj)),
label=I(list(labelkj)),check.names = FALSE)
}else{
if (tok %in% varLabeldata[,1]){
varLabeldata[varLabeldata[,1]==tok,'NHANES name'][[1]] <- list(unique(c(varLabeldata[varLabeldata[,1]==tok,'NHANES name'][[1]],fromkj)))
varLabeldata[varLabeldata[,1]==tok,'label'][[1]] <- list(c(varLabeldata[varLabeldata[,1]==tok,'label'][[1]],labelkj))
}else{
varlabelj <- data.frame(rename=tok,
'NHANES name'=I(list(fromkj)),
label=I(list(labelkj)),check.names = FALSE)
varLabeldata <- rbind(varLabeldata,varlabelj)
}
}
}else{
keepnames[k] <- FALSE
}
}
(nm <- do::Replace0(names,'.*:')[keepnames])
dfj <- dfj[,nm,with=FALSE] |> unique()
if (!data.table::is.data.table(dfj)){
dfj <- data.table::as.data.table(dfj)
colnames(dfj) <- nm
}
if (cat){
filemsg <- paste0(' ',crayon::blue(paste0(do::equal_length(noext,nchar = filemaxn),'(',do::equal_length(ncol(dfj),nchar = max(nchar(ncol(dfj)),variablemaxn)),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
cat(filemsg)
}
if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+3))
catviriablen <- catviriablen+1
}
colnames(dfj)[colnames(dfj) %in% c('drxiline','dr1iline','dr2iline')] <- 'line'
head(dfj)
# codebook
if (codebook){
(ckbkf <- do::Replace(filesi[j],'\\.tsv','.codebook'))
if (file.exists(ckbkf)){
ckbk <- read.delim(ckbkf,comment.char = '#')
if (nrow(ckbk)>1){
if (lower_cd) ckbk$label <- tolower(ckbk$label)
ckbk$variable <- do::Trim(ckbk$variable)
ckbk$label <- do::Trim(ckbk$label)
ckbk$label <- do::Trim(ckbk$label,'`')
ckbk$label <- do::Trim(ckbk$label,',')
ckbk$label[ckbk$label=="very much, or"] <- "very much"
ckbk$code <- do::Trim(ckbk$code)
head(ckbk)
if (length(variable)>0){
variable <- do::Replace0(variable,' ')
variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)])
(select <- do::Replace0(variable,':.*') |>
strsplit(',') |> unlist() |> unique())
exselect <- uncodebook[files==filesi[j]] |> unlist() |> do::Replace0(':.*') |> lapply(function(ui) strsplit(ui,',|:')) |> unlist() |> unique()
select <- select[!select %in% exselect]
ckbk <- ckbk[ckbk$variable %in% select,]
for (k in 1:length(variable)) {
# replace old variable to new variable
fromk <- do::Replace0(variable[k],':.*') |> strsplit(',') |> unlist() |> unique()
tok <- do::Replace0(variable[k],'.*:')
ckbk[ckbk$variable %in% fromk,'variable'] <- tok
}
}
ckbk <- ckbk[ckbk$variable %in% colnames(dfj),]
(ck <- nrow(ckbk)>1)
if (ck){
for (k in unique(ckbk$variable)) {
k <- do::Replace0(k,' ')
code <- ckbk[ckbk$variable == k,]
code[,'label'] <- do::Replace(code[,'label'],' {2,}',' ')
code[,'label'] <- do::Replace(code[,'label'],' {0,}\n {0,}',' ')
dfjk <- dfj[[k]]
for (cd in 1:nrow(code)) {
cdjd <- dfjk == code[cd,'code']
cdjd[is.na(cdjd)] <- FALSE
dfjk[cdjd] <- code[cd,'label']
}
dfj[[k]] <- dfjk
}
}
}
}
}
head(dfj)
# add varLabel
if (varLabel){
(labefile <- do::Replace(filesi[j],'\\.tsv','.varLabel'))
if (file.exists(labefile)){
labelj <- read.delim(labefile,comment.char = '#')
if (length(variable)>0){
variable <- do::Replace0(variable,' ')
variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)])
(select <- do::Replace0(variable,':.*') |>
strsplit(',') |> unlist() |> unique())
for (k in 1:length(variable)) {
# replace old variable to new variable
fromk <- do::Replace0(variable[k],':.*') |> strsplit(',') |> unlist() |> unique()
tok <- do::Replace0(variable[k],'.*:')
labelj[labelj$variable %in% fromk,'variable'] <- tok
}
}
ck <- labelj$variable %in% colnames(dfj)
(labelj <- labelj[ck,c('variable','label')])
if (nrow(labelj)>0){
dfj <- sprintf('"%s" = "%s"',labelj$variable,labelj$label) |>
paste0(collapse = ', ') |>
sprintf(fmt = 'expss::apply_labels(dfj,%s)') |>
parse(file='',n=NULL) |>
eval()
}
}
}
if (lower_cd) for (il in 1:ncol(dfj)) if (is.character(dfj[,il])) dfj[,il] <- tolower(dfj[,il])
key <- excludename
key[grepl('line',key)] <- 'line'
key <- unique(key)
if (is.null(data[[prepare_years(filesi[j])]])){
data[[prepare_years(filesi[j])]] <- dfj
}else{
left_joint <- key[key %in% colnames(data[[prepare_years(filesi[j])]])]
right_joint <- key[key %in% colnames(dfj)]
(left2 <- do::right(left_joint,2))
(right2 <- do::right(right_joint,2))
(left_joint <- left_joint[left2 %in% right2])
(right_joint <- right_joint[right2 %in% left2])
iff_joints <- c('seqn',
'line',
'dr1ifdcd','dr2ifdcd','drdifdcd',
'dr1mc','dr2mc')
if ((length(left_joint) %in% c(3,4)) &
(length(right_joint) %in% c(3,4)) &
all(left_joint %in% iff_joints) &
all(right_joint %in% iff_joints)){
# seqn, iff, fdcd cannot joint two iff files
left_joint = right_joint <- c('seqn','line')
}else if(length(left_joint) != length(right_joint)){
stop('connect me')
}
all_joint <- unique(c(all_joint,left_joint,right_joint))
joint <- paste0(sprintf("'%s'",left_joint),'=',sprintf("'%s'",right_joint)) |>
paste0(collapse = ',') |> sprintf(fmt = 'c(%s)')
ps <- parse(text=sprintf("data[[prepare_years(filesi[j])]] <- suppressMessages(dplyr::%s_join(data[[prepare_years(filesi[j])]],dfj,by=%s))",join,joint))
eval(ps)
}
}
}
varLabeldata[,2] <- sapply(varLabeldata[,2], function(i){
(namei <- do::unique_no.NA(i))
if (length(namei)==0){
''
}else{
paste0(namei,collapse = ', ')
}
})
varLabeldata[,3] <- sapply(varLabeldata[,3], function(i){
(varlabeli <- do::unique_no.NA(i))
if (length(varlabeli)==0){
''
}else if (length(varlabeli)==1){
varlabeli
}else{
sapply(varlabeli, function(j) sprintf('[%s] %s',paste0(names(i)[i==j],collapse = ','),j)) |>
paste0(collapse = '<br>')
}
})
files <- files[files != 'novariable']
names(data) <- yearu
data <- data[!sapply(data, is.null)]
# add target
target <- data.table::fread(paste0(get_config_path(TRUE),'varLabel.txt'),showProgress = FALSE,data.table = FALSE)[,c("year", "item", "file", "variable", "target","url")]
tsv <- do::Replace0(files,get_config_path(T),'\\.tsv')
ck <- paste0_columns(target[,c("year","item","file")],'/') %in% tsv
target <- target[ck,]
row.names(target) <- NULL
# cat output
if (cat) cat(crayon::red('\n\nOutput\n'))
if (length(data)==0){
if (cat) cat('\nTime: ',time_diff(Sys.time(),t1),'\n')
return('no data selected')
}else{
for (i in 1:length(data)) {
data[[i]] <- cbind(Year=names(data)[i],data[[i]])
}
df <- as.data.frame(do.call(plyr::rbind.fill,data),check.names=FALSE)
vo <- seqn_by(x = colnames(df),unique(c('Year',unique(key),variableorder)))
df <- df[,vo]
eval(parse(text=sprintf('df <- df[order(%s),]',paste0(paste0('df$',set::and(c('Year',excludename),colnames(df))),collapse = ','))))
rownames(df) <- NULL
if (!Year) df <- drop_col(df,'Year')
if (cat) cat('Data Type: data.frame',paste0('(',paste0(dim(df),collapse = ','),')\n'))
if (cat) cat('Final Years Cycle:',length(data))
if (cat) cat('\nTime: ',time_diff(Sys.time(),t1),'\n')
attr(df,"target") <- target
attr(df,'varnameLabel') <- varLabeldata
attr(df,'files') <- files
return(df)
}
}
time_diff <- function(t1,t2){
diff <- as.numeric(t1)-as.numeric(t2)
if (diff<60){
dif <- round(diff/1,2)
p <- paste(dif,ifelse(dif==1,'second','seconds'))
}else if (diff>=60 & diff < 60*60){
dif <- round(diff/60,2)
p <- paste(dif,ifelse(diff==1,'minute','minutes'))
}else if (diff>=60*60 & diff < 60*60*24){
dif <- round(diff/60/60,2)
p <- paste(dif,ifelse(diff==1,'hour','hours'))
}else if (diff>=60*60*24 & diff < 60*60*24*365){
dif <- round(diff/60/60/24,2)
p <- paste(dif,ifelse(diff==1,'day','days'))
}else if (diff>=60*60*24*365 & diff < 60*60*24*365*100){
dif <- round(diff/60/60/24/365,2)
p <- paste(dif,ifelse(diff==1,'year','years'))
}else{
dif <- round(diff/60/60/24/365/100,2)
p <- paste(dif,ifelse(diff==1,'century','centurys'))
}
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.