R/nhs_read.R

Defines functions time_diff nhs_read

Documented in nhs_read

#' Read data from 'NHANES' database in local PC
#'
#' @param ... one or more data file path, or variable names
#' @param varLabel logical, whether to add varLabel for variable
#' @param codebook logical, whether to decode variable
#' @param nrows The maximum number of rows to read.
#' @param lower_cd logical. whether to ignore case in codebook
#' @param cat logical. whether to show progress information
#' @param Year logical. whether to keep Year column
#' @param join join method. One of full, inner, left, right, semi, anti, nest
#' @return a list contains dataframe or one dataframe
#' @export
#'
nhs_read <- function(...,
                     varLabel=FALSE,
                     codebook=TRUE,
                     lower_cd = FALSE,
                     Year=TRUE,
                     nrows=Inf,
                     cat=TRUE,
                     join=c('full','inner','left','right','semi','anti','nest')){
    join <- match.arg(join)
    t1 <- Sys.time()
    hold <- list(...)
    hold <<- lapply(hold, function(i) if(length(i)==0) character() else do::Trim(i))
    holdname <- do::get_names(...)
    holdname <<- lapply(holdname, function(i) if(length(i)==0) character() else do::Trim(i))
    for (i in 1:length(hold)) {
        if (i<length(hold) & length(hold[[i]])==0){
            if (!any(grepl(get_config_path(),hold[i+1]))) hold[[i+1]] <- character()
        }else if (i==length(hold) & length(hold[[i]])==0){
            hold[[i]] <- character()
        }
    }
    (ck <- sapply(hold, length)>0)
    (hold <- hold[ck])
    (holdname <- holdname[ck])
    (ck <- sapply(hold, function(i) any(grepl(get_config_path(),i))))
    # check duplicated
    if (sum(ck)>1){
        (hold_dup <- hold[ck])
        hold_ckname <- holdname[ck]
        for (i in 1:(length(hold_dup)-1)) {
            for (j in (i+1):length(hold_dup)){
              if (any(hold_dup[[i]] %in% hold_dup[[j]])){
                  duptsv <- set::and(hold_dup[[i]],hold_dup[[j]])
                  if (!do::cnOS()) msg <- paste0(hold_ckname[i],'and',hold_ckname[j],'have duplicated tsv file(',length(duptsv),')\n',
                         paste0(duptsv,collapse = '\n'))
                  if (do::cnOS()) msg <- paste0(hold_ckname[i],tmcn::toUTF8("\u548C"),hold_ckname[j],tmcn::toUTF8("\u6709\u91CD\u590D\u7684tsv\u6587\u4EF6("),length(duptsv),')\n',
                         paste0(set::and(hold_dup[[i]],hold_dup[[j]]),collapse = '\n'))
                  stop(msg)
              }
            }
        }
    }
    if (sum(ck)>=1) (holdname <- holdname[ck])
    for (i in 1:length(hold)){
        if (any(grepl(get_config_path(),hold[i]))){
            j <- i+1
            p=0
        }else{
            p=1+p

            if (p>1){
                hold[[j]] <- c(hold[[j]],hold[[i]])
                hold[[i]] <- NA
            }
        }
    }
    (hold <- hold[!sapply(hold,function(i) all(is.na(i)))])
    names(hold)[sapply(hold, function(i) any(grepl(get_config_path(),i)))] <- holdname

    for (i in 1:length(hold)){
        if (i==1){
            tsv <- list()
            k <- 1
        }
        if (length(hold)==1){
            for (j in hold[[i]]) {
                tsv <- c(tsv,list(j))
                attr(tsv[[k]],'variable') <- 'allvariableallvariable'
                attr(tsv[[k]],'holdname') <- names(hold)[i]
                k <- k+1
            }
        }else if (any(grepl(get_config_path(),hold[[i]]))){
            if (i+1 <= length(hold)){
                if (any(grepl(get_config_path(),hold[[i+1]]))){
                    for (j in hold[[i]]) {
                        tsv <- c(tsv,list(j))
                        attr(tsv[[k]],'variable') <- 'allvariableallvariable'
                        attr(tsv[[k]],'holdname') <- names(hold)[i]
                        k <- k+1
                    }
                }else{
                    for (j in 1:length(hold[[i]])) {
                        (tsv <- c(tsv,list(hold[[i]][j])))
                        if (j==1){
                            holdi1 <- hold[[i+1]]
                            (ck <- do::right(holdi1,2)=='-u')
                            if (any(ck)) holdi1[ck] <- do::knife_right(holdi1[ck],2)
                            hold[[i+1]] <- holdi1
                        }
                        attr(tsv[[k]],'uncodebook') <- holdi1[ck]
                        attr(tsv[[k]],'variable') <- holdi1
                        attr(tsv[[k]],'holdname') <- names(hold)[i]
                        k <- k+1
                    }
                }
            }else if(i==length(hold)){
                if (any(grepl(get_config_path(),hold[[i]]))){
                    for (j in hold[[i]]) {
                        tsv <- c(tsv,list(j))
                        attr(tsv[[k]],'variable') <- 'allvariableallvariable'
                        attr(tsv[[k]],'holdname') <- names(hold)[i]
                        k <- k+1
                    }
                }
            }
        }
    }
    tsv
    (varlist <- lapply(tsv, function(i) attr(i,'variable')))
    (holdname <- sapply(tsv, function(i) attr(i,'holdname')))
    holdmaxn <- max(nchar(holdname))+1
    holdname <- sapply(holdname, function(i) paste0(i,do::rep_n(' ',holdmaxn-nchar(i))))
    (holdnameu <- unique(holdname))
    (uncodebook <- lapply(tsv, function(i) attr(i,'uncodebook')))
    (files <- unlist(tsv))
    filemaxn <- max(nchar(do::file.name(files)))-4
    variablemaxn <- nchar(max(sapply(varlist, length))+1)

    if (do::cnOS()){
        tsv <- tmcn::toUTF8("\u5FC5\u987B\u662Ftsv\u6587\u4EF6\n     ")
    }else{
        tsv <- 'must be tsv file\n     '
    }
    if (any(tools::file_ext(files) != 'tsv')){
        files <- paste0(files[tools::file_ext(files) != 'tsv'],collapse = '\n     ')
        tsv <- paste0(tsv,files)
        stop(tsv)
    }
    for (i in 1:length(holdnameu)){
        if (i==1){
            years <- prepare_years(files)
            (yearu <- years |> unique() |> do::increase())
            data <- lapply(1:length(yearu),function(i) NULL)
            names(data) <- yearu
            # file
            items <- prepare_items(files)
            itemmaxn <- max(nchar(items))
            varLabeldata <- NULL
            variableorder <- c()
            all_joint <- c()
        }
        (filesi <- files[holdname %in% holdnameu[i]])
        (filesi <- filesi[order(paste0(prepare_items(filesi),prepare_years(filesi)))])
        if (cat) cat(paste0(ifelse(i==1,'\n','\n\n'),crayon::red(do::Replace0(holdnameu[i],'.*/')),'(',length(filesi),ifelse(length(filesi)<10,') ',')')))
        for (j in 1:length(filesi)) {
            (filej <- do::file.name(filesi[j]))
            (itemsj <- prepare_items(filesi[j]))
            if (j==1){
                catviriablen <- 1
                if (cat) cat(paste0(itemsj,do::rep_n(' ',itemmaxn-nchar(itemsj))))
            }else{
                if (itemsj != prepare_items(filesi[j-1])) if (cat) cat('\n           ', paste0(itemsj,do::rep_n(' ',itemmaxn-nchar(itemsj))))
            }
            (noext <- do::Replace0(filej,paste0('\\.',tools::file_ext(filej))))
            dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=1)
            (variable <-varlist[[which(files==filesi[j])]])
            excludename <- c('seqn',
                             'drxiline','dr1iline','dr2iline',
                             'drdifdcd','dr1ifdcd','dr2ifdcd','drxfdcd',
                             'dr1mc','dr2mc','drxmc',
                             'rxddrgid')
            if (all(variable=='allvariableallvariable')){
                variable <- colnames(dfj)
                dfj <- tryCatch(data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows),warning=function(w) 'w')
                if (is.character(dfj)) dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows,fill = TRUE)
                if (cat){
                    filemsg <- paste0(' ',crayon::blue(paste0(do::equal_length(noext,nchar = filemaxn),
                                                              '(',do::equal_length(ncol(dfj),nchar = max(nchar(ncol(dfj)),variablemaxn)),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
                    cat(filemsg)
                }
                if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+4))
                catviriablen <- catviriablen+1
                variableorder <- unique(c('Year','seqn',variableorder,colnames(dfj)))
                variableorder[variableorder %in% c('drxiline','dr1iline','dr2iline')] <- 'line'
                all_joint <- unique(c('seqn',all_joint))
            }else{
                (variable <- do::Replace0(variable,' '))
                (variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)]))
                variable <- paste0(tolower(do::Replace0(variable,':.*')),':',do::Replace0(variable,'.*:'))
                variableorder <- unique(c('Year','seqn',variableorder,do::Replace0(variable,'.*:')))
                (ck <- sapply(tolower(variable), function(ii) any(unique(unlist(strsplit(do::Replace0(ii,':.*'),','))) %in% colnames(dfj))))
                if (!any(ck)){
                    (holdnamej <- holdname[files==filesi[j]])
                    files[files==filesi[j]] <- 'novariable'
                    # if (cat) cat(paste0(' ',crayon::red(paste0(noext,'(0)'))))
                    if (cat){
                        filemsg <- paste0(' ',crayon::red(paste0(do::equal_length(noext,nchar = filemaxn),'(',do::equal_length('0',nchar = max(nchar(ncol(dfj)),variablemaxn) ),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
                        cat(filemsg)
                    }
                    if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+4))
                    catviriablen <- catviriablen+1
                    next(j)
                }
                (names <- unique(variable[ck]))
                (allfrom <- tolower(do::Replace0(names,':.*')))
                (keyvar <- paste0(excludename,':',excludename))
                (keyvar <- keyvar[sapply(keyvar, function(keyi) do::Replace0(keyi,':.*') %in% colnames(dfj))])
                (names <- names[!sapply(allfrom, function(l) any(excludename %in% unique(strsplit(l,',')[[1]])))])
                (names <- c(keyvar,names))
                dfj <- tryCatch(data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows),warning=function(w) 'w')
                if (is.character(dfj)) dfj <- data.table::fread(filesi[j],showProgress = FALSE,nrows=nrows,fill = TRUE)
                for (k in 1:length(names)){
                    if (k==1) keepnames <- rep(TRUE,length(names))
                    (fromk <- do::Replace0(tolower(names[k]),':.*') |> strsplit(',') |> unlist() |> unique())
                    (tok <- do::Replace0(names[k],'.*:'))
                    if (any(fromk %in% colnames(dfj))){
                        (fromkj <- fromk[fromk %in% colnames(dfj)])
                        colnames(dfj)[colnames(dfj) %in% fromk] <- tok
                        # build variable for new and old
                        varlabelk <- nhs_varLabel(filesi[j])
                        varlabelk=varlabelk[varlabelk$variable==fromkj,c('file','label')]
                        labelkj <- varlabelk[1,2]
                        names(labelkj) <- varlabelk[1,1]
                        if (is.null(varLabeldata)){
                            varLabeldata <- data.frame(rename=tok,
                                                       'NHANES name'=I(list(fromkj)),
                                                       label=I(list(labelkj)),check.names = FALSE)
                        }else{
                            if (tok %in% varLabeldata[,1]){
                                varLabeldata[varLabeldata[,1]==tok,'NHANES name'][[1]] <- list(unique(c(varLabeldata[varLabeldata[,1]==tok,'NHANES name'][[1]],fromkj)))
                                varLabeldata[varLabeldata[,1]==tok,'label'][[1]] <- list(c(varLabeldata[varLabeldata[,1]==tok,'label'][[1]],labelkj))
                            }else{
                                varlabelj <- data.frame(rename=tok,
                                                        'NHANES name'=I(list(fromkj)),
                                                        label=I(list(labelkj)),check.names = FALSE)
                                varLabeldata <- rbind(varLabeldata,varlabelj)
                            }
                        }

                    }else{
                        keepnames[k] <- FALSE
                    }
                }
                (nm <- do::Replace0(names,'.*:')[keepnames])

                dfj <- dfj[,nm,with=FALSE] |> unique()
                if (!data.table::is.data.table(dfj)){
                    dfj <- data.table::as.data.table(dfj)
                    colnames(dfj) <- nm
                }
                if (cat){
                    filemsg <- paste0(' ',crayon::blue(paste0(do::equal_length(noext,nchar = filemaxn),'(',do::equal_length(ncol(dfj),nchar = max(nchar(ncol(dfj)),variablemaxn)),',',crayon::magenta(prepare_years(filesi[j],range = FALSE)),')')))
                    cat(filemsg)
                }

                if (catviriablen %% 3 == 0) if (cat) cat('\n',do::rep_n(' ',holdmaxn+itemmaxn+3))
                catviriablen <- catviriablen+1
            }
            colnames(dfj)[colnames(dfj) %in% c('drxiline','dr1iline','dr2iline')] <- 'line'
            head(dfj)
            # codebook
            if (codebook){
                (ckbkf <- do::Replace(filesi[j],'\\.tsv','.codebook'))
                if (file.exists(ckbkf)){
                    ckbk <- read.delim(ckbkf,comment.char = '#')
                    if (nrow(ckbk)>1){
                        if (lower_cd) ckbk$label <- tolower(ckbk$label)
                        ckbk$variable <- do::Trim(ckbk$variable)
                        ckbk$label <- do::Trim(ckbk$label)
                        ckbk$label <- do::Trim(ckbk$label,'`')
                        ckbk$label <- do::Trim(ckbk$label,',')
                        ckbk$label[ckbk$label=="very much, or"] <- "very much"
                        ckbk$code <- do::Trim(ckbk$code)
                        head(ckbk)
                        if (length(variable)>0){
                            variable <- do::Replace0(variable,' ')
                            variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)])
                            (select <- do::Replace0(variable,':.*') |>
                                    strsplit(',') |> unlist() |> unique())
                            exselect <- uncodebook[files==filesi[j]] |> unlist() |> do::Replace0(':.*') |> lapply(function(ui) strsplit(ui,',|:')) |> unlist() |> unique()
                            select <- select[!select %in% exselect]
                            ckbk <- ckbk[ckbk$variable %in% select,]
                            for (k in 1:length(variable)) {
                                # replace old variable to new variable
                                fromk <- do::Replace0(variable[k],':.*') |> strsplit(',') |> unlist() |> unique()
                                tok <- do::Replace0(variable[k],'.*:')
                                ckbk[ckbk$variable %in% fromk,'variable'] <- tok
                            }
                        }
                        ckbk <- ckbk[ckbk$variable %in% colnames(dfj),]
                        (ck <- nrow(ckbk)>1)
                        if (ck){
                            for (k in unique(ckbk$variable)) {
                                k <- do::Replace0(k,' ')
                                code <- ckbk[ckbk$variable == k,]
                                code[,'label'] <- do::Replace(code[,'label'],' {2,}',' ')
                                code[,'label'] <- do::Replace(code[,'label'],' {0,}\n {0,}',' ')
                                dfjk <- dfj[[k]]
                                for (cd in 1:nrow(code)) {
                                    cdjd <- dfjk == code[cd,'code']
                                    cdjd[is.na(cdjd)] <- FALSE
                                    dfjk[cdjd] <- code[cd,'label']
                                }
                                dfj[[k]] <- dfjk
                            }
                        }
                    }
                }
            }
            head(dfj)
            # add varLabel
            if (varLabel){
                (labefile <- do::Replace(filesi[j],'\\.tsv','.varLabel'))
                if (file.exists(labefile)){
                    labelj <- read.delim(labefile,comment.char = '#')
                    if (length(variable)>0){
                        variable <- do::Replace0(variable,' ')
                        variable[!grepl(':',variable)] <- paste0(variable[!grepl(':',variable)],':',variable[!grepl(':',variable)])
                        (select <- do::Replace0(variable,':.*') |>
                                strsplit(',') |> unlist() |> unique())
                        for (k in 1:length(variable)) {
                            # replace old variable to new variable
                            fromk <- do::Replace0(variable[k],':.*') |> strsplit(',') |> unlist() |> unique()
                            tok <- do::Replace0(variable[k],'.*:')
                            labelj[labelj$variable %in% fromk,'variable'] <- tok
                        }
                    }
                    ck <- labelj$variable %in% colnames(dfj)
                    (labelj <- labelj[ck,c('variable','label')])
                    if (nrow(labelj)>0){
                        dfj <- sprintf('"%s" = "%s"',labelj$variable,labelj$label) |>
                            paste0(collapse = ', ') |>
                            sprintf(fmt = 'expss::apply_labels(dfj,%s)') |>
                            parse(file='',n=NULL) |>
                            eval()

                    }
                }
            }
            if (lower_cd) for (il in 1:ncol(dfj)) if (is.character(dfj[,il])) dfj[,il] <- tolower(dfj[,il])
            key <- excludename
            key[grepl('line',key)] <- 'line'
            key <- unique(key)
            if (is.null(data[[prepare_years(filesi[j])]])){
                data[[prepare_years(filesi[j])]] <- dfj
            }else{
                left_joint <- key[key %in% colnames(data[[prepare_years(filesi[j])]])]
                right_joint <- key[key %in% colnames(dfj)]
                (left2 <- do::right(left_joint,2))
                (right2 <- do::right(right_joint,2))
                (left_joint <- left_joint[left2 %in% right2])
                (right_joint <- right_joint[right2 %in% left2])
                iff_joints <- c('seqn',
                                'line',
                                'dr1ifdcd','dr2ifdcd','drdifdcd',
                                'dr1mc','dr2mc')
                if ((length(left_joint) %in% c(3,4)) &
                    (length(right_joint) %in% c(3,4)) &
                    all(left_joint %in% iff_joints) &
                    all(right_joint %in% iff_joints)){
                    # seqn, iff, fdcd cannot joint two iff files
                    left_joint = right_joint <- c('seqn','line')
                }else if(length(left_joint) != length(right_joint)){
                    stop('connect me')
                }
                all_joint <- unique(c(all_joint,left_joint,right_joint))
                joint <- paste0(sprintf("'%s'",left_joint),'=',sprintf("'%s'",right_joint)) |>
                    paste0(collapse = ',') |> sprintf(fmt = 'c(%s)')
                ps <- parse(text=sprintf("data[[prepare_years(filesi[j])]] <- suppressMessages(dplyr::%s_join(data[[prepare_years(filesi[j])]],dfj,by=%s))",join,joint))
                eval(ps)
            }
        }
    }
    varLabeldata[,2] <- sapply(varLabeldata[,2], function(i){
        (namei <- do::unique_no.NA(i))
        if (length(namei)==0){
            ''
        }else{
            paste0(namei,collapse = ', ')
        }
    })
    varLabeldata[,3] <- sapply(varLabeldata[,3], function(i){
        (varlabeli <- do::unique_no.NA(i))
        if (length(varlabeli)==0){
            ''
        }else if (length(varlabeli)==1){
            varlabeli
        }else{
            sapply(varlabeli, function(j) sprintf('[%s] %s',paste0(names(i)[i==j],collapse = ','),j)) |>
                paste0(collapse = '<br>')
        }
    })

    files <- files[files != 'novariable']
    names(data) <- yearu
    data <- data[!sapply(data, is.null)]
    # add target
    target <- data.table::fread(paste0(get_config_path(TRUE),'varLabel.txt'),showProgress = FALSE,data.table = FALSE)[,c("year", "item", "file", "variable", "target","url")]
    tsv <- do::Replace0(files,get_config_path(T),'\\.tsv')
    ck <- paste0_columns(target[,c("year","item","file")],'/') %in% tsv
    target <- target[ck,]
    row.names(target) <- NULL
    # cat output
    if (cat) cat(crayon::red('\n\nOutput\n'))
    if (length(data)==0){
        if (cat) cat('\nTime: ',time_diff(Sys.time(),t1),'\n')
        return('no data selected')
    }else{
        for (i in 1:length(data)) {
            data[[i]] <- cbind(Year=names(data)[i],data[[i]])
        }
        df <- as.data.frame(do.call(plyr::rbind.fill,data),check.names=FALSE)
        vo <- seqn_by(x = colnames(df),unique(c('Year',unique(key),variableorder)))
        df <- df[,vo]
        eval(parse(text=sprintf('df <- df[order(%s),]',paste0(paste0('df$',set::and(c('Year',excludename),colnames(df))),collapse = ','))))
        rownames(df) <- NULL
        if (!Year) df <- drop_col(df,'Year')
        if (cat) cat('Data Type: data.frame',paste0('(',paste0(dim(df),collapse = ','),')\n'))
        if (cat) cat('Final Years Cycle:',length(data))
        if (cat) cat('\nTime: ',time_diff(Sys.time(),t1),'\n')
        attr(df,"target") <- target
        attr(df,'varnameLabel') <- varLabeldata
        attr(df,'files') <- files
        return(df)
    }
}


time_diff <- function(t1,t2){
    diff <- as.numeric(t1)-as.numeric(t2)
    if (diff<60){
        dif <- round(diff/1,2)
        p <- paste(dif,ifelse(dif==1,'second','seconds'))
    }else if (diff>=60 & diff < 60*60){
        dif <- round(diff/60,2)
        p <- paste(dif,ifelse(diff==1,'minute','minutes'))
    }else if (diff>=60*60 & diff < 60*60*24){
        dif <- round(diff/60/60,2)
        p <- paste(dif,ifelse(diff==1,'hour','hours'))
    }else if (diff>=60*60*24 & diff < 60*60*24*365){
        dif <- round(diff/60/60/24,2)
        p <- paste(dif,ifelse(diff==1,'day','days'))
    }else if (diff>=60*60*24*365 & diff < 60*60*24*365*100){
        dif <- round(diff/60/60/24/365,2)
        p <- paste(dif,ifelse(diff==1,'year','years'))
    }else{
        dif <- round(diff/60/60/24/365/100,2)
        p <- paste(dif,ifelse(diff==1,'century','centurys'))
    }
    p
}
yikeshu0611/nhanesR documentation built on Jan. 29, 2022, 6:08 a.m.