R/varLabel_url.R

Defines functions dl varLab_NULL varLabel_url

varLabel_url <- function(url,file){
    if (do::file.name(file)=='rxq_drug.varLabel'){
        wait <- TRUE
        while (wait){
            html <- tryCatch(xml2::read_html(url), error=function(e) 'e')
            wait <- ifelse(is.character(html),TRUE,FALSE)
        }
        firs_publish <- html |>
            rvest::html_elements(xpath = '//div[@id="PageHeader"]//h5') |>
            set::grep_and('First Published') |>
            rvest::html_text() |>
            do::fmt(x = '#/ ') |>
            do::Replace0('\t','\n','\r')
        last_revise <- html |>
            rvest::html_elements(xpath = '//div[@id="PageHeader"]//h5') |>
            set::grep_and('Last Revised') |>
            rvest::html_text() |>
            do::fmt(x = '#/ ') |>
            do::Replace0('\t','\n','\r')
        df <- html |>
            rvest::html_elements(xpath = '//div[@id="Sections"]') |>
            rvest::html_table() |>
            do::list1() |>
            as.data.frame()
        colnames(df) <- tolower(colnames(df))
        for (i in 1:ncol(df)) {
            df[,i] <- tolower(df[,i])
        }
        df <- df[,c("variable name","label")]
        df <- df[nchar(df[[1]])>0,]
        colnames(df)[1] <- 'variable'
        df <- cbind(df,url)
        suppressWarnings(write.table(firs_publish,file,row.names = FALSE,col.names = FALSE,quote = FALSE))
        suppressWarnings(write.table(last_revise,file,row.names = FALSE,col.names = FALSE,append = TRUE,quote = FALSE))
        suppressWarnings(write.table(df,file,append = TRUE,sep = '\t',row.names = FALSE))
        invisible('ok')
    }else{
        if (tools::file_ext(url)=='pdf'){
            pdf <- paste0(do::Replace0(file,tools::file_ext(file)),'pdf')
            if (file.exists(pdf)) file.remove(pdf)
            if (file.exists(pdf))  unlink(pdf,force = TRUE)
            cat(crayon::bgWhite(' pdf'))
            nullcon <- file(nullfile(), open = "wb")
            sink(nullcon, type = "message")
            # download
            download.file(url,pdf)
            sink(type = "message")
            close(nullcon)
            if (!file.exists(file)) varLab_NULL("#firs_publish:pdf",'#last_revise:pdf',file)
            return(invisible('pdf'))
        }
        wait <- TRUE
        while (wait) {
            html <- tryCatch(xml2::read_html(url), error=function(e) 'e')
            wait <- ifelse(is.character(html),TRUE,FALSE)
        }
        firs_publish <- html |>
            rvest::html_elements(xpath = '//div[@id="PageHeader"]//h5') |>
            set::grep_and('First Published') |>
            rvest::html_text() |>
            do::fmt(x = '#/ ') |>
            do::Replace0('\t','\n','\r')
        last_revise <- html |>
            rvest::html_elements(xpath = '//div[@id="PageHeader"]//h5') |>
            set::grep_and('Last Revised') |>
            rvest::html_text() |>
            do::fmt(x = '#/ ') |>
            do::Replace0('\t','\n','\r')
        codebook <- html |>
            rvest::html_elements(xpath = '//div[@id="Codebook"]//div[@class="pagebreak"]')|>
            set::grep_and(c('dl','table'))
        codebook
        if (do::file.name(file) %in% c('p_imq.varLabel','imq_j.varLabel','alb_cr_g.varLabel')){
            codebook <-  html |>
                rvest::html_elements(xpath = '//div[@id="Codebook"]//div[@class="pagebreak"]')
        }
        if (length(codebook)==0){
            varLab_NULL(firs_publish,last_revise,file)
            return(invisible('no codebook'))
        }
        df <- codebook |>
            rvest::html_elements(xpath = 'dl') |>
            lapply(dl) |>
            do.call(what = plyr::rbind.fill)
        df

        if (nrow(df)==0){
            varLab_NULL(firs_publish,last_revise,file)
            return(invisible('no varLabel'))
        }else{
            df <- cbind(df,url)
            suppressWarnings(write.table(firs_publish,file,row.names = FALSE,col.names = FALSE,quote = FALSE))
            suppressWarnings(write.table(last_revise,file,row.names = FALSE,col.names = FALSE,append = TRUE,quote = FALSE))
            suppressWarnings(write.table(df,file,append = TRUE,sep = '\t',row.names = FALSE))
            invisible('ok')
        }
    }
}


varLab_NULL <- function(firs_publish,last_revise,file){
    df <- data.frame(variable=1,label=2)
    df <- df[-c(1:nrow(df)),]
    suppressWarnings(write.table(firs_publish,file,row.names = FALSE,col.names = FALSE,quote = FALSE))
    suppressWarnings(write.table(last_revise,file,row.names = FALSE,col.names = FALSE,append = TRUE,quote = FALSE))
    suppressWarnings(write.table(x = df,file = file,sep = '\t',eol = '\n',row.names = FALSE,append = TRUE))
}
dl <- function(li){
    title <- li |>
        rvest::html_elements('dt') |>
        rvest::html_text(TRUE) |>
        do::Trim(':') |>
        tolower() |>
        do::Replace(' {1,}',' ')
    title[title == "variable name"] <- 'variable'
    title[title == "sas label"] <- 'label'
    title[title == "english text"] <- 'description'
    title[title == "english instructions"] <- 'instructions'
    title
    cont <- li |>
        rvest::html_elements('dd')|>
        rvest::html_text(TRUE) |>
        do::Trim(':') |>
        tolower() |>
        do::Replace(' {1,}',' ') |>
        do::Replace0('\r','\n','\t')
    if (anyDuplicated(title)){
        duptitle <- names(table(title))[table(title) >1]
        for (i in duptitle) {
            ck <- which(title==i)
            dupcont <- paste0(cont[ck],collapse = ';\n')
            title <- title[-ck[-1]]
            cont <- cont[-ck[-1]]
            cont[ck[1]] <- dupcont
        }
    }
    matrix(cont,nrow = 1,dimnames = list(NULL,title)) |>
        data.frame(check.names = FALSE)
}
yikeshu0611/nhanesR documentation built on Jan. 29, 2022, 6:08 a.m.