R/scrape_docs.R

Defines functions scrape_field_value_annotations

#' @title
#' Scrape UMLS Documentation into Tables
#' @description
#' These functions scrape the UMLS Web Documentation to write the \code{\link{mth_documentation_functions}}. They have been kept to enable reproducibility and other long-term maintenance efforts.
#' @keywords internal
#' @importFrom xml2 read_html
#' @importFrom rvest html_nodes html_table
#' @importFrom purrr map map2 pluck
#' @importFrom stringr str_replace_all
#' @importFrom dplyr mutate_all
#' @importFrom broca make_trbl
#' @importFrom secretary press_enter
#' @name documentation_scraping_functions
#' @noRd
NULL

#' @title
#' Scrape Field Value Annotations
#' @inherit documentation_scraping_functions title description
#' @noRd

scrape_field_value_annotations <-
        function() {
                Sys.sleep(5)
                origin <- xml2::read_html("https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/release/abbreviations.html")

                tables <-
                        origin %>%
                        rvest::html_nodes("table") %>%
                        rvest::html_table()

                names(tables) <-
                tables %>%
                        purrr::map(~colnames(.)[1]) %>%
                        purrr::map(stringr::str_replace_all, "[\r\n\t]", " ") %>%
                        stringr::str_replace_all(pattern = "(^.*?)( |[(]{1})(.*$)",
                                                 replacement = "\\1")
                tables <-
                        tables %>%
                        purrr::map(function(x) x %>%
                                                dplyr::mutate_all(stringr::str_replace_all, "[\r\n\t]", " ")) %>%
                        purrr::map(function(x) x %>%
                                           dplyr::mutate_all(stringr::str_replace_all, "[ ]{2,}", " ")) %>%
                        purrr::map(dplyr::mutate_all, as.character) %>%
                        purrr::map(dplyr::mutate_all, trimws, which = "both")

                tribble_part <- list()
                for (i in 1:length(tables)) {
                                tribble_part[[i]] <-
                                       capture.output(broca::make_trbl(tables[[i]]))
                }
                names(tribble_part) <- names(tables)
                tribble_part <-
                        tribble_part %>%
                        purrr::map(function(x) c("tibble::tribble(",
                                                 x,
                                                 ")")) %>%
                        purrr::map(~paste(., collapse = "\n"))
                # Add names
                output <-
                tribble_part %>%
                        purrr::map2(names(tribble_part),
                                    function(x,y) paste0(y , " = ", x))

                for (i in 1:length(output)) {
                        cat(output[[i]])
                        secretary::press_enter()
                }
        }


#' @title
#' Scrape Vocabulary Annotations
#' @inherit documentation_scraping_functions title description
#' @noRd

scrape_vocabulary_annotations <-
        function() {

                Sys.sleep(5)
                origin <- xml2::read_html("https://www.nlm.nih.gov/research/umls/sourcereleasedocs/index.html")

                tables <-
                        origin %>%
                        rvest::html_nodes("table") %>%
                        rvest::html_table() %>%
                        purrr::pluck(1)

                broca::make_trbl(tables)
        }
meerapatelmd/metaorite documentation built on Dec. 26, 2021, 2:20 a.m.