skyscraper: Scrape Clinical Drug Data

Documented in drug_count get_dictionary_and_links get_drug_link_synonym log_drug_count

#' @title
#' CancerGov Internal Functions
#'
#' @description
#' This is an internal function to the `_run*` function, which is part of the family of functions that scrape, parse, and store the NCI Drug Dictionary found at CancerGov.org and any correlates to the NCI Thesaurus in a Postgres Database. Use \code{\link{cg_run}} to run the full sequence. See details for more info.
#'
#' @name cancergov_internal
NULL

#' @title
#' Get the Drug Count in the Drug Dictionary
#'
#' @inherit cancergov_internal description
#'
#' @details
#' Retrieve the total number of drugs in the NCI Drug Dictionary from the Drug Dictionary API (\url{https://webapis.cancer.gov/drugdictionary/v1/index.html#/Drugs/Drugs_GetByName})
#'
#' @param size The number of records to retrieve.
#' @param crawl_delay Delay in seconds.
#' @return
#' Drug count as integer
#'
#' @seealso
#'  \code{\link[httr]{GET}},\code{\link[httr]{content}}
#'  \code{\link[jsonlite]{toJSON, fromJSON}}
#' @rdname drug_count
#' @export
#' @importFrom httr GET content
#' @importFrom jsonlite fromJSON

drug_count <-
        function(size = 10000,
                 crawl_delay = 5) {

                Sys.sleep(crawl_delay)

                response <- httr::GET(url = "https://webapis.cancer.gov/drugdictionary/v1/Drugs",
                                      query = list(size = size,
                                                   includeResourceTypes = "DrugTerm"))
                parsed <- httr::content(x = response,
                                        as = "text",
                                        encoding = "UTF-8")
                df <- jsonlite::fromJSON(txt = parsed)

                df$meta$totalResults

        }


#' @title
#' Log the Drug Count in the Drug Dictionary
#'
#' @description
#' Log the drug count scraped by \code{\link{drug_count}} with a timestamp to the Drug Dictionary Log Table and also receive a time difference and count comparison between the most recent count and the current count in the R console.
#'
#' @inheritParams cg_run
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[secretary]{typewrite}}
#'  \code{\link[tibble]{tibble}}
#' @rdname log_drug_count
#' @export
#' @importFrom pg13 query appendTable
#' @importFrom secretary typewrite
#' @importFrom tibble tibble


log_drug_count <-
        function(conn,
                 verbose = TRUE,
                 render_sql = TRUE,
                 crawl_delay = 5) {


                nci_dd_count <- drug_count(crawl_delay = crawl_delay)

                most_recent_count <-
                        pg13::query(conn = conn,
                                    sql_statement =
                                                "
                                                SELECT ddl.ddl_datetime, ddl.drug_count
                                                FROM cancergov.drug_dictionary_log ddl
                                                WHERE ddl.ddl_datetime IN (
                                                                SELECT MAX(ddl_datetime)
                                                                FROM cancergov.drug_dictionary_log
                                                )
                                                "
                                            )

                if (verbose) {
                        time_difference <- as.character(signif(Sys.time()-most_recent_count$ddl_datetime, digits = 3))
                        time_difference_units <- units(Sys.time()-most_recent_count$ddl_datetime)
                        secretary::typewrite(sprintf("Last drug count was %s and from %s %s ago",
                                                                most_recent_count$drug_count,
                                                                time_difference,
                                                                time_difference_units))

                        secretary::typewrite("Current drug count:", nci_dd_count)
                }


                pg13::appendTable(conn = conn,
                                  schema = "cancergov",
                                  tableName = "DRUG_DICTIONARY_LOG",
                                  data = tibble::tibble(ddl_datetime = Sys.time(),
                                                 drug_count = nci_dd_count))


        }


#' @title
#' Scrape the Drug Definitions and Links from the NCI Drug Dictionary
#'
#' @inherit cg_run description
#' @inheritSection cg_run Web Source Types
#' @inheritSection cg_run Drug Dictionary
#' @inheritParams cg_run
#'
#' @details
#' Scrapes the Definitions and the links to each Drug Page at the main Drug Dictionary pages in  the \url{https://www.cancer.gov/publications/dictionaries/cancer-drug}\emph{{i}} and stores the parsed response to the Drug Dictionary and Drug Link Tables, respectively.
#'
#' @return
#' Any differences found between the scraped data and the existing data in the Drug Dictionary and Drug Link Tables are appended to their respective tables with the local timestamp.
#'
#'
#' @seealso
#'  \code{\link[pg13]{brake_closed_conn}},\code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[secretary]{typewrite_progress}},\code{\link[secretary]{c("typewrite", "typewrite")}},\code{\link[secretary]{character(0)}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_text}}
#'  \code{\link[tibble]{tibble}}
#'  \code{\link[dplyr]{bind}},\code{\link[dplyr]{mutate}},\code{\link[dplyr]{distinct}}
#' @rdname get_dictionary_and_links
#' @export
#' @importFrom pg13 brake_closed_conn query appendTable
#' @importFrom secretary typewrite_progress typewrite silverTxt
#' @importFrom rvest html_nodes html_text html_children html_attr
#' @importFrom tibble tibble
#' @importFrom dplyr bind_rows mutate distinct transmute



get_dictionary_and_links <-
        function(conn,
                 max_page = 50,
                 sleep_time = 3,
                 verbose = TRUE,
                 render_sql = TRUE,
                 crawl_delay = 5,
                 size = 10000) {

                pg13::brake_closed_conn(conn = conn)

                Sys.sleep(crawl_delay)

                response <- httr::GET(url = "https://webapis.cancer.gov/drugdictionary/v1/Drugs",
                                      query = list(size = size))

                parsed <- httr::content(x = response,
                                        as = "text",
                                        encoding = "UTF-8")
                df <- jsonlite::fromJSON(txt = parsed)


                # Definitions
                drug_dictionary_table <-
                df$results %>%
                        dplyr::filter(!is.na(definition)) %>%
                        dplyr::transmute(drug = name,
                                         definition = definition$text) %>%
                        dplyr::distinct()


                drug_link_table <-
                        dplyr::bind_rows(
                                df$results %>%
                                        dplyr::transmute(drug = name,
                                                         drug_link = sprintf("https://www.cancer.gov/publications/dictionaries/cancer-drug/def/%s", prettyUrlName)),
                                df$results %>%
                                        dplyr::transmute(drug = name,
                                                         drug_link = sprintf("https://www.cancer.gov/publications/dictionaries/cancer-drug/def/%s", termId))) %>%
                        dplyr::distinct()

                # pg13::appendTable()
                #
                # drug_dictionary <- list()
                # drug_link <- list()
                #
                # for (i in 1:max_page) {
                #
                #         page_url <- sprintf("https://www.cancer.gov/publications/dictionaries/cancer-drug?expand=ALL&page=%s", i)
                #
                #         if (verbose) {
                #
                #                 secretary::typewrite_progress(iteration = i,
                #                                               total = max_page)
                #                 secretary::typewrite(secretary::silverTxt("Page:"), i)
                #
                #
                #         }
                #
                #         page_scrape <- scrape(page_url)
                #
                #
                #         no_data_message <-
                #                 page_scrape %>%
                #                 rvest::html_nodes("#ctl36_ctl00_resultListView_ctrl0_lblNoDataMessage") %>%
                #                 rvest::html_text()
                #
                #         if (length(no_data_message) == 0) {
                #
                #                 drugs <-
                #                         page_scrape %>%
                #                         rvest::html_nodes(".dictionary-list a")  %>%
                #                         rvest::html_text() %>%
                #                         grep(pattern = "[\r\n]",
                #                              invert = FALSE,
                #                              value = TRUE) %>%
                #                         trimws(which = "both")
                #
                #                 definitions <-
                #                         page_scrape %>%
                #                         rvest::html_nodes(".dictionary-list .definition")  %>%
                #                         rvest::html_text() %>%
                #                         trimws(which = "both")
                #
                #
                #
                #                 drug_dictionary[[i]] <-
                #                         tibble::tibble(drug = drugs,
                #                                        definition = definitions)
                #
                #                 drug_def_link <-
                #                         page_scrape %>%
                #                         rvest::html_nodes("dfn") %>%
                #                         rvest::html_children() %>%
                #                         rvest::html_attr(name = "href")
                #
                #                 drug_names <-
                #                         page_scrape %>%
                #                         rvest::html_nodes("dfn") %>%
                #                         rvest::html_text() %>%
                #                         trimws()
                #
                #                 drug_link[[i]] <-
                #                         tibble::tibble(drug= drug_names,
                #                                        drug_link = drug_def_link)
                #
                #
                #         }
                # }
                #
                #
                # drug_link_table <-
                #         dplyr::bind_rows(drug_link) %>%
                #         dplyr::mutate(drug_link = paste0("https://www.cancer.gov", drug_link)) %>%
                #         dplyr::distinct()


                if (verbose) {
                        secretary::typewrite("Updating DRUG_DICTIONARY Table with diffs")
                }

                        write_cg_staging_tbl(conn = conn,
                                             tableName = "drug_dictionary_staging",
                                             data = drug_dictionary_table,
                                             verbose = verbose,
                                             render_sql = render_sql)


                        add_to_drug_dictionary <-
                                pg13::query(conn = conn,
                                            sql_statement =
                                                    "SELECT ndd.drug, ndd.definition
                                                FROM cancergov.drug_dictionary_staging ndd
                                                LEFT JOIN cancergov.drug_dictionary dd
                                                ON dd.drug = ndd.drug
                                                        AND dd.definition = ndd.definition
                                                WHERE dd.dd_datetime IS NULL;",
                                            warn_no_rows = TRUE) %>%
                                dplyr::transmute(dd_datetime = Sys.time(),
                                                 drug,
                                                 definition) %>%
                                dplyr::distinct()


                        pg13::appendTable(conn = conn,
                                          schema = "cancergov",
                                          tableName = "drug_dictionary",
                                          data = add_to_drug_dictionary)


                        if (verbose) {
                                secretary::typewrite("Updating DRUG_LINK Table with diffs")
                        }

                        write_cg_staging_tbl(conn = conn,
                                             tableName = "new_drug_link",
                                             data = drug_link_table,
                                             verbose = verbose,
                                             render_sql = render_sql)

                        add_to_drug_link <-
                                pg13::query(conn = conn,
                                            sql_statement =
                                                    "SELECT ndl.drug, ndl.drug_link
                                                FROM cancergov.new_drug_link ndl
                                                LEFT JOIN cancergov.drug_link dl
                                                ON dl.drug = ndl.drug
                                                        AND dl.drug_link = ndl.drug_link
                                                WHERE dl.dl_datetime IS NULL;",
                                            warn_no_rows = TRUE) %>%
                                dplyr::transmute(dl_datetime = Sys.time(),
                                                 drug,
                                                 drug_link) %>%
                                dplyr::distinct()

                        pg13::appendTable(conn = conn,
                                          schema = "cancergov",
                                          tableName = "drug_link",
                                          data = add_to_drug_link)


        }




#' @title
#' Get the Synonyms found at a given Drug Link
#'
#' @inherit cg_run description
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams cg_run
#' @param response (Optional) The response returned when the url supplied as the `drug_link` is parsed. If not provided, a new response is received. This is an option to reduce repetitive scrapes of the same URL if multiple sections of the same page are being parsed.
#'
#' @seealso
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[xml2]{read_xml}},\code{\link[xml2]{xml_find_all}},\code{\link[xml2]{xml_replace}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_table}}
#'  \code{\link[dplyr]{bind}},\code{\link[dplyr]{mutate}},\code{\link[dplyr]{filter_all}}
#'  \code{\link[tidyr]{separate_rows}}
#' @rdname get_drug_link_synonym
#' @export
#' @importFrom SqlRender render
#' @importFrom pg13 query appendTable
#' @importFrom xml2 read_html xml_find_all xml_add_sibling xml_remove
#' @importFrom rvest html_nodes html_table
#' @importFrom dplyr bind_rows transmute filter_at
#' @importFrom tidyr separate_rows

get_drug_link_synonym <-
        function(conn,
                 drug_link,
                 response,
                 sleep_time = 5,
                 expiration_days = 30,
                 verbose = TRUE,
                 render_sql = TRUE) {

                # drug_link <- "https://www.cancer.gov/publications/dictionaries/cancer-drug/def/bispecific-antibody-mdx-h210"

                sql_statement <-
                        SqlRender::render(
                                "
                                with new AS (
                                        SELECT *
                                                FROM cancergov.drug_link_synonym dls
                                        WHERE dls.drug_link = '@drug_link'
                                                AND dls.dls_datetime IS NULL
                                        ORDER BY RANDOM()
                                ),
                                expired AS (
                                SELECT *
                                        FROM cancergov.drug_link_synonym dls
                                WHERE dls.drug_link = '@drug_link'
                                        AND (DATE_PART('day', LOCALTIMESTAMP(0)-dls_datetime)::integer < @expiration_days)
                                ORDER BY RANDOM()
                                )

                                SELECT *
                                FROM new
                                UNION
                                SELECT *
                                FROM expired
                                ;
                                ",
                                drug_link = drug_link,
                                expiration_days = expiration_days)

                current_entry <-
                pg13::query(conn = conn,
                            sql_statement = sql_statement,
                            verbose = verbose,
                            render_sql = render_sql)

                proceed <- nrow(current_entry) == 0


                if (proceed) {


                        if (missing(response)) {

                                response <-
                                        scrape(drug_link)


                                if (is.null(response)) {

                                        Sys.sleep(sleep_time)

                                        response <-
                                                scrape(drug_link)



                                        Sys.sleep(sleep_time)

                                } else {

                                        Sys.sleep(sleep_time)
                                }
                        }



                        if (!is.null(response)) {


                                xml2::xml_find_all(response, ".//br") %>% xml2::xml_add_sibling("p", "\n")

                                xml2::xml_find_all(response, ".//br") %>% xml2::xml_remove()


                                results <-
                                       tryCatch(
                                                response %>%
                                                        rvest::html_nodes("table") %>%
                                                        rvest::html_table(),
                                                error = function(e) NULL
                                        )


                                if (length(results)) {

                                        output  <-
                                                dplyr::bind_rows(results) %>%
                                                tidyr::separate_rows(X2,
                                                                     sep = "\n") %>%
                                                dplyr::transmute(dls_datetime = Sys.time(),
                                                                 drug_link = drug_link,
                                                                 drug_synonym_type = X1,
                                                                 drug_synonym = X2) %>%
                                                dplyr::filter_at(vars(drug_synonym_type,drug_synonym),
                                                                 ~nchar(.) < 255)


                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "drug_link_synonym",
                                                                  data = output)




                                } else {
                                        output <-
                                                data.frame(dls_datetime = Sys.time(),
                                                           drug_link = drug_link,
                                                           drug_synonym_type = NA,
                                                           drug_synonym = NA)

                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "drug_link_synonym",
                                                                  output)

                                }

                        }

                }
        }


#' @title
#' Get the URLs found in a Drug Link
#'
#' @inherit cg_run description
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams get_drug_link_synonym
#'
#' @seealso
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[xml2]{read_xml}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_text}}
#'  \code{\link[tibble]{tibble}}
#' @rdname get_drug_link_url
#' @export
#' @importFrom SqlRender render
#' @importFrom pg13 query appendTable
#' @importFrom xml2 read_html
#' @importFrom rvest html_nodes html_attr
#' @importFrom tibble tibble

get_drug_link_url <-
        function(conn,
                 drug_link,
                 response,
                 sleep_time = 5,
                 expiration_days = 30,
                 verbose = TRUE,
                 render_sql = TRUE) {

                sql_statement <-
                        SqlRender::render(
                                "
                                SELECT *
                                        FROM cancergov.drug_link_url dlu
                                WHERE dlu.drug_link = '@drug_link'
                                        AND (DATE_PART('day', LOCALTIMESTAMP(0)-dlu_datetime)::integer < @expiration_days)
                                ;
                                ",
                                drug_link = drug_link,
                                expiration_days = expiration_days)

                current_entry <-
                        pg13::query(conn = conn,
                                    sql_statement = sql_statement,
                                    verbose = verbose,
                                    render_sql = render_sql)

                proceed <- nrow(current_entry) == 0

                if (proceed) {


                        if (missing(response)) {

                                response <- scrape(drug_link)

                        }

                        if (!is.null(response)) {

                                results <-
                                        tryCatch(
                                                response %>%
                                                        rvest::html_nodes(".navigation-dark-red") %>%
                                                        rvest::html_attr("href"),
                                                error = function(e) NULL
                                        )



                                if (length(results)) {

                                        output  <-
                                                tibble::tibble(dlu_datetime = Sys.time(),
                                                               drug_link = drug_link,
                                                               drug_link_url = results)


                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "DRUG_LINK_URL",
                                                                  data = output)

                                } else {

                                        output  <-
                                                tibble::tibble(dlu_datetime = Sys.time(),
                                                               drug_link = drug_link,
                                                               drug_link_url = NA)

                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "DRUG_LINK_URL",
                                                                  data = output)

                                }

                        }

                }
        }

#' @title
#' Process the Links found in the Drug Link Table for Synonyms
#' @inherit cg_run description
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams get_drug_link_synonym
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[secretary]{typewrite_progress}},\code{\link[secretary]{c("typewrite", "typewrite")}},\code{\link[secretary]{character(0)}}
#'  \code{\link[xml2]{xml_find_all}},\code{\link[xml2]{xml_replace}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_table}}
#'  \code{\link[dplyr]{bind}},\code{\link[dplyr]{mutate}},\code{\link[dplyr]{filter_all}}
#'  \code{\link[tidyr]{separate_rows}}
#'  \code{\link[tibble]{tibble}}
#' @rdname process_drug_link_synonym
#' @export
#' @importFrom pg13 query appendTable
#' @importFrom SqlRender render
#' @importFrom secretary typewrite_progress typewrite magentaTxt
#' @importFrom xml2 xml_find_all xml_add_sibling xml_remove
#' @importFrom rvest html_nodes html_table
#' @importFrom dplyr bind_rows transmute filter_at
#' @importFrom tidyr separate_rows
#' @importFrom tibble tibble


process_drug_link_synonym <-
        function(conn,
                 sleep_time = 3,
                 expiration_days = 30,
                 verbose = TRUE,
                 render_sql = TRUE,
                 encoding = "",
                 options = c("RECOVER", "NOERROR", "NOBLANKS")) {

                sql_statement <-
                        SqlRender::render(
                                "
                                with new AS (
                                        SELECT
                                                dl.drug, dl.drug_link
                                        FROM cancergov.drug_link dl
                                        LEFT JOIN cancergov.drug_link_synonym dls
                                        ON dls.drug_link  = dl.drug_link
                                        WHERE dls.dls_datetime IS NULL
                                        ORDER BY RANDOM()
                                ),
                                expired AS (
                                SELECT
                                                dl.drug, dl.drug_link
                                        FROM cancergov.drug_link dl
                                        LEFT JOIN cancergov.drug_link_synonym dls
                                        ON dls.drug_link  = dl.drug_link
                                        GROUP BY dl.drug, dl.drug_link
                                        HAVING DATE_PART('day', LOCALTIMESTAMP(0)-MAX(dls.dls_datetime))::integer >= @expiration_days
                                                ORDER BY RANDOM()
                                )

                                SELECT *
                                FROM new
                                UNION
                                SELECT *
                                FROM expired
                                ;
                                ",
                                expiration_days = expiration_days)

                        drug_link_table <-
                                pg13::query(conn = conn,
                                            sql_statement = sql_statement,
                                            verbose = verbose,
                                            render_sql = render_sql)




                drug_links <- unique(drug_link_table$drug_link)


                for (i in seq_along(drug_links)) {


                        #drug_link <- "https://www.cancer.gov/publications/dictionaries/cancer-drug/def/792667"

                        drug_link <- drug_links[i]

                        if (verbose) {

                                secretary::typewrite_progress(iteration = i,
                                                              total = length(drug_links))
                                secretary::typewrite(secretary::magentaTxt("Drug Link:"), drug_link)
                        }


                        response <- scrape(drug_link,
                                           encoding = encoding,
                                           options = options,
                                           sleep_time = sleep_time,
                                           verbose = verbose)




                        if (!is.null(response)) {


                                xml2::xml_find_all(response, ".//br") %>% xml2::xml_add_sibling("p", "\n")

                                xml2::xml_find_all(response, ".//br") %>% xml2::xml_remove()


                                results <-
                                        tryCatch(
                                                response %>%
                                                        rvest::html_nodes("table") %>%
                                                        rvest::html_table(),
                                                error = function(e) NULL
                                        )


                                if (length(results) > 0) {

                                        output  <-
                                                dplyr::bind_rows(results) %>%
                                                tidyr::separate_rows(X2,
                                                                     sep = "\n") %>%
                                                dplyr::transmute(dls_datetime = Sys.time(),
                                                                 drug_link = drug_link,
                                                                 drug_synonym_type = X1,
                                                                 drug_synonym = X2) %>%
                                                dplyr::filter_at(vars(drug_synonym_type,drug_synonym),
                                                                 ~nchar(.) < 255)

                                        pg13::appendTable(conn = conn,
                                                          schema = "cancergov",
                                                          tableName = "drug_link_synonym",
                                                          data = output)



                                } else {
                                        output <-
                                                tibble::tibble(dls_datetime = Sys.time(),
                                                           drug_link = drug_link,
                                                           drug_synonym_type = NA,
                                                           drug_synonym = NA)

                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "drug_link_synonym",
                                                                  data = output)


                                }


                        }


                }

        }

#' @title
#' Process the NCIt CUI from the Drug Link URL Table
#'
#' @inherit cg_run description
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams get_drug_link_synonym
#'
#' @details
#' Unlike the other `process_*` functions, this one uses regex to find the NCI Thesaurus Code, if presence, from the scraped URLs instead of performing any scraping.
#'
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[rubix]{filter_at_grepl}}
#'  \code{\link[tidyr]{extract}}
#'  \code{\link[dplyr]{mutate}}
#' @rdname process_drug_link_ncit
#' @export
#' @importFrom pg13 query appendTable
#' @importFrom SqlRender render
#' @importFrom rubix filter_at_grepl
#' @importFrom tidyr extract
#' @importFrom dplyr transmute


process_drug_link_ncit <-
        function(conn,
                 verbose = TRUE,
                 render_sql = TRUE,
                 expiration_days = 30) {


                sql_statement <-
                SqlRender::render(
                        "WITH new AS (
                                SELECT dl.*
                                FROM cancergov.drug_link dl
                                LEFT JOIN cancergov.drug_link_ncit dln
                                ON dln.drug_link = dl.drug_link
                                WHERE dln.dln_datetime IS NULL
                                ORDER BY RANDOM()
                        ),
                        expired AS (
                                SELECT dl.*
                                FROM cancergov.drug_link dl
                                LEFT JOIN cancergov.drug_link_ncit dln
                                ON dln.drug_link = dl.drug_link
                                GROUP BY dl.drug, dl.drug_link
                                HAVING DATE_PART('day', LOCALTIMESTAMP(0)-MAX(dls.dls_datetime))::integer >= @expiration_days
                                ORDER BY RANDOM()
                        )

                        SELECT *
                        FROM new
                        UNION
                        SELECT *
                        FROM expired
                        ",
                        expiration_days = expiration_days)

                        drug_link_table <-
                                pg13::query(
                                        conn = conn,
                                        sql_statement = sql_statement,
                                        verbose = verbose,
                                        render_sql = render_sql)



                        write_cg_staging_tbl(conn = conn,
                                             tableName = "temp_drug_link_table",
                                             data = drug_link_table,
                                             verbose = verbose,
                                             render_sql = render_sql)


                drug_link_url_table <-
                        pg13::query(conn = conn,
                                    sql_statement =
                                            "
                                        SELECT DISTINCT
                                                dlu.drug_link, dlu.drug_link_url
                                        FROM cancergov.temp_drug_link_table temp
                                        INNER JOIN cancergov.drug_link_url dlu
                                        ON dlu.drug_link = temp.drug_link
                                        ;
                                        ",
                                    verbose = verbose,
                                    render_sql = render_sql
                        )

                results <-
                        drug_link_url_table %>%
                        rubix::filter_at_grepl(drug_link_url,
                                               grepl_phrase = "ncit.nci.nih.gov") %>%
                        tidyr::extract(drug_link_url,
                                       into = "ncit_code",
                                       regex = "^.*?code=(.*$)") %>%
                        dplyr::transmute(dln_datetime = Sys.time(),
                                         drug_link,
                                         ncit_code)

                        pg13::appendTable(conn = conn,
                                          schema = "cancergov",
                                          tableName = "DRUG_LINK_NCIT",
                                          data = results)


        }


#' @title
#' Process the Links found in the Drug Link Table for NCIt and other URLs
#'
#' @inherit cg_run description
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams get_drug_link_synonym
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[secretary]{typewrite_progress}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_text}}
#'  \code{\link[tibble]{tibble}}
#' @rdname process_drug_link_url
#' @export
#' @importFrom pg13 query appendTable
#' @importFrom SqlRender render
#' @importFrom secretary typewrite_progress typewrite magentaTxt
#' @importFrom rvest html_nodes html_attr
#' @importFrom tibble tibble

process_drug_link_url <-
        function(conn,
                 sleep_time = 5,
                 expiration_days = 30,
                 encoding = "",
                 options = c("RECOVER", "NOERROR", "NOBLANKS"),
                 verbose = TRUE,
                 render_sql = TRUE) {

                        sql_statement <-
                                SqlRender::render(
                                        "
                                        WITH new AS (
                                                SELECT
                                                        dl.drug, dl.drug_link
                                                FROM cancergov.drug_link dl
                                                LEFT JOIN cancergov.drug_link_url dlu
                                                ON dlu.drug_link  = dl.drug_link
                                                WHERE dlu_datetime IS NULL
                                                ORDER BY RANDOM()
                                        ),
                                        expired AS (
                                                SELECT
                                                        dl.drug, dl.drug_link
                                                FROM cancergov.drug_link dl
                                                LEFT JOIN cancergov.drug_link_url dlu
                                                ON dlu.drug_link  = dl.drug_link
                                                GROUP BY dl.drug, dl.drug_link
                                                HAVING DATE_PART('day', LOCALTIMESTAMP(0)-MAX(dlu.dlu_datetime))::integer >= @expiration_days
                                                ORDER BY RANDOM()
                                        )


                                        SELECT *
                                        FROM new
                                        UNION
                                        SELECT *
                                        FROM expired
                                        ",
                                        expiration_days = expiration_days)

                        drug_link_table <-
                                pg13::query(conn = conn,
                                            sql_statement = sql_statement,
                                            verbose = verbose,
                                            render_sql = render_sql)




                        drug_links <- unique(drug_link_table$drug_link)


                for (i in seq_along(drug_links)) {


                        # drug_link <- "https://www.cancer.gov/publications/dictionaries/cancer-drug/def/792667"
                        # drug_link <- "https://www.cancer.gov/publications/dictionaries/cancer-drug/def/61cu-atsm"

                        drug_link <- drug_links[i]

                        if (verbose) {

                                secretary::typewrite_progress(iteration = i,
                                                              total = length(drug_links))
                                secretary::typewrite(secretary::magentaTxt("Drug Link:"), drug_link)
                        }


                        response <- scrape(x = drug_link,
                                           encoding = encoding,
                                           options = options,
                                           verbose = verbose,
                                           sleep_time = sleep_time)




                        if (!is.null(response)) {

                                results <-
                                        tryCatch(
                                                response %>%
                                                        rvest::html_nodes(".navigation-dark-red") %>%
                                                        rvest::html_attr("href"),
                                                error = function(e) NULL
                                        )



                                if (length(results) > 0) {

                                        output  <-
                                                tibble::tibble(dlu_datetime = Sys.time(),
                                                               drug_link = drug_link,
                                                               drug_link_url = results)

                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "DRUG_LINK_URL",
                                                                  data = output)


                                } else {

                                        output  <-
                                                tibble::tibble(dlu_datetime = Sys.time(),
                                                               drug_link = drug_link,
                                                               drug_link_url = NA)

                                                pg13::appendTable(conn = conn,
                                                                  schema = "cancergov",
                                                                  tableName = "DRUG_LINK_URL",
                                                                  data = output)

                                }

                        }


                }

        }


#' @title
#' Scrape the NCI Thesaurus
#'
#' @description
#' All NCIt Codes that have not been scraped or were scraped in the expiration period are scraped in the NCIt Thesaurus at the \url{"https://ncithesaurus.nci.nih.gov/ncitbrowser/pages/concept_details.jsf?dictionary=NCI_Thesaurus&code=%s&ns=ncit&type=synonym&key=null&b=1&n=0&vse=null#} path.
#' @inheritSection cg_run Drug Detail Links
#' @inheritParams get_drug_link_synonym
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{appendTable}}
#'  \code{\link[SqlRender]{render}}
#'  \code{\link[secretary]{typewrite_progress}},\code{\link[secretary]{c("typewrite", "typewrite")}},\code{\link[secretary]{character(0)}}
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_table}}
#'  \code{\link[purrr]{keep}}
#'  \code{\link[dplyr]{bind}},\code{\link[dplyr]{mutate}},\code{\link[dplyr]{select_all}}
#'  \code{\link[rubix]{format_colnames}}
#' @rdname get_ncit_synonym
#' @export
#' @importFrom pg13 query appendTable
#' @importFrom SqlRender render
#' @importFrom secretary typewrite_progress typewrite cyanTxt
#' @importFrom rvest html_nodes html_table
#' @importFrom purrr keep
#' @importFrom dplyr bind_rows mutate rename_all transmute
#' @importFrom rubix format_colnames

get_ncit_synonym <-
        function(conn,
                 sleep_time = 5,
                 expiration_days = 30,
                 verbose = TRUE,
                 render_sql = TRUE) {

                        ncit_synonym_table <-
                                pg13::query(conn = conn,
                                            sql_statement =
                                                    SqlRender::render("
                                        SELECT DISTINCT
                                                dln.*
                                        FROM cancergov.drug_link_ncit dln
                                        LEFT JOIN cancergov.ncit_synonym ns
                                        ON dln.ncit_code  = ns.ncit_code
                                        WHERE ns_datetime IS NULL
                                                OR DATE_PART('day', LOCALTIMESTAMP(0)-ns.ns_datetime)::integer >= @expiration_days",
                                            expiration_days = expiration_days),
                                            verbose = verbose,
                                            render_sql = render_sql
                                        )



                ncit_codes <- unique(ncit_synonym_table$ncit_code)


                for (i in 1:length(ncit_codes)) {

                        ncit_code <- ncit_codes[i]
                        ncit_code_url <- sprintf("https://ncithesaurus.nci.nih.gov/ncitbrowser/pages/concept_details.jsf?dictionary=NCI_Thesaurus&code=%s&ns=ncit&type=synonym&key=null&b=1&n=0&vse=null#", ncit_code)

                        if (verbose) {

                                secretary::typewrite_progress(iteration = i,
                                                              total = length(ncit_codes))
                                secretary::typewrite(secretary::cyanTxt("NCIt Code:"), ncit_code)
                        }

                        response <- scrape(ncit_code_url)


                        if (!is.null(response)) {

                                output <-
                                response %>%
                                        rvest::html_nodes("table") %>%
                                        rvest::html_table(fill = TRUE) %>%
                                        purrr::keep(function(x) "Term" %in% colnames(x)) %>%
                                        dplyr::bind_rows() %>%
                                        dplyr::mutate(ncit_code = ncit_code) %>%
                                        dplyr::mutate(ncit_code_url = ncit_code_url) %>%
                                        rubix::format_colnames() %>%
                                        dplyr::rename_all(tolower) %>%
                                        dplyr::transmute(ns_datetime = Sys.time(),
                                                         ncit_code,
                                                         ncit_code_url,
                                                         term,
                                                         source,
                                                         tty = type,
                                                         code)

                                pg13::appendTable(conn = conn,
                                                 schema = "cancergov",
                                                 tableName = "ncit_synonym",
                                                 data = output)

                        }

                }

        }


#' @title
#' Update the Cancergov Drugs Table
#'
#' @description
#' The Synonyms found in the NCI Drug Dictionary and NCI Thesaurus are aggregated into a single Cancergov Drug Table.
#'
#' @inheritParams cg_run
#'
#' @seealso
#'  \code{\link[pg13]{query}},\code{\link[pg13]{send}}
#'  \code{\link[tidyr]{pivot_longer}}
#'  \code{\link[dplyr]{mutate}},\code{\link[dplyr]{coalesce}},\code{\link[dplyr]{select}},\code{\link[dplyr]{distinct}},\code{\link[dplyr]{group_by}},\code{\link[dplyr]{arrange}},\code{\link[dplyr]{slice}},\code{\link[dplyr]{reexports}}
#'  \code{\link[forcats]{fct_recode}}
#' @rdname update_cancergov_drugs
#' @export
#' @importFrom pg13 query send
#' @importFrom tidyr pivot_longer
#' @importFrom dplyr mutate coalesce select distinct group_by arrange slice ungroup everything
#' @importFrom forcats fct_recode

update_cancergov_drugs <-
        function(conn,
                 verbose = TRUE,
                 render_sql = TRUE) {

                cancergov_drugs <-
                        pg13::query(
                                conn = conn,
                                sql_statement =
                                "
                                SELECT DISTINCT dl.drug_link, dln.ncit_code, dl.drug, dls.drug_synonym_type, dls.drug_synonym, ns.term as ncit_drug
                                FROM cancergov.drug_link dl
                                LEFT JOIN cancergov.drug_link_synonym dls
                                ON dls.drug_link = dl.drug_link
                                LEFT JOIN cancergov.drug_link_ncit dln
                                ON dln.drug_link = dl.drug_link
                                LEFT JOIN cancergov.ncit_synonym ns
                                ON dln.ncit_code = ns.ncit_code
                                ",
                                verbose = verbose,
                                render_sql = render_sql)


                cancergov_drugs2 <-
                        cancergov_drugs %>%
                        tidyr::pivot_longer(cols = c(drug, drug_synonym, ncit_drug),
                                            names_to = "drug_name_type",
                                            values_to = "cancergov_drug",
                                            values_drop_na = TRUE) %>%
                        dplyr::mutate(drug_name_type = factor(drug_name_type)) %>%
                        dplyr::mutate(drug_name_type = forcats::fct_recode(drug_name_type,
                                                                           `Label:` = "drug",
                                                                           `NCIt Term:` = "ncit_drug")) %>%
                        dplyr::mutate(drug_name_type = as.character(drug_name_type)) %>%
                        dplyr::mutate(drug_synonym_type = dplyr::coalesce(drug_synonym_type, drug_name_type)) %>%
                        dplyr::select(-drug_name_type) %>%
                        dplyr::distinct() %>%
                        dplyr::mutate(drug_synonym_type = factor(drug_synonym_type,
                                                                 levels = c("NCIt Term:",
                                                                            "Label:",
                                                                            "Synonym:",
                                                                            "Code name:",
                                                                            "Abbreviation:",
                                                                            "Acronym:",
                                                                            "US brand name:",
                                                                            "Chemical structure:",
                                                                            "Foreign brand name:"))) %>%
                        dplyr::group_by(drug_link, ncit_code, cancergov_drug) %>%
                        dplyr::arrange(desc(drug_synonym_type), .by_group = TRUE) %>%
                        dplyr::slice(1) %>%
                        dplyr::ungroup() %>%
                        dplyr::mutate(drug_synonym_type = as.character(drug_synonym_type)) %>%
                        dplyr::mutate(cd_datetime = Sys.time()) %>%
                        dplyr::select(cd_datetime,
                                      dplyr::everything())



                write_cg_staging_tbl(conn = conn,
                                     tableName = "chron_m01_cancergov_drugs2",
                                     data = cancergov_drugs2,
                                     verbose = verbose,
                                     render_sql = render_sql)


                pg13::send(conn = conn,
                           sql_statement =
                                   "
                                        WITH new_drugs AS (
                                        SELECT new.*
                                        FROM cancergov.chron_m01_cancergov_drugs2 new
                                        LEFT JOIN cancergov.cancergov_drug current
                                        ON current.drug_link = new.drug_link
                                                AND current.ncit_code = new.ncit_code
                                                AND current.drug_synonym_type = new.drug_synonym_type
                                                AND current.cancergov_drug = new.cancergov_drug
                                        WHERE current.cd_datetime IS NULL
                                        )

                                        INSERT INTO cancergov.cancergov_drug SELECT * FROM new_drugs
                                        ",
                           verbose = verbose,
                           render_sql = render_sql
                )

        }
meerapatelmd/skyscraper documentation built on Dec. 27, 2020, 7:46 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
meerapatelmd/skyscraper
Scrape Clinical Drug Data

R/cg-internals.R
In meerapatelmd/skyscraper: Scrape Clinical Drug Data

Defines functions get_drug_link_synonym get_dictionary_and_links log_drug_count drug_count

Documented in drug_count get_dictionary_and_links get_drug_link_synonym log_drug_count

R Package Documentation

Browse R Packages

We want your feedback!

meerapatelmd/skyscraper Scrape Clinical Drug Data

R/cg-internals.R In meerapatelmd/skyscraper: Scrape Clinical Drug Data

Defines functions get_drug_link_synonym get_dictionary_and_links log_drug_count drug_count

Documented in drug_count get_dictionary_and_links get_drug_link_synonym log_drug_count

R Package Documentation

Browse R Packages

We want your feedback!

meerapatelmd/skyscraper
Scrape Clinical Drug Data

R/cg-internals.R
In meerapatelmd/skyscraper: Scrape Clinical Drug Data