skyscraper: Scrape Clinical Drug Data

Documented in isMultipleHits

#' @title
#' Get the RNs from a page listing the first 5 matches
#'
#' @inherit chemidplus_parsing_functions description
#'
#' @inheritSection chemidplus_parsing_functions Multiple Hits
#'
#' @inheritParams chemidplus_parsing_functions
#'
#' @details
#' The text of every \code{.bodytext} node is split into a compound name and
#' a registry number (RN) using the names found in the \code{.chem-name}
#' nodes. Two regular expressions are tried and their results combined. If
#' the first regular-expression pass raises an error, the names are instead
#' paired with the \code{.bodytext} rows by position and the RN is taken as
#' the text that follows the name.
#'
#' @return
#' A tibble with the columns \code{compound_match}, \code{rn} and
#' \code{rn_url}. The tibble may have zero rows when nothing could be parsed.
#'
#' @seealso
#'  \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_text}}
#'  \code{\link[tibble]{as_tibble}}
#'  \code{\link[rubix]{filter_at_grepl}}
#'  \code{\link[tidyr]{extract}}
#'  \code{\link[dplyr]{mutate}},\code{\link[dplyr]{mutate_all}}
#'  \code{\link[stringr]{str_remove}}
#'
#' @rdname isMultipleHits
#'
#' @family chemidplus parsing
#'
#' @export
#'
#' @importFrom rvest html_nodes html_text
#' @importFrom magrittr %>%
#' @importFrom police try_catch_error_as_null
#' @importFrom tibble as_tibble_col as_tibble tribble
#' @importFrom rubix filter_at_grepl rm_multibyte_chars normalize_all_to_na
#' @importFrom tidyr extract
#' @importFrom dplyr mutate mutate_all filter_at distinct transmute bind_rows vars all_vars
#' @importFrom stringr str_remove_all

isMultipleHits <-
        function(response) {

                rn_url_prefix <- "https://chem.nlm.nih.gov/chemidplus/rn/"

                # Raw text of every candidate hit on the page.
                body_text <-
                        response %>%
                        rvest::html_nodes(".bodytext") %>%
                        rvest::html_text()

                # Compound names as displayed on the page; used both as a regex
                # alternation and, in the fallback, as a positional column.
                chem_name_vector <-
                        response %>%
                        rvest::html_nodes(".chem-name") %>%
                        rvest::html_text()

                chem_names <- paste(chem_name_vector, collapse = "|")

                # One row per ".bodytext" entry, filtered on the "MW: " pattern
                # (with evaluates_to = FALSE this presumably drops the rows that
                # match -- confirm against rubix::filter_at_grepl).
                # Built once and shared by every parsing strategy below.
                candidates <-
                        body_text %>%
                        tibble::as_tibble_col(column_name = "multiple_match") %>%
                        rubix::filter_at_grepl(multiple_match,
                                               grepl_phrase = "MW[:]{1} ",
                                               evaluates_to = FALSE)

                # Split each candidate into compound name + RN, where `tail_regex`
                # describes what follows the name, then strip the "No Structure"
                # marker and drop rows where either part is missing.
                # NOTE(review): the names are inserted into the pattern unescaped,
                # so a name containing regex metacharacters (e.g. parentheses) can
                # alter the pattern; when tidyr::extract() errors as a result, the
                # positional fallback below takes over -- confirm this is the
                # intended role of the fallback.
                extract_matches <-
                        function(tail_regex) {
                                candidates %>%
                                        tidyr::extract(col = multiple_match,
                                                       into = c("compound_match", "rn"),
                                                       regex = paste0("(^", chem_names, ")", tail_regex)) %>%
                                        dplyr::mutate_all(stringr::str_remove_all, "No Structure") %>%
                                        dplyr::filter_at(dplyr::vars(compound_match, rn),
                                                         dplyr::all_vars(!is.na(.)))
                        }

                # Shape the parsed rows into the returned columns. The URL is only
                # built for rows that still have an RN after normalization.
                finalize <-
                        function(data) {
                                data %>%
                                        tibble::as_tibble() %>%
                                        rubix::normalize_all_to_na() %>%
                                        dplyr::transmute(compound_match,
                                                         rn,
                                                         rn_url = ifelse(!is.na(rn),
                                                                         paste0(rn_url_prefix, rn),
                                                                         NA))
                        }

                # Pass 1: "<name> [<anything>]<rn>". Any error is converted to NULL
                # (assumed from the helper's name and the is.null() check below),
                # which selects the positional fallback.
                output_a <-
                        police::try_catch_error_as_null(
                                extract_matches(" \\[.*?\\](.*$)")
                        )

                if (is.null(output_a)) {

                        # Fallback: pair names with candidate rows by position and
                        # take everything after the name as the RN.
                        fallback <-
                                candidates %>%
                                dplyr::mutate(compound_match = chem_name_vector) %>%
                                dplyr::mutate(rn = substr(multiple_match,
                                                          nchar(compound_match) + 1,
                                                          nchar(multiple_match))) %>%
                                dplyr::mutate_all(stringr::str_remove_all, "No Structure") %>%
                                rubix::rm_multibyte_chars() %>%
                                dplyr::filter_at(dplyr::vars(compound_match, rn),
                                                 dplyr::all_vars(!is.na(.))) %>%
                                dplyr::distinct()

                        return(finalize(fallback))
                }

                # Pass 2: "<name><digits-digits-digits...>", i.e. the RN directly
                # follows the name.
                output_b <-
                        extract_matches("([0-9]{1,}[-]{1}[0-9]{1,}[-]{1}[0-9]{1,}.*$)")

                combined <-
                        dplyr::bind_rows(output_a,
                                         output_b) %>%
                        dplyr::distinct()

                if (nrow(combined) > 0) {
                        finalize(combined)
                } else {
                        tibble::tribble(~compound_match, ~rn, ~rn_url)
                }

        }