# dea_scrape.R
# In DOPE: Drug Ontology Parsing Engine

## ---- include = FALSE---------------------------------------------------------
# Chunk defaults for the whole document: chunks are not evaluated
# (eval = FALSE), source and output are collapsed into a single block, and
# output lines are prefixed with "#>".
knitr::opts_chunk$set(eval = FALSE, collapse = TRUE, comment = "#>")

## ----setup, include = FALSE---------------------------------------------------
#  knitr::opts_chunk$set(echo = TRUE)
#  library(DOPE)

## ---- cache=FALSE, echo=FALSE, fig.align="center", fig.cap="Figure 1. Console", out.width=5----
#  knitr::include_graphics("../inst/extdata/console.png", error = FALSE)

## -----------------------------------------------------------------------------
#  library(conflicted)
#  suppressMessages(conflict_prefer("filter", "dplyr"))
#  library(xml2)  # read_html()
#  library(rvest)  # html_nodes(), html_text(), html_attr()
#  library(purrr)  # map_dfr(), map2_dfr()
#  library(stringr)  # str_to_lower(), str_remove_all(), str_split(), str_detect()
#  library(tibble)  # tibble()
#  suppressPackageStartupMessages(library(dplyr))  # %>%, bind_rows()
#  
#  # Scrape one page of the DEA fact-sheet listing.
#  #
#  # pg_num: page index appended to the listing URL's `page=` query parameter.
#  # Returns a tibble with one row per fact-sheet entry matched on the page and
#  #   three columns: class and category (both lower-cased) and fact_path (the
#  #   href of the fact-sheet link).
#  get_drug_factsheets <- function(pg_num){
#    # download and parse the listing page once and reuse it for every selector,
#    #   so all three columns come from the same response
#    listing <- read_html(paste0("https://www.dea.gov/factsheets?field_fact_sheet_category_target_id=All&page=", pg_num))
#    category <- listing %>%
#      html_nodes(".teaser-title--drug_fact_sheet span") %>%
#      html_text() %>%
#      str_to_lower()
#    class <- listing %>%
#      html_nodes(".teaser-category--drug-category") %>%
#      html_text() %>%
#      str_to_lower()
#    # get correct path to factsheet
#    path <- listing %>%
#      html_nodes(".teaser-title--drug_fact_sheet a") %>%
#      html_attr("href")
#    # return a three-column tibble (class, category, fact_path)
#    tibble("class" = class,
#           "category" = category,
#           "fact_path" = path
#           )
#  }
#  
#  # call get_drug_factsheets() for listing pages 0, 1 and 2 and combine the
#  #   per-page tibbles into a single data frame
#  dea_factsheets <- map_dfr(0:2, get_drug_factsheets)
#  

## -----------------------------------------------------------------------------
#  
#  # Collect the brand names mentioned on a single DEA fact sheet.
#  #   drug_path: site-relative path of the fact sheet; it is appended to
#  #     "https://www.dea.gov"
#  #   drug_category: value stored in the `category` column of the result
#  # Returns a tibble with the columns `category` and `brands`.
#  get_brand <- function(drug_path, drug_category){
#    sheet <- read_html(paste0("https://www.dea.gov", drug_path))
#    # text of the div that holds the brand names, with line breaks removed
#    what_text <- html_text(html_nodes(sheet, ".field--what"))
#    what_text <- str_remove_all(what_text, "\n")
#    # break the text into space-separated tokens
#    tokens <- str_split(what_text, " ", simplify = TRUE)
#    # keep only the tokens carrying the registered trademark symbol
#    branded <- tokens[str_detect(tokens, "®")]
#    # strip the commas, pipes and periods attached to the kept tokens
#    drug_brands <- str_remove_all(branded, "[,|.]")
#    tibble("category" = drug_category,
#           "brands" = drug_brands)
#  }
#  
#  # call get_brand() once per row of dea_factsheets (fact-sheet path paired
#  #   with its category) and combine the results into a single data frame
#  dea_brands <- map2_dfr(dea_factsheets$fact_path, dea_factsheets$category, get_brand)

## -----------------------------------------------------------------------------
#  # store both scraped tables via usethis::use_data(); overwrite = TRUE
#  #   replaces any previously saved copy
#  usethis::use_data(dea_factsheets, overwrite = TRUE)
#  usethis::use_data(dea_brands, overwrite = TRUE)