#' @title
#' Scrape PubMed Publications
#'
#' @description
#' A PubMed search is performed for the string given as the \code{search_term}
#' argument, sorted by date so that either the earliest or the latest
#' publications appear on the first results page. The scraped titles,
#' citations and snippets are appended to the \code{PM_EARLIEST} or
#' \code{PM_LATEST} table of the \code{patelm9} schema, and one row describing
#' the search is appended to \code{PM_LOG}. If \code{PM_LOG} already holds a
#' row for the same URL and search type, nothing is scraped or written. This
#' is a useful tool for researching investigational drugs and whether or not
#' they are a code name for an existing drug on the market. Note that there is
#' no \code{expiration_days} argument because the earliest dates cannot ever
#' become obsolete.
#'
#' @param conn Database connection handed to \code{start_pm()},
#'   \code{pg13::query()} and \code{pg13::appendTable()}. Ignored when
#'   \code{conn_fun} is supplied.
#' @param conn_fun Optional character string holding an R expression that
#'   evaluates to a connection. When supplied, it is parsed and evaluated to
#'   create \code{conn}, and that connection is closed with \code{pg13::dc()}
#'   when the function exits.
#' @param earliest If \code{TRUE} (default), search with ascending date sort
#'   and write to \code{PM_EARLIEST}; otherwise search with the default date
#'   sort and write to \code{PM_LATEST}.
#' @param latest Convenience inverse of \code{earliest}. It is only consulted
#'   when \code{earliest} is not supplied; if both are supplied,
#'   \code{earliest} takes precedence.
#' @param search_term A single character string to search PubMed for.
#' @param verbose Passed to \code{start_pm()}, \code{pg13::query()} and
#'   \code{pg13::dc()}.
#' @param render_sql Passed to \code{start_pm()} and \code{pg13::query()}.
#'
#' @return Called for its side effects. Invisibly \code{NULL} when the search
#'   is already logged; otherwise the value of the final
#'   \code{pg13::appendTable()} call.
#'
#' @seealso
#' \code{\link[stringr]{str_remove}}
#' \code{\link[xml2]{read_xml}}
#' \code{\link[rvest]{html_nodes}},\code{\link[rvest]{html_text}}
#' \code{\link[tibble]{tibble}}
#' \code{\link[pg13]{lsSchema}},\code{\link[pg13]{createSchema}},\code{\link[pg13]{lsTables}},\code{\link[pg13]{appendTable}},\code{\link[pg13]{writeTable}}
#' @rdname get_pm
#' @family pubmed functions
#' @export
#' @importFrom stringr str_remove_all
#' @importFrom xml2 read_html
#' @importFrom rvest html_node html_text html_nodes
#' @importFrom tibble tibble
#' @importFrom pg13 lsSchema createSchema lsTables appendTable writeTable
get_pm <-
  function(conn,
           conn_fun,
           earliest = TRUE,
           latest = !earliest,
           search_term,
           verbose = TRUE,
           render_sql = TRUE) {
    # Fail before any connection is opened if the search term is unusable.
    if (missing(search_term) ||
      !is.character(search_term) ||
      length(search_term) != 1L ||
      is.na(search_term)) {
      stop("`search_term` must be a single, non-missing character string.",
        call. = FALSE
      )
    }

    # `latest` used to be ignored entirely, so `get_pm(latest = TRUE)` ran an
    # "earliest" search. Honour it when it is the only direction flag given;
    # an explicitly supplied `earliest` still wins.
    if (missing(earliest) && !missing(latest)) {
      earliest <- !isTRUE(latest)
    }

    if (!missing(conn_fun)) {
      # NOTE(review): `conn_fun` is parsed and evaluated as R code, so it must
      # only ever come from a trusted source.
      conn <- eval(rlang::parse_expr(conn_fun))
      on.exit(pg13::dc(conn = conn, verbose = verbose), add = TRUE)
    }

    start_pm(
      conn = conn,
      verbose = verbose,
      render_sql = render_sql
    )

    processed_search_term <- urltools::url_encode(search_term)
    base_url <- paste0(
      "https://pubmed.ncbi.nlm.nih.gov/?term=",
      processed_search_term
    )

    if (earliest) {
      search_type <- "earliest"
      URL <- paste0(
        base_url,
        "&filter=dates.1900%2F1%2F1-3000%2F12%2F12&sort=date&sort_order=asc"
      )
    } else {
      search_type <- "latest"
      URL <- paste0(base_url, "&sort=date")
    }

    # NOTE(review): `url` and `search_type` are interpolated straight into the
    # SQL text; this presumably relies on the URL encoding to neutralise
    # quotes in the search term -- confirm.
    log_sql <- paste(
      "SELECT DISTINCT *",
      "FROM patelm9.pm_log log",
      "WHERE url IN ('@url')",
      "AND search_type = '@search_type';",
      sep = "\n"
    )
    current_results_log <-
      pg13::query(
        conn = conn,
        sql_statement = SqlRender::render(
          log_sql,
          url = URL,
          search_type = search_type
        ),
        verbose = verbose,
        render_sql = render_sql
      )

    # A logged search is never repeated.
    if (nrow(current_results_log) > 0) {
      return(invisible(NULL))
    }

    pubmed_scrape <- scrape(URL)
    scraped_at <- Sys.time()

    results_count <-
      pubmed_scrape %>%
      rvest::html_node(".results-amount") %>%
      rvest::html_text() %>%
      trimws(which = "both")

    # `%in%` keeps the test FALSE (rather than NA) when the node is absent.
    no_results <- results_count %in% "No results were found."

    if (no_results) {
      results_count <- 0

      # Placeholder row; typed NAs and column order match the populated case.
      results <-
        tibble::tibble(
          r_datetime = scraped_at,
          search_term = search_term,
          url = URL,
          title = NA_character_,
          snippet = NA_character_,
          citation = NA_character_,
          citation_date_string = NA_character_,
          citation_date = as.Date(NA)
        )
    } else {
      results_count <-
        results_count %>%
        stringr::str_remove_all(pattern = "[^0-9]") %>%
        as.integer()

      pub_snippet <-
        pubmed_scrape %>%
        rvest::html_nodes(".full-view-snippet") %>%
        rvest::html_text(trim = TRUE)
      if (length(pub_snippet) == 0) {
        # A length-one NA is recycled across all titles by tibble().
        pub_snippet <- NA
      }

      # Local assignment: these previously used `<<-` and leaked into the
      # global environment.
      pub_citation <-
        pubmed_scrape %>%
        rvest::html_nodes(".full-journal-citation") %>%
        rvest::html_text(trim = TRUE)

      pub_title <-
        pubmed_scrape %>%
        rvest::html_nodes(".docsum-title") %>%
        rvest::html_text(trim = TRUE)

      results <-
        tibble::tibble(
          r_datetime = scraped_at,
          search_term = search_term,
          url = URL,
          title = pub_title,
          citation = pub_citation,
          snippet = pub_snippet
        ) %>%
        # Pull the first "<year> <month>" fragment out of the citation text.
        tidyr::extract(
          col = citation,
          into = "citation_date_string",
          regex = ".*?([1-2]{1}[0-9]{3}[ ]{1,}[A-Za-z]{3,}).*$",
          remove = FALSE
        ) %>%
        # Citations without a recognisable date are dropped.
        dplyr::filter(!is.na(citation_date_string)) %>%
        dplyr::mutate(
          citation_date = as.Date(
            lubridate::parse_date_time(citation_date_string, orders = "%Y %b")
          )
        ) %>%
        dplyr::select(
          r_datetime,
          search_term,
          url,
          title,
          snippet,
          citation,
          citation_date_string,
          citation_date
        )
    }

    results_log <-
      tibble::tibble(
        rl_datetime = scraped_at,
        search_type = search_type,
        search_term = search_term,
        processed_search_term = processed_search_term,
        url = URL,
        results_count = results_count
      )

    pg13::appendTable(
      conn = conn,
      schema = "patelm9",
      tableName = "PM_LOG",
      data = results_log
    )

    results_table <- if (earliest) "PM_EARLIEST" else "PM_LATEST"
    pg13::appendTable(
      conn = conn,
      schema = "patelm9",
      tableName = results_table,
      data = results
    )
  }
#' @title
#' Get Earliest PubMed Publications
#'
#' @description
#' Convenience wrapper that calls \code{\link{get_pm}} with
#' \code{earliest = TRUE}, so the earliest PubMed publications for the given
#' term are scraped. The results are stored in the PM_EARLIEST Table. See
#' \code{\link{get_pm}} for more details.
#'
#' @return Whatever \code{\link{get_pm}} returns; the function is called for
#'   its side effects.
#'
#' @inheritParams get_pm
#' @rdname get_pm_earliest
#' @family pubmed functions
#' @export
get_pm_earliest <- function(conn,
                            conn_fun,
                            search_term,
                            verbose = TRUE,
                            render_sql = TRUE) {
  # All arguments are forwarded untouched; only the direction is fixed here.
  get_pm(
    earliest = TRUE,
    search_term = search_term,
    conn = conn,
    conn_fun = conn_fun,
    verbose = verbose,
    render_sql = render_sql
  )
}
#' @title
#' Get Latest PubMed Publications
#'
#' @description
#' Convenience wrapper that calls \code{\link{get_pm}} with
#' \code{earliest = FALSE} and \code{latest = TRUE}, so the latest PubMed
#' publications for the given term are scraped. The results are stored in the
#' PM_LATEST Table. See \code{\link{get_pm}} for more details.
#'
#' @return Whatever \code{\link{get_pm}} returns; the function is called for
#'   its side effects.
#'
#' @inheritParams get_pm
#' @rdname get_pm_latest
#' @family pubmed functions
#' @export
get_pm_latest <- function(conn,
                          conn_fun,
                          search_term,
                          verbose = TRUE,
                          render_sql = TRUE) {
  # All arguments are forwarded untouched; only the direction is fixed here.
  get_pm(
    latest = TRUE,
    earliest = FALSE,
    search_term = search_term,
    conn = conn,
    conn_fun = conn_fun,
    verbose = verbose,
    render_sql = render_sql
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.