#' Generates URL with query string for Ebsco API
#'
#' @param searchterm text of the query
#' @param datefrom from date appeared online (default = 1 year ago from today), format YYYY/MM/DD
#' @param dateto to date appeared online (default = today), format YYYY/MM/DD
#'
#' @importFrom stringr str_replace_all
#'
#' @return string, a URL to search pubmed for a particular term between two given dates
gen_url_ebsco <-
function(searchterm,
datefrom = Sys.Date() - 365,
dateto = Sys.Date() - 1) {
query <- searchterm %>%
# FIXME user ASCII (fix R CMD check warning)
# stringi::stri_escape_unicode("“Hello World”") %>%
# gsub("[(\\u201c)|(\\u201d)]", "", .)
str_replace_all(., "“", '"') %>%
str_replace_all(., "”", '"') %>%
str_squish() %>%
str_replace_all(., " ", "+") %>%
paste0("AB+(+", ., "+)+OR+TI+(+", ., "+)")
# dates need to be replaced with years as this is as granular as it goes!
query <- paste0(
"(",
query,
")+AND+(DT+",
format(datefrom, "%Y%m%d"),
"-",
format(dateto, "%Y%m%d"),
")"
)
# "&(TI+AB+SU+yes)"
base_url <- "http://eit.ebscohost.com/Services/SearchService.asmx/Search?"
paste0(
base_url,
"prof=", Sys.getenv("EBSCO_PROF"),
"&pwd=", Sys.getenv("EBSCO_PASSWORD"),
"&authType=profile",
"&ipprof=",
"&query=", query,
"&db=fsr"
)
}
#' Get the number of results from querying Ebsco (fsr) db
#'
#' To get the number of hits and get a faster response time, limit returned
#' results to 1.
#'
#' @param url a string, url generated by gen_url_ebsco
#'
#' @importFrom xml2 read_xml xml_find_first xml_ns
#'
#' @return a numerical value of the total number of results found
get_number_of_hits_ebsco <- function(url) {
paste0(url, "&startrec=1&numrec=1") %>%
read_xml(.) %>%
xml_find_first("//d1:Hits", xml_ns(.)) %>%
xml_text(.) %>%
as.numeric()
}
#' Ebsco fetch one page (up to 200 refs) from search
#'
#' @param url a string, url generated by gen_url_ebsco
#' @param startrec number starting record number for the result set returned from a search
#'
#' @importFrom xml2 read_xml xml_find_all
#'
#' @return tibble with info fields on up to 500 articles
fetch_ebsco <- function(url, startrec) {
url <- paste0(url, "&startrec=", startrec, "&numrec=200")
nodenames <-
"plink,
atl,
ab,
ui[type=\"doi\"],
jtl,
pubtype,
doctype,
dt,
au,
language"
read_xml(url) %>%
xml_find_all(".//rec") %>%
xml2tib(nodenames, "ebsco")
}
#' ebsco get page of results
#'
#' @param searchterm text of the query
#' @param datefrom from date appeared online (default = 1 year ago from today), format YYYY/MM/DD
#' @param dateto to date appeared online (default = today), format YYYY/MM/DD
#'
#' @importFrom dplyr mutate case_when if_else row_number
#' @importFrom purrr map_df
#' @importFrom xml2 read_xml xml_text xml_find_first xml_ns
#' @importFrom tibble add_column
#'
#' @return a tibble of all results
get_ebsco <- function(searchterm,
datefrom = Sys.Date() - 365,
dateto = Sys.Date() - 1) {
url <-
gen_url_ebsco(searchterm, datefrom, dateto)
# FIXME optimize maybe always load first 200 results no matter what
# since it will number of hits with it
rcount <- get_number_of_hits_ebsco(url)
if (rcount > 0) {
pages <- seq(1, rcount, 200)
cols <- c(
doi = NA_character_,
title = NA_character_,
abstract = NA_character_,
author = NA_character_,
# `publication date (yyyy-mm-dd)` = NA_character_,
# `publication type` = NA_character_,
dt = NA_character_,
pubtype = NA_character_,
journal = NA_character_,
lang = NA_character_,
url = NA_character_
)
results <- map_df(pages, ~ fetch_ebsco(url, .x))
results %>%
select(
doi = contains("ui"),
title = contains("atl"),
abstract = contains("ab"),
author = contains("au"),
# `publication date (yyyy-mm-dd)` = contains("dt"),
# `publication type` = contains("pubtype"),
dt = contains("dt"),
pubtype = contains("pubtype"),
journal = contains("jtl"),
lang = contains("Language"),
url = contains("plink")
) %>%
add_column(!!!cols[!names(cols) %in% names(.)]) %>%
replace(., . == "NA", "") %>%
mutate_at(
"dt",
~ substr(.x, 1, nchar(.x) - 3)
) %>%
# handle multiple date formats
# "2018 Supplementury issue", "Sep2015 Special Issue", "Mar2021", "1/20/2021", "2020"
mutate(
date_format_1 = as.Date(dt, "%m/%d/%Y")
) %>%
mutate(
date_format_2 = if_else(
grepl(".*[a-zA-Z]{3}[0-9]{4}|.*[a-zA-Z]{3}[0-9]{2}", dt),
str_extract(dt, "[a-zA-Z]{3}[0-9]{4}|[a-zA-Z]{3}[0-9]{2}"),
""
)
) %>%
mutate_at(
"date_format_2",
~ format_month_abb_year_v(.x)
) %>%
mutate_at(
"date_format_2",
~ as.Date(.x, "%Y-%m-%d")
) %>%
mutate(
date_format_3 = if_else(
grepl("\\b[0-9]{4}\\b", dt),
str_extract(dt, "\\b[0-9]{4}\\b"),
""
)
) %>%
mutate_at(
"date_format_3",
~ format(as.Date(.x, "%Y"), "%Y-01-01")
) %>%
mutate_at(
"date_format_3",
~ as.Date(date_format_3)
) %>%
mutate(
pdate = case_when(
!is.na(date_format_1) ~ date_format_1,
!is.na(date_format_2) ~ date_format_2,
!is.na(date_format_3) ~ date_format_3
)
) %>%
mutate_at(
"pdate",
~ as.character(.x)
) %>%
mutate_at(
"pdate",
~ ifelse(is.na(.x), "1990-01-01", .x)
) %>%
mutate(type = "") %>%
mutate_at(
"type",
~ if_else(
grepl("Academic Journal", pubtype),
"journal article or review",
.x
)
) %>%
mutate_at(
"type",
~ if_else(type == "", "other", .x)
) %>%
mutate(
url = case_when(
is.na(doi) ~ url,
TRUE ~ paste0("https://dx.doi.org/", doi)
)
) %>%
# NOTE display date in author column for test with searchterm <- "gluten AND intolerance"
# ... seems just display error in R studio
select(
doi,
title,
abstract,
author,
`publication date (yyyy-mm-dd)` = pdate,
`publication type` = type,
journal,
lang,
url
) %>%
mutate(source = "Ebsco") %>%
mutate(openaccess = "NA") %>%
group_by(doi) %>%
mutate(id = row_number()) %>%
group_by(title, abstract) %>%
mutate(title_id = row_number()) %>%
ungroup() %>%
filter(
(!is.na(doi) & doi != "" & id == 1) |
(title_id == 1 & doi == "") |
(title_id == 1 & is.na(doi))
) %>%
select(-id, -title_id)
} else {
tibble(doi = character(0))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.