R/search_ebsco.R

Defines functions get_number_of_hits_ebsco gen_url_ebsco

Documented in gen_url_ebsco get_number_of_hits_ebsco

#' Generates URL with query string for Ebsco API
#'
#' @param searchterm text of the query
#' @param datefrom from date appeared online (default = 1 year ago from today), format YYYY/MM/DD
#' @param dateto to date appeared online (default = today), format YYYY/MM/DD
#' 
#' @importFrom stringr str_replace_all
#' 
#' @return string, a URL to search pubmed for a particular term between two given dates
gen_url_ebsco <-
  function(searchterm,
           datefrom = Sys.Date() - 365,
           dateto = Sys.Date() - 1) {
    query <- searchterm %>%
      # FIXME user ASCII (fix R CMD check warning)
      # stringi::stri_escape_unicode("“Hello World”") %>% 
      # gsub("[(\\u201c)|(\\u201d)]", "", .)
      str_replace_all(., "“", '"') %>%
      str_replace_all(., "”", '"') %>%
      str_squish() %>%
      str_replace_all(., " ", "+") %>%
      paste0("AB+(+", ., "+)+OR+TI+(+", ., "+)")

    # dates need to be replaced with years as this is as granular as it goes!

    query <- paste0(
      "(",
      query,
      ")+AND+(DT+",
      format(datefrom, "%Y%m%d"),
      "-",
      format(dateto, "%Y%m%d"),
      ")"
    )
    # "&(TI+AB+SU+yes)"

    base_url <- "http://eit.ebscohost.com/Services/SearchService.asmx/Search?"
  
    paste0(
      base_url,
      "prof=", Sys.getenv("EBSCO_PROF"),
      "&pwd=", Sys.getenv("EBSCO_PASSWORD"),
      "&authType=profile",
      "&ipprof=",
      "&query=", query,
      "&db=fsr"
    )
  }


#' Get the number of results from querying Ebsco (fsr) db
#' 
#' To get the number of hits and get a faster response time, limit returned
#' results to 1.
#'
#' @param url a string, url generated by gen_url_ebsco
#' 
#' @importFrom xml2 read_xml xml_find_first xml_ns
#' 
#' @return a numerical value of the total number of results found
get_number_of_hits_ebsco <- function(url) {
  paste0(url, "&startrec=1&numrec=1") %>%
  read_xml(.) %>%
    xml_find_first("//d1:Hits", xml_ns(.)) %>%
    xml_text(.) %>%
    as.numeric()
}


#' Ebsco fetch one page (up to 200 refs) from search
#'
#' @param url a string, url generated by gen_url_ebsco
#' @param startrec number starting record number for the result set returned from a search
#' 
#' @importFrom xml2 read_xml xml_find_all
#' 
#' @return tibble with info fields on up to 500 articles
fetch_ebsco <- function(url, startrec) {
  url <- paste0(url, "&startrec=", startrec, "&numrec=200")

  nodenames <- 
    "plink,
    atl,
    ab,
    ui[type=\"doi\"],
    jtl,
    pubtype,
    doctype,
    dt,
    au,
    language"
  
  read_xml(url) %>%
    xml_find_all(".//rec") %>%
    
    xml2tib(nodenames, "ebsco")
}


#' ebsco get page of results
#'
#' @param searchterm text of the query
#' @param datefrom from date appeared online (default = 1 year ago from today), format YYYY/MM/DD
#' @param dateto to date appeared online (default = today), format YYYY/MM/DD
#' 
#' @importFrom dplyr mutate case_when if_else row_number
#' @importFrom purrr map_df
#' @importFrom xml2 read_xml xml_text xml_find_first xml_ns
#' @importFrom tibble add_column
#' 
#' @return a tibble of all results
get_ebsco <- function(searchterm,
                      datefrom = Sys.Date() - 365,
                      dateto = Sys.Date() - 1) {
  url <-
    gen_url_ebsco(searchterm, datefrom, dateto)

  # FIXME optimize maybe always load first 200 results no matter what
  # since it will number of hits with it
  rcount <- get_number_of_hits_ebsco(url)

  if (rcount > 0) {
    pages <- seq(1, rcount, 200)

    cols <- c(
      doi = NA_character_,
      title = NA_character_,
      abstract = NA_character_,
      author = NA_character_,
      # `publication date (yyyy-mm-dd)` = NA_character_,
      # `publication type` = NA_character_,
      dt = NA_character_,
      pubtype = NA_character_,
      journal = NA_character_,
      lang = NA_character_,
      url = NA_character_
    )
    
    results <- map_df(pages, ~ fetch_ebsco(url, .x))
    
    results %>%
      select(
        doi = contains("ui"),
        title = contains("atl"),
        abstract = contains("ab"),
        author = contains("au"),
        # `publication date (yyyy-mm-dd)` = contains("dt"),
        # `publication type` = contains("pubtype"),
        dt = contains("dt"),
        pubtype = contains("pubtype"),
        journal = contains("jtl"),
        lang = contains("Language"),
        url = contains("plink")
      ) %>%
      add_column(!!!cols[!names(cols) %in% names(.)]) %>%
      replace(., . == "NA", "") %>%
      mutate_at(
        "dt",
        ~ substr(.x, 1, nchar(.x) - 3)
      ) %>%
      # handle multiple date formats
      # "2018 Supplementury issue", "Sep2015 Special Issue", "Mar2021", "1/20/2021", "2020"
      mutate(
        date_format_1 = as.Date(dt, "%m/%d/%Y")
      ) %>%
      mutate(
        date_format_2 = if_else(
          grepl(".*[a-zA-Z]{3}[0-9]{4}|.*[a-zA-Z]{3}[0-9]{2}", dt),
          str_extract(dt, "[a-zA-Z]{3}[0-9]{4}|[a-zA-Z]{3}[0-9]{2}"),
          ""
        )
      ) %>%
      mutate_at(
        "date_format_2",
        ~ format_month_abb_year_v(.x)
      ) %>%
      mutate_at(
        "date_format_2",
        ~ as.Date(.x, "%Y-%m-%d")
      ) %>%
      mutate(
        date_format_3 = if_else(
          grepl("\\b[0-9]{4}\\b", dt),
          str_extract(dt, "\\b[0-9]{4}\\b"),
          ""
        )
      ) %>%
      mutate_at(
        "date_format_3",
        ~ format(as.Date(.x, "%Y"), "%Y-01-01")
      ) %>%
      mutate_at(
        "date_format_3",
        ~ as.Date(date_format_3)
      ) %>%
      mutate(
        pdate = case_when(
          !is.na(date_format_1) ~ date_format_1,
          !is.na(date_format_2) ~ date_format_2,
          !is.na(date_format_3) ~ date_format_3
        )    
      ) %>%
      mutate_at(
        "pdate",
        ~ as.character(.x)
      ) %>%
      mutate_at(
        "pdate",
        ~ ifelse(is.na(.x), "1990-01-01", .x)
      ) %>%
      mutate(type = "") %>%
      mutate_at(
        "type",
        ~ if_else(
          grepl("Academic Journal", pubtype),
          "journal article or review",
          .x
        )
      ) %>%
      mutate_at(
        "type",
        ~ if_else(type == "", "other", .x)
      ) %>%
      mutate(
        url = case_when(
          is.na(doi) ~ url,
          TRUE ~ paste0("https://dx.doi.org/", doi)
        )
      ) %>%
      # NOTE display date in author column for test with searchterm <- "gluten AND intolerance"
      # ... seems just display error in R studio
      select(
        doi,
        title,
        abstract,
        author,
        `publication date (yyyy-mm-dd)` = pdate,
        `publication type` = type,
        journal,
        lang,
        url
      ) %>%
      mutate(source = "Ebsco") %>%
      mutate(openaccess = "NA") %>% 
      group_by(doi) %>%
      mutate(id = row_number()) %>%
      group_by(title, abstract) %>%
      mutate(title_id = row_number()) %>%
      ungroup() %>%
      filter(
        (!is.na(doi) & doi != "" & id == 1) |
        (title_id == 1 & doi == "") | 
        (title_id == 1 & is.na(doi))
      ) %>%
      select(-id, -title_id)

  } else {
    tibble(doi = character(0))
  }
}
FoodStandardsAgency/lit-fetch documentation built on June 29, 2024, 12:46 a.m.