# R/get_links.R

#' Return all links on a species' ECOS page
#'
#' Return a data.frame with links (and anchor text) from a species' ECOS page.
#' 
#' @note Either \code{url} or \code{page} must be specified.
#' 
#' @param url The URL of a species' page on ECOS or path to the HTML file
#' @param clean Remove useless links (e.g., www.usa.gov) [default = TRUE]
#' @param pause 0.5-3 second pause to be nice to the server [default = TRUE]
#' @param verbose Message the species being processed [default = TRUE]
#' @return A data.frame with variables:
#' \describe{
#'   \item{Scientific_Name}{The species' scientific name, as given by ECOS}
#'   \item{href}{The link as given on the ECOS page, if available}
#'   \item{link}{The link, in absolute form, if available}
#'   \item{text}{The anchor text of the <a> tag, if available}
#' }
#' @seealso \link{remove_silly_links} \link{get_bulk_species_links}
#' @export
#' @examples
#' \dontrun{
#'   res <- get_species_url("Gila purpurea") %>% get_species_links()
#'   head(res)
#' }
get_species_links <- function(url, clean = TRUE, pause = TRUE, verbose = TRUE) {
  # TECP_table is referenced in both branches below, so ensure the package
  # data is loaded before anything else (was previously called too late).
  check_load()
  if(grepl(url, pattern = "^http|^www")) {
    # Remote URL: look the species up first so `species` is defined before
    # it is used in the progress message (was previously used undefined).
    record <- filter(TECP_table, Species_Page == url)
    species <- unique(record$Scientific_Name)
    if(verbose) message(paste("Fetching page for", species))
    if(pause) Sys.sleep(runif(1, 0, 3))
    page <- get_species_page(url)
  } else {
    # Local HTML file: the species code is the prefix of the file name,
    # e.g. "A001_profile.html" -> "A001".
    page <- xml2::read_html(url)
    sp_code <- strsplit(basename(url), split = "_")[[1]][1]
    record <- filter(TECP_table, Species_Code == sp_code)
    species <- unique(record$Scientific_Name)
  }
  a_nodes <- try(html_nodes(page, "a"))
  # Return (not just construct) the placeholder result on failure;
  # inherits() is the robust way to detect a try() failure.
  if(inherits(a_nodes, "try-error")) return(err_res("No <a> nodes for", species))
  hrefs <- html_attr(a_nodes, "href")
  base_ln <- dirname(url)
  # Expand relative hrefs to absolute links, anchored at the page's directory
  full_ln <- simplify2array(lapply(hrefs, FUN = fill_link, base_ln))
  texts <- html_text(a_nodes)
  res <- data_frame(Scientific_Name = rep(species, length(hrefs)),
                    href = hrefs,
                    link = full_ln,
                    text = str_trim(texts))
  if(clean) res <- remove_silly_links(res)
  return(res)
}

# Warning with a data.frame return for bad get_species_links
# Emit a warning and hand back a placeholder data.frame so callers of
# get_species_links() always receive a frame with the expected columns,
# even when scraping a page fails.
#
# @param e A short description of what went wrong
# @param species The scientific name(s) the failure applies to
# @return A data_frame with href/link/text set to "No page"
err_res <- function(e, species) {
  warning(paste("Warning:", e, species))
  placeholder <- data_frame(Scientific_Name = species,
                            href = "No page",
                            link = "No page",
                            text = "No page")
  placeholder
}

# Fill relative links with appropriate prefix
# 
# @param x A link (href) string; may be relative or absolute
# @param base_ln A string indicating the base URL of current page
# @return The complete link, as available.
# Convert an href into an absolute link where possible.
#
# @param x A single href string; may be NA, absolute, root-relative ("/..."),
#   a page anchor ("#..."), or some other relative form (returned untouched)
# @param base_ln The base URL of the current page (used for "#" anchors)
# @return The completed link, NA for missing hrefs, or x unchanged
fill_link <- function(x, base_ln) {
  ecos_root <- "http://ecos.fws.gov"
  if (is.na(x)) {
    return(NA)
  }
  if (grepl("^http", x)) {
    # Already absolute
    return(x)
  }
  if (grepl("^/", x)) {
    # Root-relative: anchor at the ECOS domain
    return(paste0(ecos_root, x))
  }
  if (grepl("^#", x)) {
    # In-page anchor: append to the current page's base
    return(paste0(base_ln, x))
  }
  # Anything else (e.g. plain relative paths) is passed through as-is
  x
}

# Check if the requested page is an ECOS profile page
#
# ECOS will return a page rather than 404 if the species code is wrong. This
# checks that the page is not the "No species profile" page.
# 
# @param page An rvest read_html page
# @return logical; TRUE if a species profile, FALSE if "No species profile"
# @importFrom rvest html_text html_node
# @examples
# get_species_url("Gila purpurea") %>%
#   get_species_page() %>%
#   is_species_profile()
is_species_profile <- function(page) {
  # Extracting text can fail on malformed pages; treat failure as
  # "not a profile" rather than erroring out.
  text <- try(rvest::html_text(page))
  # inherits() is the correct test for try() failures; class(x) == "..."
  # is fragile because class() may return a vector of length > 1.
  if(inherits(text, "try-error")) {
    return(FALSE)
  } else if(grepl(text, pattern = "No species profile", ignore.case = TRUE)) {
    # ECOS serves this sentinel page (HTTP 200) for unknown species codes
    return(FALSE)
  }
  return(TRUE)
}

#' Get the links for ESA five-year review documents from ECOS species' pages
#'
#' @param df A data.frame of species' links.
#' @return df, filtered for only five-year review links
#' @importFrom dplyr filter
#' @export
#' @examples
#' \dontrun{
#'   five_yr <- get_species_url("Gila purpurea") %>% 
#'                get_species_links() %>%
#'                get_5yrev_links()
#' }
get_5yrev_links <- function(df) {
  # Use the bare column name (NSE), consistent with get_recovery_links()
  # and get_fedreg_links(); df$href misbehaves with grouped/sliced input.
  res <- dplyr::filter(df, grepl(href, pattern = "five_year_review"))
  return(res)
}

#' Get the links for recovery plans on ECOS species' pages
#'
#' @param df A data.frame of species' links.
#' @return df, filtered for only recovery plan links
#' @importFrom dplyr filter
#' @export
#' @examples
#'   recovery <- get_species_url("Gila purpurea") %>% 
#'                 get_species_links() %>%
#'                 get_recovery_links()
# Keep only the rows of a species-links data.frame whose href points at a
# recovery plan document.
#
# @param df A data.frame of species' links (see get_species_links)
# @return df restricted to recovery-plan rows
get_recovery_links <- function(df) {
  recovery_rows <- dplyr::filter(df, grepl(href, pattern = "recovery_plan"))
  recovery_rows
}

#' Get the links for Federal Register documents on ECOS species' pages
#'
#' @param df A data.frame of links from \link{get_species_links}
#' @return df, filtered for only Fed. Reg. links
#' @importFrom dplyr filter
#' @export
#' @examples
#'   fed_reg <- get_species_url("Gila purpurea") %>% 
#'                get_species_links() %>%
#'                get_fedreg_links()
# Keep only the rows of a species-links data.frame that point at Federal
# Register documents (either ECOS-hosted or GPO-hosted).
#
# @param df A data.frame of links from get_species_links
# @return df restricted to Federal Register rows
get_fedreg_links <- function(df) {
  fedreg_rows <- dplyr::filter(df, grepl(href, pattern = "federal_register|gpo"))
  fedreg_rows
}

#' Get the links to conservation plan pages on ECOS species' pages
#'
#' @param df A data.frame of species' links.
#' @return df, filtered for links to conservation plans (SHA, HCP, CCAA)
#' @importFrom dplyr filter
#' @export
#' @examples
#' \dontrun{
#'   get_species_url("Gila purpurea") %>% 
#'     get_species_links() %>% 
#'     get_cons_plan_links()
#' }
get_cons_plan_links <- function(df) {
  # Namespace the call (dplyr::filter) for consistency with the sibling
  # link-filtering functions and to avoid clashing with stats::filter.
  res <- dplyr::filter(df, grepl(href, pattern = "conservationPlan"))
  return(res)
}

#' Get a listing of link suffixes for HCPs, SHA, and CCA/As
#'
#' @note Does not use the ECOS conservation plan page because we know that many 
#' plans linked on species' ECOS pages do not appear in the conservation plan 
#' portal.
#' 
#' @param url The species' ECOS page URL to scrape *OR* path to HTML of page
#' @param type The type of conservation agreement to search for; one of
#'   \itemize{
#'     \item{HCP}
#'     \item{SHA}
#'     \item{CCA}
#'     \item{CCAA}
#'   }
#' @param verbose Print messages while processing [default = TRUE]
#' @return A data.frame with plan type, plan name, species, and link to the plan
#' @export
#' @examples
#' \dontrun{
#' agmt <- get_species_url("Gila purpurea") %>%
#'           get_agmt_type_links(type = "HCP")
#' }
get_agmt_type_links <- function(url, type, verbose = TRUE) {
  check_load()
  # Resolve the species name from either a live ECOS URL or a local file
  # whose name is prefixed with the species code (e.g. "A001_...").
  if(grepl(url, pattern = "^http|^www")) {
    species <- unique(filter(TECP_table,
                      Species_Page == url)$Scientific_Name)
  } else {
    sp_code <- strsplit(basename(url), split = "_")[[1]][1]
    species <- unique(filter(TECP_table,
                      Species_Code == sp_code)$Scientific_Name)
  }
  tabs <- get_species_tables(url, verbose = FALSE)
  # Map the agreement type to its summary-table name; switch() replaces
  # the previous if/else ladder and yields NULL for unknown types.
  tab_name <- switch(type,
                     HCP = "HCP Plan Summaries",
                     SHA = "SHA Plan Summaries",
                     CCA = "CCA Plan Summaries",
                     CCAA = "CCAA Plan Summaries",
                     NULL)
  if(is.null(tab_name)) {
    message("Please specify a type of HCP, SHA, CCA, or CCAA")
    return(NULL)
  }
  cur_tab <- tabs[[tab_name]]
  if(is.null(cur_tab)) {
    message(sprintf("No plans of type %s for %s", type, species))
    return(NULL)
  }
  link <- get_species_links(url, verbose = verbose)
  # Join the plan-summary table to the page's links on anchor text
  names(cur_tab)[1] <- "text"
  join <- left_join(cur_tab, link, by = "text")
  join$Type <- rep(type, nrow(join))
  # select_() is defunct in modern dplyr; use select() with rename syntax
  res <- dplyr::select(join, Type, Plan = text, Scientific_Name,
                       Link = link)
  return(res)
}
# jacob-ogre/ecosscraper documentation built on May 18, 2019, 8 a.m.