R/rsf.R

Defines functions .standardize_rsf get_rsf

Documented in get_rsf

#' Get Russell Sage foundation awards
#' @param keyword Keyword to query, single string
#' @param verbose enable verbose HTTP messages. TRUE/FALSE, default: false
#' @return a data.frame
#' @export
#' @examples \dontrun{rsf <- get_rsf("ethnography")}
get_rsf <- function(keyword, verbose=FALSE) {
  # First we use the general search function to find potential awards
  url <- "https://www.russellsage.org"
  path <- paste0("/search/node/", xml2::url_escape(keyword))

  # Collect all the links, looping through pages as necessary
  links <- c()
  repeat {
    page <- request(paste0(url, path), "get", verbose)
    links <- c(links,
               rvest::html_attr(rvest::html_nodes(page, "li.search-result a"),
                                "href"))

    if (length(links)==0) return(NULL) # No results at all?
    path <- rvest::html_attr(rvest::html_node(page, "li.Pagination__next > a"),
                             "href")
    if (is.na(path)) break # Hit the last page?
  }

  # This is the general site-wide search feature, so pick only the awards
  links <- links[grepl("awarded-project", links)]
  if (length(links)==0) { # No results?
    return(NULL)
  }

  # Getting details require seperate HTTP reqs for each award, ugh
  df <- lapply(links, function(x) {
    award <- request(x, "get", verbose)
    award <- rvest::html_node(award, "div.content > header > div.u-nubbed")
    program <- rvest::html_children(award)[1] %>% rvest::html_text(trim=TRUE)
    program <- gsub("\t\t\t\t", "; ", program) # Separate with ;
    title <- rvest::html_children(award)[2] %>% rvest::html_text(trim=TRUE)

    # Extract data from these nasty unstructured divs
    info <- rvest::html_children(rvest::html_children(award)[3])
    info <- data.frame(
      label=rvest::html_text(rvest::html_nodes(info, "strong:first-child")),
      value=rvest::html_text(rvest::html_nodes(info, "strong + div, br + div")),
      stringsAsFactors = FALSE
    )
    awardee <- strsplit(info$value[info$label=="Awarded Scholars: "], ",")
    if (length(awardee)==0) # Try plan B
      awardee <- strsplit(
        info$value[info$label=="Other External Scholars: "], ",")

    pi_name <- trimws(awardee[[1]][1])
    institution <- trimws(awardee[[1]][2])

    year <- as.integer(
      .substr_right(info$value[info$label=="Project Date: "], 4))

    # Remove $ and , in amounts (i.e. $1,000,000)
    amount <- gsub("^\\$|,", "", info$value[info$label=="Award Amount: "])

    data.frame(institution, pi_name, year, amount, title, program,
               id=paste0("RSF", .text_hash(x))) # Return one df line
  })
  df <- do.call(rbind.data.frame, df) # Bind the df lines

  df$keyword <- keyword
  df
}

.standardize_rsf <- function(keywords, from_date, to_date, verbose) {
  raw <- lapply(keywords, get_rsf, verbose)
  raw <- do.call(rbind.data.frame, raw)
  if (nrow(raw)==0) {
    message("No results from RSF")
    return(NULL)
  }

  # Since the API doesn't have the feature, we'll do date handling here
  year <- NULL # for R CMD check
  raw <- subset(raw, year > format.Date(from_date, "%Y") &
                  year < format.Date(to_date, "%Y"))

  if (nrow(raw)==0) { # Empty now after date subsetting?
    message("No results from RSF")
    return(NULL)
  }

  with(raw, data.frame(
    institution, pi=pi_name, year, start=NA, end=NA, program, amount,
    id, title, abstract=NA, keyword, source="RSF"
  ))
}
PESData/awardFindR documentation built on March 27, 2024, 4:46 a.m.