repoRter_nih.R
In repoRter.nih: R Interface to the 'NIH RePORTER Project' API

## ---- include = FALSE------------------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  width = 95,
  fig.dpi = 96
)

opts.old <- options()
options(width = 95, cli.unicode = FALSE, cli.width = 95)

## confirm RePORTER is currently available before building this, for CRAN
r <- httr::POST("https://api.reporter.nih.gov/v2/projects/search",
           httr::accept("application/json"),
           httr::content_type_json(),
           body = "{\"criteria\":{\"foa\":[\"RFA-NS-19-036\"]},\"include_fields\":[\"ApplId\"],\"offset\":0,\"limit\":10,\"sort_field\":\"appl_id\",\"sort_order\":\"desc\"}",
           encode = "raw"
           )
if (r$status_code != 200) knitr::knit_exit()

## ---- echo = FALSE, message = FALSE, fig.width=7, fig.height=4, fig.align='center'-----------
knitr::include_graphics("covid_plot.png")

## ---- eval = FALSE---------------------------------------------------------------------------
#  install.packages("repoRter.nih")

## ---- eval = FALSE---------------------------------------------------------------------------
#  devtools::install_github('bikeactuary/repoRter.nih@dev')

## ---- message = FALSE------------------------------------------------------------------------
library(tibble)
library(repoRter.nih)
library(ggplot2)
library(ggrepel)
library(dplyr)
library(scales)
library(tufte)

## --------------------------------------------------------------------------------------------
# all projects funded by the Paycheck Protection Act, Coronavirus Response and
# Relief Act, and American Rescue Plan, in fiscal year 2021
req <- make_req(criteria =
                  list(fiscal_years = 2021,
                       covid_response = c("C4", "C5", "C6")))

## --------------------------------------------------------------------------------------------
res <- get_nih_data(req)
class(res)

## --------------------------------------------------------------------------------------------
res %>% glimpse(width = getOption("cli.width"))

## --------------------------------------------------------------------------------------------
data("nih_fields")
nih_fields %>% print

## --------------------------------------------------------------------------------------------
req <- make_req()

## --------------------------------------------------------------------------------------------
data("nih_fields")
fields <- nih_fields %>%
  filter(response_name %in% 
           c("appl_id", "subproject_id", "project_title", "fiscal_year",
             "award_amount", "is_active", "project_start_date")) %>%
  pull(include_name)

req <- make_req(include_fields = fields,
                limit = 500,
                message = FALSE) # default
res <- get_nih_data(query = req,
                    max_pages = 1)

res %>% glimpse(width = getOption("cli.width"))

## --------------------------------------------------------------------------------------------
req <- make_req(criteria = 
                  list(
                    fiscal_years = 2010:2011,
                    include_active_projects = TRUE,
                    org_names = c("Yale", "New Haven")
                  ),
                include_fields = c("Organization", "FiscalYear", "AwardAmount"),
                message = FALSE)

## --------------------------------------------------------------------------------------------
res <- get_nih_data(req, max_pages = 1)
res %>% glimpse(width = getOption("cli.width"))

## --------------------------------------------------------------------------------------------
res <- get_nih_data(req,
                    max_pages = 1,
                    flatten_result = TRUE)

res %>% glimpse(width = getOption("cli.width"))

## ---- echo = FALSE---------------------------------------------------------------------------
evl <- class(res)[1] == "tbl_df"

## ---- eval=evl-------------------------------------------------------------------------------
res %>% 
  group_by(organization_org_name) %>%
  summarise(project_count = n())

## --------------------------------------------------------------------------------------------
## A valid request but probably not what we want
req <- make_req(criteria = 
                  list(
                    fiscal_years = 2010:2011,
                    include_active_projects = TRUE,
                    org_cities = "New Haven",
                    org_states = "WY"
                  ),
                include_fields = c("Organization", "FiscalYear", "AwardAmount"),
                message = FALSE ## suppress printed message
)

res <- get_nih_data(req,
                    max_pages = 5,
                    flatten_result = TRUE)

## --------------------------------------------------------------------------------------------
req <- make_req(criteria =
                  list(
                    fiscal_years = 2015,
                    include_active_projects = TRUE,
                    org_states = "WY"
                  ),
                include_fields = c("ApplId", "Organization", "FiscalYear", "AwardAmount"),
                sort_field = "AwardAmount",
                sort_order = "desc",
                message = FALSE)

res <- get_nih_data(req,
                    flatten_result = TRUE)

res %>% glimpse(width = getOption("cli.width"))

## --------------------------------------------------------------------------------------------
## all projects funded by the Paycheck Protection Act, Coronavirus Response and Relief Act,
## and American Rescue Plan, over all years
req <- make_req(criteria =
                  list(covid_response = c("All")),
                include_fields = nih_fields %>%
                  filter(payload_name %in% c("award_amount_range", "covid_response"))
                %>% pull(include_name))

res <- get_nih_data(req, max_pages = 1)

## ---- echo=FALSE-----------------------------------------------------------------------------
evl <- class(res)[1] == "tbl_df"

## ---- eval=evl-------------------------------------------------------------------------------
res$covid_response %>% class()
res$covid_response[[1]]

## ----covid, eval=FALSE-----------------------------------------------------------------------
#  ## all projects funded by the Paycheck Protection Act, Coronavirus Response and Relief Act,
#  ## and American Rescue Plan, in fiscal year 2021
#  req <- make_req(criteria =
#                    list(covid_response = c("All")),
#                  message = FALSE)
#  
#  res <- get_nih_data(req,
#                      flatten_result = TRUE)

## ---- eval=FALSE-----------------------------------------------------------------------------
#  unique(res$covid_response)

## ---- echo=FALSE-----------------------------------------------------------------------------
res <- readRDS("cov_res.RDS")
unique(res$covid_response)

## ---- eval=FALSE-----------------------------------------------------------------------------
#  library(ggplot2)
#  
#  res %>%
#    left_join(covid_response_codes, by = "covid_response") %>%
#    mutate(covid_code_desc = case_when(!is.na(fund_src) ~ paste0(covid_response, ": ", fund_src),
#                                       TRUE ~ paste0(covid_response, " (Multiple)"))) %>%
#    group_by(covid_code_desc) %>%
#    summarise(total_awards = sum(award_amount) / 1e6) %>%
#    ungroup() %>%
#    arrange(desc(covid_code_desc)) %>%
#    mutate(prop = total_awards / sum(total_awards),
#           csum = cumsum(prop),
#           ypos = csum - prop/2 ) %>%
#    ggplot(aes(x = "", y = prop, fill = covid_code_desc)) +
#    geom_bar(stat="identity") +
#    geom_text_repel(aes(label =
#                          paste0(dollar(total_awards,
#                                        accuracy = 1,
#                                        suffix = "M"),
#                                 "\n", percent(prop, accuracy = .01)),
#                        y = ypos),
#                    show.legend = FALSE,
#                    nudge_x = .8,
#                    size = 3, color = "grey25") +
#    coord_polar(theta ="y") +
#    theme_void() +
#    theme(legend.position = "right",
#          legend.title = element_text(colour = "grey25"),
#          legend.text = element_text(colour="blue", size=6,
#                                     face="bold"),
#          plot.title = element_text(color = "grey25"),
#          plot.caption = element_text(size = 6)) +
#    labs(caption = "Data Source: NIH RePORTER API v2") +
#    ggtitle("Legislative Source for NIH Covid Response Project Funding")

## ---- echo = FALSE, message = FALSE, fig.width=7, fig.height=4, fig.align='center'-----------
knitr::include_graphics("covid_plot.png")

## --------------------------------------------------------------------------------------------
data("covid_response_codes")
covid_response_codes %>% print

## --------------------------------------------------------------------------------------------
## projects funded in 2021 where the principal investigator first name
##  is "Michael" or begins with "Jo"
req <- make_req(criteria = 
                  list(fiscal_years = 2021,
                       pi_names = list(first_name = c("Michael", "Jo*"),
                                       last_name = c(""), # must specify all pi_names elements always
                                       any_name = character(1))),
                include_fields = nih_fields %>%
                  filter(payload_name == "pi_names") %>%
                  pull(include_name),
                message = FALSE)

res <- get_nih_data(req,
                    max_pages = 1,
                    flatten_result = TRUE)

res %>% glimpse(width = getOption("cli.width"))

## --------------------------------------------------------------------------------------------
## using advanced_text_search with boolean search string
req <- make_req(criteria = 
                  list(advanced_text_search =
                         list(operator = "advanced",
                              search_field = c("terms", "abstract"),
                              search_text = "(head AND trauma) OR \"brain damage\" AND NOT \"psychological\"")),
                include_fields = c("ProjectTitle", "AbstractText", "Terms") )

res <- get_nih_data(req, max_pages = 1)

## ---- echo = FALSE---------------------------------------------------------------------------
evl <- class(res)[1] == "tbl_df"

## ---- eval=evl-------------------------------------------------------------------------------
one_rec <- res %>%
  slice(42) %>%
  mutate(abstract_text = gsub("[\r\n]", " ", abstract_text))

one_rec %>% pull(project_title) %>% print

## ---- eval=evl-------------------------------------------------------------------------------
## substr to avoid LaTeX error exceeding char limit
one_rec %>% pull(abstract_text) %>% substr(1, 85) %>% print

## ---- eval=evl-------------------------------------------------------------------------------
one_rec %>% pull(terms) %>% substr(1, 85) %>% print

## ---- eval = FALSE---------------------------------------------------------------------------
#  all_res <- list()
#  for(y in 2017:2021) { ## five years to loop over, each year is ~80K records
#    ## We only need the AwardAmount for quantiles
#    req_sample <- make_req(criteria = list(fiscal_years = y),
#                           include_fields = "AwardAmount")
#  
#    ## get a sample of the result set - 1000 records should be enough
#    ## return the metadata
#    res_sample <- get_nih_data(req_sample, max_pages = 2, return_meta = TRUE)
#  
#    paste0("There are ", res_sample$meta$total, " results for fiscal year ", y) %>%
#      print()
#  
#    ## deciles of award amount - each decile should contain ~7,314.2 records, approximately
#    qtiles <- res_sample$records %>% pull(award_amount) %>% quantile(na.rm = TRUE, probs = seq(.1, 1, .1))
#  
#    ## list for qtile results (full year)
#    this_res <- list()
#    ## for each qtile
#    for (i in 1:length(qtiles)) {
#      if (i == 1) {
#        award_min <- 0
#      } else {
#        award_min <- ceiling(qtiles[i-1])+.01
#      }
#      if (i == length(qtiles)) {
#        award_max <- 1e9 ## arbitrarily huge
#      } else {
#        award_max <- ceiling(qtiles[i])
#      }
#      req <- make_req(criteria = list(fiscal_years = y,
#                                      award_amount_range = list(min_amount = award_min,
#                                                                max_amount = award_max)))
#      ## result set for quantile
#      this_res[[i]] <- get_nih_data(req, flatten_result = FALSE)
#    }
#    ## list of result sets for each year
#    yr_res[[y %>% as.character()]] <- this_res
#  }
#  
#  ## shape it up
#  all_res <- unlist(yr_res, recursive = FALSE) %>%
#    bind_rows() %>%
#    flatten(recursive = FALSE) %>%
#    clean_names()
#  
#  ## pull out everything that is flat
#  flat_columns <- all_res %>%
#    select_if(is.atomic)
#  
#  ## everything that isn't
#  annoying_columns <- all_res %>%
#    select_if(!is.atomic)

## --------------------------------------------------------------------------------------------
sessionInfo()

## ---- include=FALSE-----------------------------------------------------------
options(opts.old)