# https://www.nsf.gov/crssprgm/reu/list_result.jsp;jsessionid=DFB0057418103DF5567873842A8A34BC?unitid=5049&d-8001259-s=1&d-8001259-o=1&showItems=15&d-8001259-p=3
.split_column <-
function(data,
id_column = "idAward",
column = "emailInvestigator",
clean = F) {
if (!data %>% hasName(column)) {
glue("Data missing {column}") %>% message()
return(data)
}
is_date <- column %>% str_detect("date")
df <-
data %>%
select(one_of(id_column, column)) %>%
separate_rows(column, sep = "\\|") %>%
mutate_at(column,
str_squish) %>%
distinct()
if (is_date) {
clean <- F
df <- df %>% mutate_at(column, mdy)
}
if (clean) {
df <-
df %>% clean_entity_data(entity_column = column) %>% select(-column)
names(df) <- names(df) %>% str_remove_all("Clean")
}
df <-
df %>%
filter(!is.na(!!sym(column))) %>%
group_by_at(id_column) %>%
summarise(!!sym(column) := !!sym(column) %>% unique() %>% sort() %>% str_c(collapse = " | ")) %>%
ungroup()
data <-
data %>%
select(-one_of(column)) %>%
left_join(df, by = id_column) %>%
select(one_of(names(data)), everything())
data
}
.dictionary_nsf_names <-
function() {
tibble(
nameNSF = c(
"AbstractNarration",
"AwardAmount",
"AwardEffectiveDate",
"AwardExpirationDate",
"AwardID",
"AwardInstrument",
"AwardTitle",
"CityNameInstitution",
"CodeFoaInformation",
"CodeOrganization",
"CodeProgramElement",
"CodeProgramReference",
"CountryNameInstitution",
"nameDirectorateFunding",
"nameDivisionFunding",
"EmailAddressInvestigator",
"FirstNameInvestigator",
"LastNameInvestigator",
"MaxAmdLetterDate",
"MinAmdLetterDate",
"NameFoaInformation",
"NameInstitution",
"PhoneNumberInstitution",
"ProgramOfficer",
"RoleCodeInvestigator",
"StartDateInvestigator",
"StateCodeInstitution",
"StateNameInstitution",
"StreetAddressInstitution",
"TextProgramElement",
"TextProgramReference",
"ZipCodeInstitution",
"AwardTotalIntnAmount",
"EndDateInvestigator",
"ARRAAmount",
"AwardNumber",
"Title",
"NSFOrganization",
"Program(s)",
"StartDate",
"LastAmendmentDate",
"PrincipalInvestigator",
"State",
"Organization",
"ProgramManager",
"EndDate",
"AwardedAmountToDate",
"Co-PIName(s)",
"PIEmailAddress",
"OrganizationStreet",
"OrganizationCity",
"OrganizationState",
"OrganizationZip",
"OrganizationPhone",
"NSFDirectorate",
"ProgramElementCode(s)",
"ProgramReferenceCode(s)",
"Abstract",
"DirectorateOrganization",
"DivisionOrganization"
),
nameActual = c(
"descriptionAbstract",
"amountAward",
"dateAwardEffective",
"dateAwardExpiration",
"idAward",
"typeAwardInstrument",
"titleAward",
"cityInstitution",
"codeFOIA",
"idOrganization",
"codeProgramElement",
"codeProgramReference",
"countryInstitution",
"nameDirectorateFunding",
"nameDivisionFunding",
"emailInvestigator",
"nameFirstInvestigator",
"nameLastInvestigator",
"dateAmmendmentRecent",
"dateAmmendmentInitial",
"nameFOIA",
"nameInstitution",
"telephoneInstitution",
"nameProgramOfficer",
"roleInvestigator",
"dateStart",
"codeStateInstitution",
"nameStateInstitution",
"addressStreetInstitution",
"textProgramElement",
"referenceProgramElement",
"zipcodeInstitution",
"amountAwardTotalIntn",
"dateEnd",
"amountAmericanRecoveryAct",
"idAward",
"nameAward",
"codeOrganization",
"namePrograms",
"dateStart",
"dateLastAmmendment",
"namePrincipalInvestigator",
"codeState",
"nameOrganization",
"nameProgramManager",
"dateEndDate",
"amountAwardedToDate",
"namesCoPrincipalInvestigators",
"emailInvestigator",
"addressStreetOrganization",
"cityOrganization",
"stateOrganization",
"zipcodeOrganization",
"telephoneOrganization",
"codeNSFDirectorate",
"codesProgramElement",
"codesReferences",
"descriptionAward",
"nameDirectorateFunding",
"nameDivisionFunding"
)
)
}
.munge_nsf_names <-
function(data) {
dict_names <- .dictionary_nsf_names()
nsf_names <-
names(data)
actual_names <-
nsf_names %>%
map_chr(function(name) {
df_row <-
dict_names %>% filter(nameNSF == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
data %>%
set_names(actual_names)
}
.parse_nsf_xml <-
function(f) {
data <- read_xml(f) %>% xml_contents()
nodeset <- data %>%
xml2::xml_children()
data <-
seq_along(nodeset) %>%
map_dfr(function(x) {
node <- nodeset[[x]]
name_nsf <- node %>% xml_name()
children <- xml_children(x = node) %>% length()
if (children <= 1) {
text <-
xml_text(x = node)
d <- tibble(x, item = name_nsf, value = text)
return(d)
}
data <-
seq_along(children) %>%
map_dfr(function(z) {
node_child <- xml_children(x = node)
item <- node_child %>% xml_name()
text <- xml_text(x = node_child)
tibble(item = str_c(item, name_nsf), value = text)
}) %>%
mutate(x)
data
})
data <- data %>%
group_by(item) %>%
filter(value != "") %>%
summarise(value = str_c(value, collapse = " | ")) %>%
ungroup()
nsf_names <- data$item
data <-
data %>% spread(item, value) %>%
select(one_of(nsf_names))
data
}
#' NSF Research Grant urls
#'
#' This function acquires urls
#' for all National Science Foundation Research Grants
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dictionary_nsf_award_urls()
dictionary_nsf_award_urls <-
function() {
page <- "https://nsf.gov/awardsearch/download.jsp" %>% read_html()
years <-
page %>% html_nodes(".downloadcontent strong") %>% html_text() %>% str_squish()
urls <-
page %>% html_nodes(".downloadcontent a") %>% html_attr("href") %>% str_c("https://nsf.gov/awardsearch/", .)
current_year <- Sys.Date() %>% year()
data <-
tibble(years, urlNSFData = urls) %>%
separate(years,
into = c("yearAward", "sizeDataMB"),
extra = "merge",
sep = "\\-") %>%
mutate_all(str_squish) %>%
mutate(
sizeDataMB = parse_number(sizeDataMB),
yearAward = parse_number(yearAward),
isCurrentYear = yearAward == current_year,
isHistoric1976Prior = is.na(yearAward)
) %>%
select(isCurrentYear, isHistoric1976Prior, everything())
data
}
.parse_nsf_url <-
function(url = "https://nsf.gov/awardsearch/download?DownloadFileName=1960&All=true",
return_message = T) {
tmp <-
tempfile()
file <- curl_download(url, tmp)
unz_files <- unzip(file, exdir = "xml")
.parse_nsf_xml_safe <- possibly(.parse_nsf_xml, tibble())
all_data <-
unz_files %>%
map_dfr(function(x) {
x %>% message()
.parse_nsf_xml_safe(f = x)
})
all_data <-
all_data %>% .munge_nsf_names()
all_data <-
all_data %>%
mutate_at(c("idAward", "idOrganization"),
as.numeric)
amt_names <- all_data %>% select(matches("amount")) %>% names()
all_data <-
all_data %>%
mutate_at(amt_names,
list(function(x) {
x %>% as.numeric() %>% currency(digits = 0)
}))
if (all_data %>% hasName("dateAwardEffective")) {
all_data <-
all_data %>%
select(idAward, dateAwardEffective) %>%
separate_rows("dateAwardEffective", sep = "\\|") %>%
mutate(dateAwardEffective = dateAwardEffective %>% str_trim() %>% mdy()) %>%
group_by(idAward) %>%
summarise(dateAwardEffective = dateAwardEffective %>% str_c(collapse = " | ")) %>%
ungroup() %>%
left_join(all_data %>% select(-dateAwardEffective), by = "idAward") %>%
select(one_of(names(all_data)), everything())
}
if (all_data %>% hasName("dateAwardExpiration")) {
all_data <-
all_data %>%
select(idAward, dateAwardExpiration) %>%
separate_rows("dateAwardExpiration", sep = "\\|") %>%
mutate(dateAwardExpiration = dateAwardExpiration %>% str_trim() %>% mdy()) %>%
group_by(idAward) %>%
summarise(dateAwardExpiration = dateAwardExpiration %>% str_c(collapse = " | ")) %>%
ungroup() %>%
left_join(all_data %>% select(-dateAwardExpiration), by = "idAward") %>%
select(one_of(names(all_data)), everything())
}
date_cols <-
all_data %>% select(matches("date")) %>% select(-one_of(c(
"dateAwardExpiration",
"dateAwardEffective"
))) %>%
names()
if (length(date_cols) > 0) {
all_data <- all_data %>%
mutate_at(date_cols, list(mdy))
}
if (all_data %>% hasName("dateAwardExpiration")) {
all_data <- all_data %>%
mutate(dateAwardExpiration = ymd(dateAwardExpiration))
}
if (all_data %>% hasName("dateAwardEffective")) {
all_data <- all_data %>%
mutate(dateAwardEffective = ymd(dateAwardEffective))
}
df_people <-
all_data %>%
select(idAward, nameFirstInvestigator) %>%
separate_rows(nameFirstInvestigator, sep = "\\|") %>%
mutate_if(is.character, str_squish) %>%
mutate(idRow = 1:n()) %>%
left_join(
all_data %>%
select(idAward, nameLastInvestigator) %>%
separate_rows(nameLastInvestigator, sep = "\\|") %>%
mutate(idRow = 1:n()) %>%
mutate_if(is.character, str_squish),
by = c("idAward", "idRow")
) %>%
select(-idRow) %>%
mutate_if(is.character,
list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(
nameInvestigator,
nameFirstInvestigator,
nameLastInvestigator,
sep = " ",
remove = F
) %>%
mutate(nameInvestigator = nameInvestigator %>% str_squish()) %>%
group_by(idAward) %>%
summarise_all(list(function(x) {
x %>% unique() %>% sort() %>% str_c(collapse = " | ") %>% str_to_upper()
})) %>%
ungroup() %>%
mutate_at("nameInvestigator", list(function(x) {
ifelse(x %in% c("", "DATA NOT AVAILABLE"), NA_character_, x)
}))
all_data <-
all_data %>% select(-c(nameFirstInvestigator, nameLastInvestigator)) %>%
left_join(df_people, by = "idAward") %>%
select(idAward, nameInvestigator, everything())
all_data <-
all_data %>% .split_column(column = 'countryInstitution', clean = T)
all_data <-
all_data %>% .split_column(column = 'nameFOIA', clean = T)
all_data <-
all_data %>% .split_column(column = 'cityInstitution', clean = T)
all_data <-
all_data %>% .split_column(column = 'nameInstitution', clean = T)
all_data <-
all_data %>% .split_column(column = 'roleInvestigator', clean = T)
all_data <-
all_data %>% .split_column(column = "typeAwardInstrument", clean = T)
all_data <-
all_data %>% .split_column(column = "nameDirectorateFunding", clean = T)
all_data <-
all_data %>% .split_column(column = "nameDivisionFunding", clean = T)
all_data <-
all_data %>% .split_column(column = "nameStateInstitution", clean = T)
all_data <-
all_data %>% .split_column(column = "addressStreetInstitution", clean = T)
all_data <-
all_data %>% .split_column(column = "textProgramElement", clean = T)
all_data <-
all_data %>% .split_column(column = "referenceProgramElement", clean = T)
all_data <-
all_data %>% .split_column(column = "nameProgramOfficer", clean = T)
all_data <-
all_data %>% .split_column(column = "nameDirectorateFunding", clean = T)
if (all_data %>% hasName("descriptionAbstract")) {
all_data <-
all_data %>%
mutate_at(c("descriptionAbstract"),
list(function(x) {
x %>% stri_enc_toascii() %>% str_remove_all(" \032 ") %>% str_to_upper() %>% str_replace_all("<BR/>|<BR>", " ") %>% str_squish() %>% str_remove("ABSTRACT") %>% str_squish()
}))
}
if (all_data %>% hasName("titleAward")) {
all_data <-
all_data %>%
mutate_at(c("titleAward"),
list(function(x) {
x %>% stri_enc_toascii() %>% str_remove_all(" \032 ") %>% str_to_upper() %>% str_replace_all("<BR/>|<BR>", " ") %>% str_squish() %>% str_remove("ABSTRACT") %>% str_squish()
}))
}
all_data <-
all_data %>%
mutate(urlNSFData = url)
file %>% unlink()
unlink("xml", recursive = T)
unz_files %>% unlink()
if (all_data %>% hasName("zipcodeInstitution")) {
all_data <-
all_data %>%
mutate(zipcodeInstitution = zipcodeInstitution %>% substr(1, 5))
}
if (all_data %>% select(dateAmmendmentInitial, dateAmmendmentRecent) %>% ncol() == 2) {
all_data <-
all_data %>%
mutate(countDaysAmmendment = pmin(
0,
(dateAmmendmentRecent - dateAmmendmentInitial) %>% as.numeric()
))
}
if (all_data %>% select(dateAwardEffective, dateAwardExpiration) %>% ncol() == 2) {
all_data <- all_data %>%
mutate(countDaysContract = pmin((
dateAwardExpiration - dateAwardEffective
) %>% as.numeric()))
}
gc()
if (all_data %>% select(matches("address|city|codeState|zip")) %>% ncol() == 4) {
df_locations <-
all_data %>%
select(
idAward,
addressStreetInstitution,
cityInstitution,
codeStateInstitution,
zipcodeInstitution
) %>%
mutate_if(is.character, list(function(x) {
ifelse(is.na(x), "", x)
})) %>%
unite(cityState, cityInstitution, codeStateInstitution, sep = " ") %>%
unite(cityStateZip, cityState, zipcodeInstitution, sep = " ") %>%
unite(locationInstitution,
addressStreetInstitution,
cityStateZip,
sep = ", ") %>%
mutate(locationInstitution = str_squish(locationInstitution))
all_data <-
all_data %>%
left_join(df_locations, by = "idAward") %>%
select(
one_of(
"idAward",
"titleAward",
"nameInstitution",
"nameInvestigator",
"locationInstitution"
),
everything()
)
all_data <-
all_data %>%
mutate(
locationInstitution = case_when(
locationInstitution == ", ," ~ NA_character_,
locationInstitution %>% substr(1, 2) == ", " ~ locationInstitution %>% substr(3, nchar(locationInstitution)),
locationInstitution %>% substr(nchar(locationInstitution), nchar(locationInstitution)) == "," ~
locationInstitution %>% substr(1, nchar(locationInstitution) - 1),
TRUE ~ locationInstitution
)
)
}
all_data <-
all_data %>%
mutate(
urlAward = glue(
"https://www.nsf.gov/awardsearch/showAward?AWD_ID={idAward}"
) %>% as.character()
)
all_data
}
#' NSF years grants
#'
#' Download and clean grants
#' from the National Science Foundationf
#' for specified periods
#'
#' @param years vector of numeric years
#' @param clean_institutions if \code{TRUE} cleans institution names
#' @param clean_parties if \code{TRUE} cleans party names
#' @param return_message if \code{TRUE} returns message
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
nsf_years_grants <-
function(years = 2018,
clean_institutions = T,
clean_parties = T,
return_message = T
) {
df_years <-
dictionary_nsf_award_urls()
df_years <- df_years %>%
select(yearAward, urlNSFData)
urls <- df_years %>% filter(yearAward %in% years) %>% pull(urlNSFData)
.parse_nsf_url_safe <- possibly(.parse_nsf_url, tibble())
all_data <-
urls %>%
map_dfr(function(url){
.parse_nsf_url(url = url,return_message = return_message)
})
all_data <-
all_data %>%
mutate_if(is.character,
list(function(x){
ifelse(x == "", NA_character_, x) %>% str_squish()
}))
amt_names <- all_data %>% select(matches("amount")) %>% names()
all_data <-
all_data %>%
left_join(df_years, by = "urlNSFData")
all_data <-
all_data %>%
mutate_at(amt_names,
list(function(x) {
x %>% as.numeric() %>% currency(digits = 0)
})) %>%
select(yearAward, everything())
if (all_data %>% hasName("textProgramElement")) {
if (all_data %>% hasName("descriptionAbstract")) {
all_data <-
all_data %>%
mutate(
isSBIRSTTR = textProgramElement %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH"
),
haSBIRSTTRMention = descriptionAbstract %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
) |
titleAward %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
),
isSBIRSTTR = case_when(is.na(isSBIRSTTR) &
haSBIRSTTRMention ~ TRUE,
TRUE ~ isSBIRSTTR)
)
} else {
all_data <-
all_data %>%
mutate(
isSBIRSTTR = textProgramElement %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH"
),
haSBIRSTTRMention = titleAward %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
),
isSBIRSTTR = case_when(is.na(isSBIRSTTR) &
haSBIRSTTRMention ~ TRUE,
TRUE ~ isSBIRSTTR)
)
}
} else {
if (all_data %>% hasName("descriptionAbstract")) {
all_data <- all_data %>%
mutate(
haSBIRSTTRMention = descriptionAbstract %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
) |
titleAward %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
),
isSBIRSTTR = haSBIRSTTRMention
)
} else {
all_data <-
all_data %>%
mutate(
haSBIRSTTRMention = titleAward %>% str_detect(
"SBIR|STTR|SMALL BUSINESS PHASE|SMALL BUSINESS INNOVATION RESEARCH|SMALL BUSINESS TECHNOLOGY TRANSFER"
),
isSBIRSTTR = haSBIRSTTRMention
)
}
}
all_data
}
.nsf_grants <-
function() {
file_nos <- 1:10
urls <- glue("https://asbcllc.com/r_packages/govtrackR/data/nsf/nsf_grants/nsf_grants_{file_nos}.rda")
data <-
urls %>%
map_dfr(function(url){
read_rda(file = url)
})
amt_names <- data %>% select(matches("amount")) %>% names()
data <-
data %>%
mutate_at(amt_names,
list(function(x){
currency(x, digits = 0)
})) %>%
mutate(nameAgencyParent = "NATIONAL SCIENCE FOUNDATION")
name_cols <- data %>% select(matches("name")) %>% names()
data <- data %>%
mutate_at(name_cols, str_to_upper)
data <- data %>%
separate(
nameDivisionFunding,
extra = "merge",
into = c("nameDivisionFunding", "slugDivisionFunding"),
sep = "\\("
) %>%
mutate(slugDivisionFunding = slugDivisionFunding %>% str_remove_all("\\)")) %>%
mutate_if(is.character, str_squish)
data <- data %>%
mutate(nameAgencyParent = "NATIONAL SCIENCES FOUNDATION")
data
}
#' National Science Foundation Grants
#'
#' Cached version of all National Science Foundation
#' Grants since 1971
#'
#' @return
#' @export
#'
#' @examples
nsf_grants <-
function(){
.tt <- memoise::memoise(.nsf_grants)
.tt()
}
# terms -------------------------------------------------------------------
.parse_nfs_grant_term <-
function(term = "Deep Learning") {
cookies = c(
'JSESSIONID' = 'D53656E0B6FFCCF35301DA4FC8BAC9D1'
)
slug <- glue('"{term}"') %>% str_to_upper() %>% URLencode()
url_refer <-
glue('https://nsf.gov/awardsearch/advancedSearchResult?PIId=&PIFirstName=&PILastName=&PIOrganization=&PIState=&PIZip=&PICountry=&ProgOrganization=&ProgEleCode=&BooleanElement=All&ProgRefCode=&BooleanRef=All&Program=&ProgOfficer=&Keyword={slug}&AwardNumberOperator=&AwardAmount=&AwardInstrument=&ActiveAwards=true&OriginalAwardDateOperator=&StartDateOperator=&ExpDateOperator=') %>% as.character()
headers = c(
`Connection` = 'close',
`Upgrade-Insecure-Requests` = '1',
`DNT` = '1',
`User-Agent` = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
`Sec-Fetch-User` = '?1',
`Accept` = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
`Sec-Fetch-Site` = 'same-origin',
`Sec-Fetch-Mode` = 'navigate',
`Referer` = url_refer,
`Accept-Encoding` = 'gzip, deflate, br',
`Accept-Language` = 'en-US,en;q=0.9'
)
params = list(`exportType` = 'csv')
url_refer %>% message()
res <-
httr::GET(
url = 'https://nsf.gov/awardsearch/ExportResultServlet',
httr::add_headers(.headers = headers),
query = params,
httr::set_cookies(.cookies = cookies)
)
data <-
res %>%
content() %>%
as_tibble()
data <-
.munge_nsf_names(data = data) %>%
.munge_data(clean_address = F) %>%
mutate(termSearch = term ) %>%
select(termSearch, everything())
data
}
.parse_nsf_award_url <-
function(url = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=1610953") {
page <- read_html(x = url)
items <-
page %>% html_nodes(".tabletext2 strong") %>% html_text()
values <-
page %>% html_nodes(".tabletext2+ .tabletext2")
seq_along(items) %>%
map_dfr(function(x){
item <- items[[x]]
if (x == 1) {
values[x] %>% html_nodes("a") %>% html_text()
}
})
}
.munge_nsf <-
function(data) {
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.