# sql ---------------------------------------------------------------------
## https://aws.amazon.com/blogs/aws/new-usaspending-gov-on-an-amazon-rds-snapshot/
# urls --------------------------------------------------------------------
.fpds_years <- function(url = "https://www.fpds.gov/ddps/directory_browser/index.php") {
page <- url %>% read_html()
links <- page %>% html_nodes("td a")
urls <- links %>% html_attr('href') %>% discard(function(x){x %>% str_detect("/ddps")}) %>% str_c("https://www.fpds.gov/ddps/directory_browser/index.php",.)
years <- links %>% html_text() %>% str_trim() %>% discard(function(x){x == ""})
tibble(years, urlFPDSYear = urls) %>%
separate(years, sep = "\\-", into = c("yearFiscal", "versionFPDS"),extra = "merge") %>%
mutate(yearFiscal = yearFiscal %>% str_replace_all("FY", "20") %>% as.numeric())
}
.parse_fpds_year <-
function(url = "https://www.fpds.gov/ddps/directory_browser/index.php?somepath=../FY05-V1.4", return_message = T) {
if (return_message) {
glue("Parsing {url}") %>% message()
}
page <- url %>% read_html()
links <-
page %>% html_nodes("td a")
urls <-
links %>% html_attr('href') %>% discard(function(x){x %>% str_detect("/ddps")}) %>% str_c("https://www.fpds.gov/ddps/directory_browser/index.php",.)
offices <-
links %>% html_text() %>% str_trim() %>% discard(function(x){x == ""})
tibble(offices, urlOfficeYear = urls) %>%
separate(offices,
sep = "\\-",
extra = "merge",
into = c("idOffice", "slugOffice")) %>%
mutate(
slugOffice = ifelse(is.na(slugOffice), idOffice, slugOffice),
idOffice = ifelse(slugOffice == idOffice, NA_character_, idOffice)
) %>%
mutate(urlFPDSYear = url)
}
#' FPDS Archive XML Links
#'
#' Acquires all links for access to FPDS XML archives
#'
#' @param join_office_features if \code{TRUE} joins features for matching office id
#' @param return_message if \code{TRUE} returns messages
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' fpds_xml_urls()
fpds_xml_urls <-
function(join_office_features = T, return_message = T) {
df_years <- .fpds_years()
.parse_fpds_year_safe <- possibly(.parse_fpds_year, tibble())
all_data <-
1:nrow(df_years) %>%
map_dfr(function(x){
df_row <- df_years %>% dplyr::slice(x)
fy <- df_row$yearFiscal
url <- df_row$urlFPDSYear
if (return_message) {
glue("Acquiring all FPDS agency urls for {fy}") %>% message()
}
data <- .parse_fpds_year_safe(url = url, return_message = return_message)
data %>%
mutate(yearFiscal = fy) %>%
select(yearFiscal, everything())
})
if (join_office_features){
dict_agencies <- dictionary_government_agencies()
d <- dict_agencies %>% select(idOffice, nameAgency, nameOffice, idDepartment, descriptionMission, url, urlCongressionalJustification)
all_data <- all_data %>%
left_join(d, by = "idOffice") %>%
select(yearFiscal, idOffice, names(d), everything())
}
all_data
}
# fpds_xml ----------------------------------------------------------------
.parse_fpds_xml <-
function(doc) {
contract_nodes <-
doc %>% xml_children() %>% length()
if (contract_nodes < 2) {
return(tibble())
}
all_data <- 2:contract_nodes %>%
map_dfr(function(x) {
glue("Parsing {x-1} of {contract_nodes-1} expenditures") %>% message()
xml_contract <-
doc %>% xml_child(x) %>% xml_contents()
contract_node_names <-
xml_name(xml_contract)
data <-
1:length(contract_node_names) %>%
map_dfr(function(table_no) {
node_contents <-
xml_contract[[table_no]] %>% as_list() %>% unlist()
if (length(node_contents) == 0) {
return(invisible())
}
tibble(item = names(node_contents),
value = as.character(node_contents)) %>%
mutate(nameNode = contract_node_names[[table_no]]) %>%
select(nameNode, everything())
})
data %>%
mutate(idTransactionOfficeYear = x - 1) %>%
select(idTransactionOfficeYear, everything())
})
df_items <-
all_data %>%
distinct(item)
dict_items <-
df_items$item %>%
map_dfr(function(item) {
tibble(
item = item,
itemActual =
item %>% str_remove_all(
"\\^placeOfPerformance.|vendorHeader.|vendorBusinessTypes.|federalGovernment.|vendorSiteDetails.|vendorSiteDetails.vendorLineOfBusiness.|vendorSocioEconomicIndicators.|minorityOwned.|vendorHeader.|vendorBusinessTypes.|treasuryAccount.treasuryAccountSymbol.|vendorSiteDetails.|awardContractID.|listOfTreasuryAccounts.|listOfTreasuryAccounts.treasuryAccount.treasuryAccountSymbol.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.vendorCertifications.|vendorSiteDetails.typeOfEducationalEntity.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.typeOfGovernmentEntity.|vendorSiteDetails.vendorRelationshipWithFederalGovernment.|vendorSiteDetails.vendorBusinessTypes.businessOrOrganizationType.|vendorSiteDetails.vendorLineOfBusiness.|vendorSiteDetails.vendorBusinessTypes.localGovernment.|vendorSiteDetails.vendorBusinessTypes.federalGovernment.|vendorSiteDetails.vendorSocioEconomicIndicators."
) %>% str_remove_all(
"listOfTreasuryAccounts.|vendorOrganizationFactors.|typeOfEducationalEntity.|localGovernment.|businessOrOrganizationType.|vendorLineOfBusiness.|vendorRelationshipWithFederalGovernment.|typeOfGovernmentEntity.|profitStructure.|vendorCertifications.|treasuryAccountSymbol.|relevantContractDates.|OtherTransactionAwardContractID.|purchaserInformation."
) %>% str_remove_all(
"\\^vendor.|^dollarValues.|^contractData.|^competition.|^transactionInformation."
)
)
})
dict_names <- dictionary_fpds_names()
fdps_names <-
dict_items$itemActual
actual_names <-
fdps_names %>%
map_chr(function(name) {
df_row <-
dict_names %>% filter(nameFPDS == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
dict_items <-
dict_items %>%
mutate(nameActual = actual_names)
all_data <-
all_data %>%
left_join(dict_items, by = "item")
all_data <- all_data %>%
select(-one_of(c("item", "itemActual", "nameNode")))
col_order <-
c("idTransactionOfficeYear", unique(all_data$nameActual))
all_data <-
all_data %>%
group_by(idTransactionOfficeYear, nameActual) %>%
mutate(countItem = 1:n() - 1) %>%
ungroup() %>%
mutate(nameActual = case_when(countItem == 0 ~ nameActual,
TRUE ~ str_c(nameActual, countItem))) %>%
select(-countItem) %>%
spread(nameActual, value) %>%
select(one_of(col_order), everything())
all_data <-
all_data %>%
.munge_data(clean_address = F)
all_data
}
.dl_fpds_xml <-
function(url = "https://www.fpds.gov/ddps/directory_browser/index.php?somepath=..%2FFY19-V1.5%2F1145-PEACECORPS&n=4", return_message = T) {
page <- url %>% read_html()
url_file <-
page %>% html_nodes('a') %>% html_attr("href") %>% keep(function(x){
x %>% str_detect("zip")
}) %>%
substr(4, nchar(.)) %>%
str_c("https://www.fpds.gov/ddps/",.)
outfile <- tempfile("download", fileext = ".zip")
file <- curl::curl_download(url_file, outfile)
unz_files <- unzip(file, exdir = "xml")
.parse_fpds_xml_safe <- possibly(.parse_fpds_xml, tibble())
all_data <-
unz_files %>%
map_dfr(function(file) {
if (return_message) {
glue("Parsing {file}")
}
doc <- read_xml(file)
data <- .parse_fpds_xml(doc = doc)
data
})
unz_files %>% unlink()
file %>% unlink()
unlink("xml", recursive = T)
gc()
all_data
}
.dl_fpds_xml_urls <-
function(urls = c(
"https://www.fpds.gov/ddps/directory_browser/index.php?somepath=..%2FFY19-V1.5%2F1145-PEACECORPS&n=4"
),
return_message = T) {
df <-
tibble()
success <- function(res) {
url <-
res$url
if (return_message) {
glue::glue("Parsing {url}") %>%
message()
}
.dl_fpds_xml_safe <-
purrr::possibly(.dl_fpds_xml, tibble())
all_data <-
.dl_fpds_xml_safe(url = url, return_message = return_message)
df <<-
df %>%
bind_rows(all_data)
}
failure <- function(msg) {
tibble()
}
urls %>%
map(function(x) {
curl_fetch_multi(url = x, success, failure)
})
multi_run()
df
}
#' Download FPDS Agency URLS
#'
#' @param urls
#' @param include_psc
#' @param include_naics
#' @param return_message
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
download_fpds_xml_urls <-
function(urls = NULL,
include_psc = T,
include_naics = T,
return_message = T,
...) {
if (length(urls) == 0) {
stop("Please enter vector of FPDS xml URLS")
}
all_data <-
.dl_fpds_xml_urls(urls = urls, return_message = return_message)
all_data <- all_data %>% .munge_data(clean_address = F)
dict_agencies <- dictionary_government_agencies()
if (all_data %>% hasName("idOfficeAward")) {
all_data <-
all_data %>%
left_join(
dict_agencies %>% select(
idOfficeAward = idOffice,
nameAgencyAward = nameAgency,
nameOfficeAward = nameOffice
),
by = "idOfficeAward"
)
}
if (all_data %>% hasName("idOfficeContracting")) {
all_data <-
all_data %>%
left_join(
dict_agencies %>% select(
idOfficeContracting = idOffice,
nameOfficeContracting = nameOffice
),
by = "idOfficeContracting"
)
}
if (all_data %>% hasName("idOfficeIDV")) {
all_data <-
all_data %>%
left_join(dict_agencies %>% select(idOfficeIDV = idOffice, nameOfficeIDV = nameOffice),
by = "idOfficeIDV")
}
if (include_naics) {
all_data <-
all_data %>%
left_join(dictionary_naics_codes(), by = "idNAICS")
}
if (include_psc) {
all_data <-
all_data %>%
left_join(dictionary_psc_active() %>% select(codeProductService, nameFullPSC),
by = "codeProductService")
}
all_data
}
# angency -----------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.