# template guide -------------------------------------------------------------------
#
# https://www.fbo.gov/?static=interface&s=generalinfo
# dict --------------------------------------------------------------------
dictionary_fbo_solicitation_codes <- function() {
tibble(
codeSolicitation = c(
"PRESOL",
"SRCSGT",
"SNOTE",
"SSALE",
"COMBINE",
"AMDCSS",
"MOD",
"AWARD",
"JA",
"FAIROPP",
"ARCHIVE",
"UNARCHIVE",
"ITB",
"FSTD",
"EPSUPLOAD",
"DELETE"
),
typeSolicitation = c(
"PRESOLICITATION",
"SOURCES SOUGHT",
"SPECIAL NOTICE",
"SALE OF SURPLUS PROPERTY",
"COMBINED SYNOPSIS - SOLICITATION",
"AMENDMENT TO A PREVIOUS COMBINED SOLICITATION",
"MODIFICATION/AMENDMENT/CANCEL",
"AWARD",
"JUSTIFICATION AND APPROVAL",
"FAIR OPPORTUNITY / LIMITED SOURCES JUSTIFICATION",
"ARCHIVE NOTICE",
"UNARCHIVE NOTICE",
"INTENT TO BUNDLE REQUIREMENTS DOD-FUNDED",
"FOREIGN GOVERNMENT STANDARD",
"DOCUMENT UPLOAD",
"DOCUMENT DELETE"
)
)
}
#' Federal Supply Code Dictionary
#'
#' Dictionary of parent solicitation groups
#' used by FBO.gov
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_federal_supply_codes()
dictionary_federal_supply_codes <-
function() {
text_product <- "10 -- Weapons
11 -- Nuclear ordnance
12 -- Fire control equipment
13 -- Ammunition & explosives
14 -- Guided missiles
15 -- Aircraft & airframe structural components
16 -- Aircraft components & accessories
17 -- Aircraft launching, landing & ground handling equipment
18 -- Space vehicles
19 -- Ships, small craft, pontoons & floating docks
20 -- Ship and marine equipment
22 -- Railway equipment
23 -- Ground effects vehicles, motor vehicles, trailers & cycles
24 -- Tractors
25 -- Vehicular equipment components
26 -- Tires and tubes
28 -- Engines, turbines & components
29 -- Engine accessories
30 -- Mechanical power transmission equipment
31 -- Bearings
32 -- Woodworking machinery and equipment
34 -- Metalworking machinery
35 -- Service and trade equipment
36 -- Special industry machinery
37 -- Agricultural machinery & equipment
38 -- Construction, mining, excavating & highway maintenance equipment
39 -- Materials handling equipment
40 -- Rope, cable, chain & fittings
41 -- Refrigeration, air-conditioning & air circulating equipment
42 -- Fire fighting, rescue & safety equipment
43 -- Pumps & compressors
44 -- Furnace, steam plant & drying equipment; & nuclear reactors
45 -- Plumbing, heating, & sanitation equipment
46 -- Water purification & sewage treatment equipment
47 -- Pipe, tubing, hose & fittings
48 -- Valves
49 -- Maintenance & repair shop equipment
51 -- Hand tools
52 -- Measuring tools
53 -- Hardware & abrasives
54 -- Prefabricated structures and scaffolding
55 -- Lumber, millwork, plywood & veneer
56 -- Construction & building materials
58 -- Communication, detection, & coherent radiation equipment
59 -- Electrical and electronic equipment components
60 -- Fiber optics materials, components, assemblies & accessories
61 -- Electric wire & power & distribution equipment
62 -- Lighting fixtures & lamps
63 -- Alarm, signal & security detection equipment
65 -- Medical, dental & veterinary equipment & supplies
66 -- Instruments & laboratory equipment
67 -- Photographic equipment
68 -- Chemicals & chemical products
69 -- Training aids & devices
70 -- General purpose information technology equipment
71 -- Furniture
72 -- Household & commercial furnishings & appliances
73 -- Food preparation and serving equipment
74 -- Office machines, text processing systems & visible record equipment
75 -- Office supplies and devices
76 -- Books, maps & other publications
77 -- Musical instruments, phonographs & home-type radios
78 -- Recreational & athletic equipment
79 -- Cleaning equipment and supplies
80 -- Brushes, paints, sealers & adhesives
81 -- Containers, packaging, & packing supplies
83 -- Textiles, leather, furs, apparel & shoe findings, tents & flags
84 -- Clothing, individual equipment & insignia
85 -- Toiletries
87 -- Agricultural supplies
88 -- Live animals
89 -- Subsistence
91 -- Fuels, lubricants, oils & waxes
93 -- Nonmetallic fabricated materials
94 -- Nonmetallic crude materials
95 -- Metal bars, sheets & shapes
96 -- Ores, minerals & their primary products
99 -- Miscellaneous" %>%
read_lines()
text_service <- "A -- Research & Development
B -- Special studies and analysis - not R&D
C -- Architect and engineering services
D -- Information technology services, including telecommunications services
E -- Purchase of structures & facilities
F -- Natural resources & conservation services
G -- Social services
H -- Quality control, testing & inspection services
J -- Maintenance, repair & rebuilding of equipment
K -- Modification of equipment
L -- Technical representative services
M -- Operation of Government-owned facilities
N -- Installation of equipment
P -- Salvage services
Q -- Medical services
R -- Professional, administrative, and management support services
S -- Utilities and housekeeping services
T -- Photographic, mapping, printing, & publication services
U -- Education & training services
V -- Transportation, travel, & relocation services
W -- Lease or Rental of equipment
X -- Lease or rental of facilities
Y -- Construction of structures and facilities
Z -- Maintenance, repair, and alteration of real property" %>%
read_lines()
tibble(text_product) %>%
separate(text_product, into = c("idSolicitationGroup", "nameSolicitationGroup"), extra = "merge", sep = "--") %>%
mutate_all(list(function(x){
x %>% str_squish() %>% str_to_upper()
})) %>%
mutate(typeSolicitationGroup = "PRODUCT") %>%
select(typeSolicitationGroup, everything()) %>%
bind_rows(
tibble(text_service) %>%
separate(text_service, into = c("idSolicitationGroup", "nameSolicitationGroup"), extra = "merge",sep = "--") %>%
mutate_all(list(function(x){
x %>% str_squish() %>% str_to_upper()
})) %>%
mutate(typeSolicitationGroup = "SERVICE") %>%
select(typeSolicitationGroup, everything())
)
}
.munge_biz_opps_names <-
function(data) {
names_dict <- names(data)
dict <- dictionary_bizopps_names()
actual_names <-
names_dict %>%
map_chr(function(name) {
df_row <-
dict %>% filter(nameGovernment == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
data %>%
set_names(actual_names)
}
# ftp ---------------------------------------------------------------------
### https://www.fbo.gov/downloads/fbo_web_services_technical_documentation.pdf
.parse_ftp_base <- function(url = "ftp://ftp.fbo.gov/") {
data <-
url %>% read_table(col_names = F) %>%
select(5:9) %>%
setNames(c("sizeFileKB", "month", "day", "year", "nameFile")) %>%
suppressMessages()
data <-
data %>%
mutate(
nameFile = nameFile %>% str_remove_all("\\~"),
typeFile = nameFile %>% str_remove_all("[0-9]|\\.csv"),
dateFile = nameFile %>% str_remove_all("FBOFeed|DSCC|FBORecovery|FBORecoveryAwards|ftpVAN") %>% substr(1, 8) %>% ymd(),
urlFBOData = glue("ftp://ftp.fbo.gov/{nameFile}") %>% as.character(),
urlFBOData = case_when(
typeFile == "FBOBuyAmericanAct" ~ "ftp://ftp.fbo.gov/FBOBuyAmericanAct/Buy_American_Act.xls",
typeFile == "datagov" ~ "ftp://ftp.fbo.gov/datagov/FBOFullXML.xml",
typeFile == "FBORecoveryAwards" ~ glue("ftp://ftp.fbo.gov/FBORecoveryAwards/{nameFile}") %>% as.character(),
typeFile == "FBORecovery" ~ glue("ftp://ftp.fbo.gov/FBORecovery/{nameFile}") %>% as.character(),
TRUE ~ urlFBOData
)
) %>%
select(-one_of(c("month", "day", "year"))) %>%
select(typeFile, dateFile, nameFile, everything())
data
data
}
.parse_ftp_xml <-
memoise::memoise(function(url = "ftp://ftp.fbo.gov/FBOFeed20191018",
exclude_tags = NULL,
return_message = T) {
parts <-
url %>% str_split("/") %>% flatten_chr() %>% str_remove_all(".csv")
dateData <-
parts[length(parts)] %>% str_remove_all("\\FBORecovery|FBORecoveryAwards|Awards|FBOFeed|ftpVAN") %>% ymd()
typeFile <-
parts[length(parts)] %>% str_remove_all("[0-9]")
fake_xml <- read_lines(url) %>% str_c(collapse = "")
tags <- c(
'PRESOL',
'SRCSGT',
'SNOTE',
'SSALE',
'COMBINE',
'AMDCSS',
'MOD',
'AWARD',
'JA',
'FAIROPP',
'ARCHIVE',
'UNARCHIVE',
'ITB',
'FSTD',
'EPSUPLOAD',
'DELETE'
)
split_tags <-
glue("</{tags}>") %>% str_c(collapse = "|")
remove_tags <-
glue("<{tags}>") %>% str_c(collapse = "|")
parts <-
fake_xml %>%
str_split(split_tags) %>% flatten_chr() %>%
str_remove_all(remove_tags)
html_parts <-
c(
'a',
'abbr',
'acronym',
'address',
'applet',
'area',
'article',
'aside',
'audio',
'b',
'base',
'basefont',
'bdi',
'bdo',
'bgsound',
'big',
'blink',
'blockquote',
'body',
'br',
'button',
'canvas',
'caption',
'center',
'cite',
'code',
'col',
'colgroup',
'command',
'content',
'data',
'datalist',
'dd',
'del',
'details',
'dfn',
'dialog',
'dir',
'div',
'dl',
'dt',
'element',
'em',
'embed',
'fieldset',
'figcaption',
'figure',
'font',
'footer',
'form',
'frame',
'frameset',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'head',
'header',
'hgroup',
'hr',
'html',
'i',
'iframe',
'image',
'img',
'input',
'ins',
'isindex',
'kbd',
'keygen',
'label',
'legend',
'li',
'link',
'listing',
'main',
'map',
'mark',
'marquee',
'math',
'menu',
'menuitem',
'meta',
'meter',
'multicol',
'nav',
'nextid',
'nobr',
'noembed',
'noframes',
'noscript',
'object',
'ol',
'optgroup',
'option',
'output',
'p',
'param',
'picture',
'plaintext',
'pre',
'progress',
'q',
'rb',
'rbc',
'rp',
'rt',
'rtc',
'ruby',
's',
'samp',
'script',
'section',
'select',
'shadow',
'slot',
'small',
'source',
'spacer',
'span',
'strike',
'strong',
'style',
'sub',
'summary',
'sup',
'svg',
'table',
'tbody',
'td',
'template',
'textarea',
'tfoot',
'th',
'thead',
'time',
'title',
'tr',
'track',
'tt',
'u',
'ul',
'var',
'video',
'wbr',
'xmp',
" r /"
) %>% str_c(collapse = "|")
if (length(exclude_tags) > 0) {
exclude_tags <- str_to_upper(exclude_tags)
tags <- tags[!tags %in% exclude_tags]
}
all_data <-
tags %>%
map_dfr(function(tag) {
tag %>% message()
part <- parts[parts %>% str_detect(tag)]
if (length(parts) == 0) {
return(invisible())
}
all_parts <- seq_along(part)
df_all <-
all_parts %>%
map_dfr(function(x) {
glue("{x} of {max(all_parts)}") %>% message()
df <-
tibble(part = part[[x]] %>% str_split("<") %>% flatten_chr()) %>%
separate(part, into = c("item", "value"), extra = "merge",sep = ">") %>%
filter(!is.na(value)) %>%
mutate_all(function(x) {
x %>% str_replace_all(" |N/A", NA_character_) %>% str_squish()
})
df_base <-
df %>%
filter(item %>% str_detect("^[A-Z]")) %>%
filter(!value == "") %>%
filter(!value %>% is.na())
col_order <- df_base$item %>% unique()
df_base <-
df_base %>%
group_by(item) %>%
dplyr::slice(1) %>%
ungroup() %>%
spread(item, value) %>%
mutate(numberItem = x) %>%
select(numberItem, one_of(col_order), everything())
has_descriptions <-
df %>% filter(!item %>% str_detect("[A-Z]")) %>% filter(value != "") %>% nrow() > 0
if (has_descriptions) {
description <-
df %>% filter(!item %>% str_detect("[A-Z]")) %>% filter(value != "") %>%
pull(value) %>% str_c(collapse = " ") %>%
str_squish()
df_base <-
df_base %>%
mutate(detailsSolicitation = description)
}
df_base
})
df_all <-
df_all %>%
mutate(tag) %>%
select(tag, everything())
if (df_all %>% hasName("DATE")) {
df_all <-
df_all %>%
unite(DATE, DATE, YEAR, sep = "")
}
df_all <-
df_all %>%
select(-matches("Upon|Enter|Alternate"))
df_all <-
df_all %>% .munge_fpds_names()
df_all <-
df_all %>%
mutate_at(df_all %>% select(matches("date")) %>% names(),
mdy) %>%
.munge_data(clean_address = F)
df_all %>% nest(-tag)
}) %>%
rename(codeSolicitation = tag)
all_data <-
all_data %>%
left_join(
dictionary_fbo_solicitation_codes(),
by = "codeSolicitation"
) %>%
select(typeSolicitation, everything())
all_data <- all_data %>%
unnest() %>%
select(-numberItem)
if (all_data %>% hasName("zipcodeAgency")) {
all_data <- all_data %>%
rename(zipcodeOffice = zipcodeAgency)
}
if (all_data %>% hasName("nameAwardee")) {
all_data <- all_data %>%
clean_entity_data(entity_column = "nameAwardee")
}
if (return_message) {
contracts <- nrow(all_data) %>% comma(digits = 0)
glue("\n\nAcquired {contracts} for {dateData} from {typeFile}\n\n") %>% message()
}
all_data <-
all_data %>%
mutate(typeAward = ifelse(is.na(typeAward), typeSolicitation, typeAward))
all_data
})
.parse_all_xml <-
function(url = "ftp://ftp.fbo.gov/datagov/FBOFullXML.xml") {
data <- read_xml(url)
nodes <-
xml_contents(data)
all_nodes <- seq_along(nodes)
max_nodes <- max(all_nodes)
all_data <-
all_nodes %>%
map_dfr(function(x) {
glue("{x} of {max_nodes}") %>% message()
node <- nodes[[x]] %>% xml_children()
values <- node %>% xml_text()
items <- node %>% xml_name()
tibble(item = items, value = values) %>%
spread(item, value) %>%
select(one_of(items))
})
rm(data)
rm(nodes)
gc()
all_data <-
all_data %>%
.munge_fpds_names()
all_data <-
all_data %>%
mutate_if(is.character,
function(x) {
x %>% str_to_upper() %>% str_replace_all("N/A", NA_character_) %>% str_trim()
})
all_data <-
all_data %>%
.munge_data(clean_address = F) %>%
mutate(isAward = !is.na(idAward))
all_data
}
.parse_ftp_csv <-
memoise::memoise(function(url = "ftp://ftp.fbo.gov/FBORecovery/FBORecovery20090926.csv",
return_message = T) {
parts <-
url %>% str_split("/") %>% flatten_chr() %>% str_remove_all(".csv")
dateData <-
parts[length(parts)] %>% str_remove_all("\\FBORecovery|FBORecoveryAwards|Awards") %>% ymd()
typeFile <- parts[length(parts)] %>% str_remove_all("[0-9]")
data <-
url %>%
read_csv(col_names = T) %>%
suppressMessages()
data <-
data %>%
.munge_fpds_names() %>%
.munge_data(clean_address = F) %>%
mutate(dateData,
typeFile,
urlFBOData = url) %>%
select(dateData, typeFile, everything()) %>%
suppressWarnings()
if (return_message) {
contracts <- nrow(data) %>% formattable::comma(digits = 0)
glue(("Acquired {contracts} from {typeFile} for {dateData}")) %>% message()
}
data
})
#' Read FBO ftp CSV urls
#'
#' Acquire fbo csv data from FBO ftp
#'
#' @param urls vector of csv urls
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
#' fbo_ftp_csvs(urls = "ftp://ftp.fbo.gov/FBORecoveryAwards/FBORecoveryAwards20190406.csv")
parse_fbo_ftp_csvs <-
function(urls = NULL,
return_message = T) {
if (length(urls) == 0) {
stop("Please enter FBO ")
}
.parse_ftp_csv_safe <-
possibly(.parse_ftp_csv, tibble())
urls %>%
map_dfr(function(url) {
.parse_ftp_csv_safe(url = url, return_message)
}) %>%
suppressWarnings() %>%
.format_data() %>%
arrange(dateSolicitation)
}
#' Parse FBO ftp XML files
#'
#' @param urls vector of urls
#' @param return_message if \code{TRUE} returns a message
#' @param exclude_tags if not \code{NULL} fbo solicitation types to exclude
#'
#' @return a \code{tibble()}
#' @export
#'
#' @examples
#' parse_fbo_ftp_xmls(urls = "ftp://ftp.fbo.gov/FBOFeed20151017")
parse_fbo_ftp_xmls <-
function(urls, exclude_tags = NULL,
return_message = T) {
if (length(urls) == 0) {
stop("Please enter FBO ")
}
.parse_ftp_xml_safe <-
possibly(.parse_ftp_xml, tibble())
urls %>%
map_dfr(function(url) {
.parse_ftp_xml_safe(url = url, return_message)
}) %>%
suppressWarnings() %>%
.format_data() %>%
arrange(dateSolicitation)
}
#' Acquire FBO bulk FTP data
#'
#' @param type type of file \itemize{
#' \item feed - daily feed
#' \item recovery - recovery data
#' \item recovery awards - recovery awards
#' \item buy american
#' \item all - all historic daily feeds
#' }
#' @param start_date if not \code{NULL} start date
#' @param end_date if not \code{NULL} end date
#' @param return_message if \code{TRUE} returns message
#' @param distinct_solicitations if \code{TRUE} returns only the most recent distinct solicitations
#' @param exclude_tags if not \code{NULL} fbo solicitation types to exclude
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' fbo_ftp_data(type = "recovery")
#' }
fbo_ftp_data <-
function(type = NULL,
start_date = NULL,
end_date = NULL,
distinct_solicitations = TRUE,
exclude_tags = NULL,
return_message = T) {
type_slug <- type %>% str_to_lower()
if (type_slug == "buy american") {
data <-
fbo_buy_american_expenditures(return_message = return_message)
return(data)
}
if (type_slug == "all") {
data <- .parse_all_xml()
return(data)
}
slugs <-
case_when(
type_slug == "feed" ~ c("FBOFeed" , "ftpVAN"),
type_slug == "recovery" ~ "FBORecovery",
type_slug == "recovery awards" ~ "FBORecoveryAwards"
)
df_urls <-
fbo_ftp_urls()
df_urls <-
df_urls %>%
filter(typeFile %in% slugs)
if (length(start_date) > 0) {
df_urls <- df_urls %>%
filter(dateFile >= start_date)
}
if (length(end_date) > 0) {
df_urls <- df_urls %>%
filter(dateFile <= end_date)
}
if (type_slug == "feed") {
urls <- df_urls$urlFBOData
data <-
parse_fbo_ftp_xmls(urls = urls, return_message = return_message, exclude_tags = exclude_tags)
if (distinct_solicitations) {
data <-
data %>%
group_by(idSolicitation, nameSolicitation, nameAgency, descriptionSolicitation, idAward, amountAward) %>%
filter(dateSolicitation == max(dateSolicitation)) %>%
ungroup()
}
return(data)
}
urls <- df_urls$urlFBOData
all_data <-
parse_fbo_ftp_csvs(urls = urls, return_message = return_message)
all_data
}
# fbo_functions -----------------------------------------------------------
#' FBO ftp file dictionary
#'
#' Acquires all active files on
#' fbo.gov's fbo file dictionary
#'
#' @return
#' @export
#'
#' @examples
#' fbo_ftp_urls()
fbo_ftp_urls <-
function() {
urls <-
c(
"ftp://ftp.fbo.gov/",
"ftp://ftp.fbo.gov/FBORecoveryAwards/",
"ftp://ftp.fbo.gov/FBORecovery/"
)
all_data <-
urls %>%
map_dfr(function(url) {
.parse_ftp_base(url = url)
})
all_data <-
all_data %>%
filter(!nameFile %in% c("welcome.msg", "FBORecovery", "FBORecoveryAwards")) %>%
arrange(desc(dateFile))
all_data
}
#' FBO Buy American Act data
#'
#' Bulk data from
#'
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' fbo_buy_american_expenditures()
#' }
fbo_buy_american_expenditures <-
memoise::memoise(function(return_message = T) {
url <- "ftp://ftp.fbo.gov/FBOBuyAmericanAct/Buy_American_Act.xls"
outfile <- tempfile("buy_american", fileext = ".xls")
file <- curl::curl_download(url, outfile)
data <-
readxl::read_excel(file)
data <-
data %>%
.munge_fpds_names() %>%
separate(
"typeDescriptionManufacter",
into = c("locationManufactor", "descriptionReasonManufacture"),
sep = "\\ - ",
extra = "merge"
)
if (return_message) {
start <- data$dateContractSigned %>% min(na.rm = T)
end <- data$dateContractSigned %>% max(na.rm = T)
contracts <-
data %>% distinct(idContract) %>% nrow() %>% formattable::comma(digits = 0)
glue("Acquired {contracts} from {start} to {end} sourced through the Buy American Act") %>% message()
}
data
})
# options -----------------------------------------------------------------
function() {
page <-
"https://www.fbo.gov/index?s=opportunity&tab=search&mode=list" %>%
read_html()
naics_codes <-
page %>%
html_nodes("#scrollable_checkbox_dnf_class_values_procurement_notice__naics_code___ div") %>%
html_text()
classifications <-
page %>%
html_nodes(
"#scrollable_checkbox_dnf_class_values_procurement_notice__classification_code___"
) %>%
html_text() %>%
str_split("\n") %>%
flatten_chr()
classifications <- classifications[!classifications == ""]
}
# agencies ----------------------------------------------------------------
#' FBO Active Opportunities
#'
#' @param sleep_time if not \code{NULL} time in between scrapes
#' @param parse_contract_details if \code{TRUE} parses all detailed contracts
#' @param return_message if \code{TRUE} returns a message()
#'
#' @return
#' @export
#'
#' @examples
fbo_opportunities <-
function(sleep_time = NULL,
parse_contract_details = F,
exclude_awards = F,
return_message = T) {
df_urls <-
.active_urls()
data <-
df_urls$urlPage %>%
.parse.opportunity.urls()
char_cols <-
data %>%
select_if(is.character) %>%
select(-matches("url")) %>% names()
data <-
data %>%
mutate_at(char_cols,
list(function(x) {
ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .) %>% str_to_upper()
}))
if (data %>% tibble::has_name("nameAgency")) {
dict_logos <- dictionary_agency_logos()
data <-
data %>%
select(-one_of("urlAgencyLogo")) %>%
left_join(dict_logos, by = "nameAgency")
}
data <-
data %>%
mutate(statusBid = case_when(
is.na(idSolicitation) ~ "AWARD",
TRUE ~ statusBid
)) %>%
mutate(isAward = statusBid == "AWARD")
data <-
data %>%
.munge_data(clean_address = F)
if (exclude_awards) {
data <-
data %>%
filter(!isAward)
}
data
}
# search ------------------------------------------------------------------
.fbo_cookies <-
function() {
structure(
list(
"balancer.172.16.121.3",
"12c6b82f50a58f2490926325bf05cb7e",
"1"
),
.Names = c("BALANCEID", "PHPSESSID", "sympcsm_cookies_enabled")
)
}
.fbo_headers <-
function() {
data <-
structure(
list(
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"gzip, deflate, br",
"en-US,en;q=0.9",
"max-age=0",
"close",
"application/x-www-form-urlencoded",
"1",
"https://www.fbo.gov",
"https://www.fbo.gov/",
"1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.85 Safari/537.36"
),
.Names = c(
"Accept",
"Accept-Encoding",
"Accept-Language",
"Cache-Control",
"Connection",
"Content-Type",
"DNT",
"Origin",
"Referer",
"Upgrade-Insecure-Requests",
"User-Agent"
)
) %>%
flatten_df()
df_ref <- .generate_url_reference()
data$Referer <- df_ref$urlReferer
data$`User-Agent` <- df_ref$userAgent
as.list(data)
}
.fbo_params <-
function() {
structure(list("list", "opportunity", "list"),
.Names = c("mode",
"s", "tab"))
}
.fbo_data <-
function() {
data <- structure(
list(
"",
"1",
"365",
"",
"procurement_notice",
"",
"a",
"",
"",
"search",
"0",
"update",
"",
"vendor_procurement_notice_filter",
"1",
"dnf_"
),
.Names = c(
"autocomplete_input_dnf_class_values[procurement_notice][agency]",
"clear_filters_from_home",
"dnf_class_values[procurement_notice][_posted_date]",
"dnf_class_values[procurement_notice][agency]",
"dnf_class_values[procurement_notice][dnf_class_name]",
"dnf_class_values[procurement_notice][keywords]",
"dnf_class_values[procurement_notice][procurement_type][]",
"dnf_class_values[procurement_notice][set_aside][]",
"dnf_class_values[procurement_notice][zipstate]",
"dnf_opt_action",
"dnf_opt_finalize",
"dnf_opt_mode",
"dnf_opt_target",
"dnf_opt_template",
"dnf_opt_validate",
"so_form_prefix"
)
)
data
}
.get_fbo_search_result <-
function() {
cookies <-
.fbo_cookies()
headers <-
.fbo_headers()
params <-
.fbo_params()
data <-
.fbo_data()
page <-
Post(
url = 'https://www.fbo.gov/index',
headers = headers,
data = data,
cookies = cookies,
params = params
)
}
.parse.result.page <-
function(page) {
name_solicitation <-
page %>% .parse_for_text(css = ".solt") %>% str_to_upper()
href_sol <-
page %>% html_nodes(".lst-lnk-notice") %>%
html_attr("href")
urls_solicitation <-
href_sol %>%
str_c("https://www.fbo.gov/index", .)
page_ids <-
urls_solicitation %>%
map_chr(function(url) {
httr::parse_url(url = url) %>%
flatten_df() %>%
pull(id)
})
type_solicitation_group <-
page %>%
.parse_for_text(css = ".solcc")
id_solicitation <-
page %>%
.parse_for_text(css = ".soln")
data <-
seq_along(name_solicitation) %>%
map_dfr(function(x) {
name_sol <- name_solicitation[[x]]
if (name_sol %>% str_detect("\\--")) {
name_sol <-
name_sol %>% str_split("\\--") %>% flatten_chr() %>% .[[2]]
}
sol_details <-
type_solicitation_group[[x]] %>% str_split("\\ -- ") %>%
flatten_chr() %>%
str_to_upper()
tibble(
idRow = x,
idSolicitationGroup = sol_details[1],
descriptionSolicitationGroup = sol_details[2],
nameSolicitation = name_sol
)
}) %>%
mutate(idSolicitation = id_solicitation)
agency <-
page %>%
.parse_for_text(".pagency")
agency_details <-
page %>%
html_nodes(".lst-cl-first+ .lst-cl") %>%
gsub(pattern = '<.*?>', replacement = "|", .) %>%
str_trim()
offices <-
seq_along(agency) %>%
map_chr(function(x) {
actual_name <- agency[[x]]
actual_details <- agency_details[[x]]
actual_details %>% str_remove_all(actual_name) %>%
str_trim() %>%
str_split("\\|") %>%
flatten_chr() %>%
str_trim() %>%
gsub("\\s+", " ", .) %>%
discard(function(x) {
x == ""
}) %>%
str_c(collapse = "|")
})
data <-
data %>%
mutate(detailsAgency = agency_details,
nameOffice = offices) %>%
separate(nameOffice,
into = c("nameOffice", "detailsOffice"),
extra = "merge",
sep = "\\|") %>%
select(-detailsAgency)
type_setaside <-
page %>% .parse_for_text(css = ".lst-cl:nth-child(3)")
posted <-
page %>% .parse_for_text(css = ".lst-cl:nth-child(4)") %>%
lubridate::mdy()
data <-
data %>%
mutate(
nameAgency = agency,
typeSetAside = type_setaside,
datePosted = posted,
idURL = page_ids,
urlSolicitation = urls_solicitation
)
data <-
data %>%
mutate(
isModified = typeSetAside %>% str_detect("(Modified)"),
isCombinedSolicitation = typeSetAside %>% str_detect("Combined Synopsis/Solicitation"),
isAward = typeSetAside %>% str_detect("Award"),
isPreSolicitation = typeSetAside %>% str_detect("Presolicitation"),
typeSetAside = typeSetAside %>% str_replace_all(
"Combined Synopsis/Solicitation",
"Combined Synopsis - Solicitation"
) %>% str_replace_all("\\(", "") %>% str_replace_all("\\)", "") %>% str_replace_all("Modified", "")
) %>%
separate(typeSetAside,
into = c("statusBid", "typeSetAside"),
extra = "merge",
sep = " / ") %>%
mutate_if(is.character,
funs(. %>% str_trim())) %>%
mutate_if(is.character,
funs(ifelse(. == "", NA, .))) %>%
suppressWarnings()
data <-
data %>%
select(-one_of("idRow")) %>%
dplyr::select(which(colMeans(is.na(.)) < 1)) %>%
select(idSolicitation, one_of(c(
"nameAgency", "nameOffice", "detailsOffice"
)), everything()) %>%
suppressWarnings()
data
}
.parse.opportunity.url <-
function(url = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=250") {
page <-
url %>%
read_html()
data <-
.parse.result.page(page = page) %>%
mutate(urlOpportunityPage = url) %>%
dplyr::select(dplyr::matches("Solicitation"), everything()) %>%
.munge_data()
data
}
.parse.opportunity.urls <-
function(urls = c(
"https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=25",
"https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=29"
),
sleep_time = NULL,
return_message = T) {
df <-
tibble()
success <- function(res) {
url <-
res$url
if (return_message) {
glue::glue("Parsing {url}") %>%
message()
}
.parse.opportunity.url.safe <-
purrr::possibly(.parse.opportunity.url, tibble())
all_data <-
.parse.opportunity.url.safe(url = url)
if (!sleep_time %>% purrr::is_null()) {
Sys.sleep(time = sleep_time)
}
df <<-
df %>%
bind_rows(all_data)
}
failure <- function(msg) {
tibble()
}
urls %>%
map(function(x) {
curl_fetch_multi(url = x, success, failure)
})
multi_run()
df
}
.active_urls <-
function() {
page <-
"https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=1" %>%
read_html()
active <-
page %>%
html_nodes(".lst-head .lst-cnt") %>%
html_text() %>%
str_split(" of ") %>%
flatten_chr() %>%
.[[2]] %>%
as.numeric()
pages <- active %/% 100
all_pages <- 1:pages
urls <-
glue::glue(
"https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID={all_pages}"
) %>%
as.character()
tibble(idPage = all_pages,
urlPage = urls)
}
# offices -----------------------------------------------------------------
# urlsearch ---------------------------------------------------------------
.generate_url_reference <-
function() {
user_agents <-
c(
"Mozilla/5.0 (Linux; U; en-US) AppleWebKit/528.5+ (KHTML, like Gecko, Safari/528.5+) Version/4.0 Kindle/3.0 (screen 600x800; rotate)",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
"Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36"
)
user_agent <-
user_agents[!user_agents %>% str_detect("bot|slurp")] %>%
sample(1)
tl_domain <-
c('.com', '.gov', '.org', '.mil', '.co') %>%
sample(1)
word_length <-
8:15
words <-
word_length %>% sample(1)
domain_slug <-
1:words %>%
map_chr(function(x) {
sample(letters, 1)
}) %>%
paste0(collapse = '')
url <-
list('http://', domain_slug, tl_domain) %>%
purrr::reduce(paste0)
df <-
tibble(urlReferer = url,
userAgent = user_agent)
return(df)
}
# contracts ---------------------------------------------------------------
.parse.contract_general <-
function(page) {
df_attrs <-
.get_page_attribute_data(page)
widgets <-
df_attrs %>%
filter(class == "widget") %>%
pull(id)
df_widgets <-
widgets %>%
map_dfr(function(id) {
css_id <-
glue::glue("#{id}") %>% as.character()
name <-
id %>% str_replace_all("dnf_class_values_procurement_notice__|__widget", "") %>% str_split("_") %>%
flatten_chr() %>% discard( ~ .x == "") %>% str_c(collapse = "_")
text <-
page %>%
html_nodes(css = css_id) %>%
html_text() %>%
str_trim()
tibble(id, name, text)
})
data <-
df_widgets %>%
filter(!name %>% str_detect("packages|additional_info_link")) %>%
select(-id) %>%
distinct() %>%
distinct()
actual_names <- data$name %>% .resolve_bizopp_ids()
data <-
data %>%
mutate(nameActual = actual_names) %>%
select(nameActual, text) %>%
mutate(
text = if_else(text == "N/A", NA_character_, text),
text = if_else(text == "-", NA_character_, text)
) %>%
filter(!is.na(text))
col_order <-
data$nameActual
data <-
data %>%
distinct() %>%
spread(nameActual, text) %>%
select(one_of(col_order))
data
}
.parse.contract.url <-
function(url = "https://www.fbo.gov/index?s=opportunity&mode=form&id=2de499d566906e1f47a49e9ba00d0ea9&tab=core&tabmode=list&=",
return_message = T) {
if (return_message) {
glue::glue("Parsing {url}") %>% message()
}
page <-
url %>%
read_html()
name_solicitation <-
page %>%
html_nodes("h2") %>%
html_text() %>%
str_to_upper() %>%
str_c(collapse = " ")
id_solicitiation <-
page %>%
html_nodes(".sol-num") %>%
html_text() %>%
str_replace_all("Solicitation Number: ", "")
agency_info <-
page %>%
html_nodes(".agency-name") %>%
str_replace_all("<br>", "\n") %>%
read_html() %>%
html_text() %>%
str_split("\n") %>%
flatten_chr() %>%
suppressWarnings()
df_agency <-
tibble(agency_info) %>%
separate(agency_info,
into = c("item", "value"),
extra = "merge",
sep = "\\: ") %>%
left_join(tibble(
item = c("Agency", "Office", "Location"),
nameActual = c("nameAgency", "nameOfficeAgency", "nameLocationAgency")
)) %>%
select(nameActual, value) %>%
spread(nameActual, value) %>%
suppressMessages()
data <-
page %>%
.parse.contract_general()
if (name_solicitation %>% str_detect("\\--")) {
sol_details <-
name_solicitation %>% str_split("\\--") %>% flatten_chr()
idSol <-
sol_details %>% .[[1]]
name_sol <-
sol_details %>% .[[2]]
df_sol <-
tibble(
idSolicitationGroup = idSol,
descriptionSolicitationGroup = name_sol,
nameSolicitation = name_solicitation
)
} else {
df_sol <- tibble(nameSolicitation = name_solicitation)
}
data <-
df_agency %>%
bind_cols(df_sol) %>%
bind_cols(data) %>%
mutate(idSolicitation = id_solicitiation) %>%
select(idSolicitation, nameSolicitation, everything())
data <-
data %>%
munge_data() %>%
suppressWarnings()
if (page %>% html_nodes("additional_info_link") %>% length() > 0) {
names <-
page %>%
html_nodes("#dnf_class_values_procurement_notice__additional_info_link__widget a") %>%
html_text() %>%
str_trim()
url <- page %>%
html_nodes("#dnf_class_values_procurement_notice__additional_info_link__widget a") %>%
html_attr("href")
tibble(urlLinkAdditional = url,
nameLinkAdditional = names)
}
notice_history <-
page %>%
html_nodes(".notice-hist")
urls_solicitation <-
notice_history %>% html_nodes("a") %>% html_attr("href") %>%
str_c("https://www.fbo.gov", ., sep = "")
url_length <-
length(urls_solicitation) + 1
df_releases <-
1:url_length %>%
map_dfr(function(x) {
css <-
glue::glue("li:nth-child({x})") %>% as.character()
text <-
notice_history %>% html_nodes(css = css) %>% html_children() %>%
html_text()
if (text %>% length() == 0) {
return(invisible())
}
if (text %>% str_detect("Complete View") %>% sum(na.rm = T)) {
return(invisible())
}
type <-
text[1]
start_d <-
length(text) - 1
dates <-
text[start_d:length(text)] %>%
str_c(collapse = " ") %>%
lubridate::mdy_hm()
tibble(
datetimePosted = dates,
typePosting = type %>% str_to_upper(),
numberContractRelease = x
) %>%
select(numberContractRelease, typePosting, everything())
}) %>%
mutate(urlSolicitationPrior = urls_solicitation)
if (df_releases %>% nrow() > 0) {
data <-
data %>%
mutate(dataRevisions = list(df_releases))
}
## files
file_nodes <-
page %>% html_nodes(".sb:nth-child(1) .pkg")
if (file_nodes %>% length() > 0) {
df_files <-
seq_along(file_nodes) %>%
map_dfr(function(x) {
nameFile <-
file_nodes[[x]] %>% html_nodes("a") %>% html_text() %>% str_trim() %>% discard( ~ .x == "")
urlFileZip <-
file_nodes[[x]] %>% html_nodes("a") %>% html_attr("href") %>%
keep( ~ .x %>% str_detect("dZip")) %>%
str_c("https://www.fbo.gov", .)
dateFile <-
file_nodes[[x]] %>% html_text() %>% str_replace_all(nameFile, "") %>% str_trim() %>% lubridate::mdy()
df <-
tibble(numberFile = x, nameFile, dateFile, urlFileZip)
file_nodes[[x]] %>% html_nodes("a") %>% html_attr("target")
df
})
file_n <-
page %>% html_nodes(".sb:nth-child(1)")
files <-
file_n %>%
html_nodes("a") %>% html_attr('href') %>%
keep( ~ .x %>% str_detect("/utils/view?")) %>%
str_c("https://www.fbo.gov", .)
if (length(files) == nrow(df_files)) {
df_files <-
df_files %>%
left_join(tibble(urlFile = files) %>%
mutate(numberFile = 1:n())) %>%
suppressMessages()
}
data <-
data %>%
mutate(dataFiles = list(df_files))
}
data <-
data %>%
mutate(urlSolicitation = url)
has_logo <-
page %>%
html_nodes('.agency-logo img') %>% length() > 0
if (has_logo) {
url_logo <-
page %>%
html_nodes('.agency-logo img') %>%
html_attr("src") %>%
str_c("https://www.fbo.gov/utils/", ., collapse = "")
data <-
data %>%
mutate(urlAgencyLogo = url_logo)
}
data
}
.parse_fbo_solicitation_urls <-
function(urls = NULL,
return_message = T,
sleep_time = NULL) {
if (urls %>% purrr::is_null()) {
stop("Please Enter URLS")
}
df <-
tibble()
success <- function(res) {
url <-
res$url
.parse.contract.url.safe <-
purrr::possibly(.parse.contract.url, tibble())
all_data <-
.parse.contract.url.safe(url = url)
if (!sleep_time %>% purrr::is_null()) {
Sys.sleep(time = sleep_time)
}
df <<-
df %>%
bind_rows(all_data)
}
failure <- function(msg) {
tibble()
}
urls %>%
map(function(x) {
curl_fetch_multi(url = x, success, failure)
})
multi_run()
df
}
#' Parse FBO contract url opportunities
#'
#'
#' @param urls a vector of urls
#' @param return_message if \code{TRUE} returns a message
#' @param sleep_time if not \code{NULL} system sleps for specified time
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' parse_fbo_solicitation_urls(urls =c("https://www.fbo.gov/index?s=opportunity&mode=form&id=887c71827400287a1d225950fab458e3&tab=core&_cview=0", "https://www.fbo.gov/index?s=opportunity&mode=form&id=5e77c77f41711c5774dac7e4b40e74be&tab=core&_cview=0"), return_message = T, sleep_time = NULL)
parse_fbo_solicitation_urls <-
function(urls = NULL,
return_message = T,
sleep_time = NULL) {
if (length(urls) == 0) {
"Enter URLs" %>% message()
return(invisible())
}
.parse_fbo_solicitation_urls_safe <-
possibly(.parse_fbo_solicitation_urls, tibble())
data <-
.parse_fbo_solicitation_urls(urls = urls,
return_message = return_message,
sleep_time = sleep_time)
char_cols <-
data %>%
select_if(is.character) %>%
select(-matches("url")) %>% names()
data <-
data %>%
mutate_at(char_cols,
list(function(x) {
ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .)
})) %>%
.munge_data(clean_address = F)
if (data %>% tibble::has_name("nameAgency")) {
dict_logos <- dictionary_agency_logos()
data <-
data %>%
select(-one_of("urlAgencyLogo")) %>%
left_join(dict_logos, by = "nameAgency")
}
d_f_cols <-
data %>%
select(matches("data|feature")) %>%
names()
if (length(d_f_cols) > 0) {
df_features <-
data %>%
select(one_of("idSolicitation"), matches("data|feature")) %>%
mutate_at(d_f_cols, list(function(x) {
x %>% map_dbl(length) > 0
}))
new_name <- d_f_cols %>% substr(1, 1) %>% str_to_upper() %>%
str_c(d_f_cols %>% substr(2, nchar(d_f_cols)), sep = "")
new_names <- str_c("has", new_name)
names(df_features)[names(df_features) %in% d_f_cols] <-
new_names
data <-
data %>%
left_join(df_features, by = "idSolicitation")
}
data
}
#' FBO Active Solicitations
#'
#' Scraped and cached version of all non
#' award solicitations on FBO.gov
#'
#' @return
#' @export
#'
#' @examples
fbo_scraped_active <-
function() {
urls <- glue("https://asbcllc.com/r_packages/govtrackR/data/fbo/active_opportunity_cache/all_fbo_active/all_fbo_active_{1:10}.rda")
.read_rda_safe <- possibly(read_rda, tibble())
read_rda_m <- memoise::memoise(.read_rda_safe)
data <-
urls %>%
map_dfr(function(url){
read_rda_m(file = url)
})
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.