## https://github.com/GSA/IAE-Architecture/blob/master/as-is/tech-docs/SAM/SAMWebServicesExtractsMappingsv1.0/SAM%20Master%20Extract%20Mapping%20v5.5%20Sensitive%20File%20V2%20Layout.xlsx
# utils -------------------------------------------------------------------
.parse_html_description <-
function(description) {
description %>%
stri_enc_toascii() %>%
read_html() %>%
html_text() %>%
stringi::stri_trans_general("Latin-ASCII") %>%
stri_enc_toascii() %>% str_replace_all(" \032 ", " ") %>% str_to_upper() %>%
str_squish()
}
.filter_duns <-
function(data, duns) {
data %>%
filter(idDUNS %in% duns)
}
.clean_usg_organizations <-
function(data,
column = "nameDepartment",
include_slug = T) {
if (!data %>% hasName(column)) {
return(data)
}
df_dict <-
data %>%
select(column) %>%
distinct() %>%
filter(!is.na(!!sym(column))) %>%
arrange(!!sym(column))
new_col <- glue("{column}Clean") %>% as.character()
df_dict <-
df_dict %>%
separate(
!!sym(column),
sep = "\\,",
c("part1", "part2"),
fill = "right",
extra = "merge",
remove = F
) %>%
mutate_all(str_squish)
df_dict <-
df_dict %>%
mutate(UQ(new_col) := case_when(
is.na(part2) ~ part1,
!is.na(part2) ~ str_c(part2, part1, sep = " ")
)) %>%
select(one_of(column, new_col)) %>%
mutate_at(new_col, list(function(x) {
x %>% str_remove_all("ATTN:") %>% stri_enc_toascii() %>%
gsub("\\s+", " ", .) %>%
str_squish()
}))
if (include_slug) {
slug_col <-
new_col %>% str_replace_all("name", "slug") %>% str_remove_all("Clean")
df_dict <-
df_dict %>%
separate(
UQ(new_col),
into = c(new_col, slug_col),
sep = "\\(",
fill = "right",
extra = "merge"
)
df_dict <-
df_dict %>%
mutate_at(slug_col,
list(function(x) {
x %>% str_remove_all("\\)")
})) %>%
.remove_na()
}
df_dict <- df_dict %>%
mutate(
UQ(new_col) := case_when(
!!sym(new_col) == "DEPT OF DEFENSE" ~ "DEPARTMENT OF DEFENSE",
TRUE ~ !!sym(new_col)
)
)
data <-
data %>%
left_join(df_dict, by = column) %>%
select(-column) %>%
rename(UQ(column) := UQ(new_col)) %>%
select(one_of(names(data)), everything()) %>%
mutate_if(is.character, str_squish)
data
}
# dictionaries ------------------------------------------------------------
#' SAM Bulk Entity File Dictionary
#'
#' Returns list of the most recent
#' SAM Entity Data Dictionary
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_entity_files()
dictionary_sam_entity_files <-
function() {
data <-
"https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1575841130052&domain=Entity%20Information" %>%
fromJSON() %>%
.[[1]]
data <- data[[1]] %>%
as_tibble() %>%
setNames(c(
"nameFile",
"dateModified",
"nameBucketS3",
"keyS3",
"urlFile"
)) %>%
mutate(dateModified = mdy(dateModified))
urls <- data$urlFile$self %>% pull()
date_slugs <- urls %>%
(
"https://s3.amazonaws.com/falextracts/Entity Information/|\\SAM_PUBLIC_MONTHLY_|entityinformation|csv|ZIP"
) %>% ("\\.")
data <-
data %>%
mutate(
urlFile = urls,
dateSlug = date_slugs,
typeAPI = case_when(
nameFile %>% str_detect("SAM_PUBLIC") ~ "SAM V2",
TRUE ~ "SAM OLD"
),
dateData = case_when(typeAPI == "SAM V2" ~ ymd(dateSlug),
TRUE ~ mdy(dateSlug)),
isMostRecent = dateData == dateData %>% max()
) %>%
select(dateData, nameFile, everything()) %>%
select(-dateSlug)
data
}
#' SAM Parent Entity Dictionary
#'
#' Returns information about SAM parent entity types.
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_parent_entity_types()
dictionary_sam_parent_entity_types <-
function() {
"2J - Sole Proprietorship
2K - Partnership or Limited Liability Partnership
2L - Non Tax Exempt Corporate Entity
8H - Tax Exempt Corporate Entity
2A - United States Government Entity
CY - Foreign Country or Government
X6 - International Organization
ZZ - Other" %>%
str_split("\n") %>%
flatten_chr() %>%
str_squish() %>%
tibble(type = .) %>%
separate(
type,
into = c("slugEntityStructure", "typeEntityStructure"),
sep = "\\ - "
) %>%
mutate_all(str_to_upper)
}
.dictionary_sam_extract_names <- function() {
tibble(
idColumn = 1:149,
nameColumn = c(
"idDUNS",
"slugDUNS",
"slugCAGE",
"slugDeptDefenseAddressCode",
"slugSAMExtract",
"slugRegistrationPurpose",
"dateRegistrationInitial",
"dateExpiration",
"dateLastUpdate",
"dateActivation",
"nameCompanyLegal",
"nameCompanyDBA",
"nameCompanyDivision",
"codeCompanyDivision",
"addressStreet1Company",
"addressStreet2Company",
"cityCompany",
"stateCompany",
"zipcodeCompany",
"zipcode4Company",
"codeCountryCompany",
"idCongressionalDistrictCompany",
"dateCompanyStart",
"monthdayFiscalYearEnd",
"urlCompany",
"slugEntityStructure",
"stateIncorporation",
"codeCountryIncorporation",
"countBusinessTypes",
"dataBusinessTypes",
"idNAICSPrimary",
"countNAICS",
"dataNAICS",
"countProductServiceCodes",
"dataProductServiceCodes",
"hasCreditCardUsage",
"slugCorrespondenceFlag",
"addressStreet1MailingCompany",
"addressStreet2MailingCompany",
"cityMailingCompany",
"zipcodeMailingCompany",
"zipcode4MailingCompany",
"codeCountryMailingCompany",
"stateMailingCompany",
"nameFirstPointOfContactGovt",
"nameMiddlePointOfContactGovt",
"nameLastPointOfContactGovt",
"titlePointOfContactGovt",
"addressStreet1PointOfContactGovt",
"addressStreet2PointOfContactGovt",
"cityPointOfContactGovt",
"zipcodePointOfContactGovt",
"zip4PointOfContactGovt",
"codeCountryPointOfContactGovt",
"statePointOfContactGovt",
"telephonePointOfContactGovt",
"telephoneExtensionPointOfContactGovt",
"telephoneNonUSPointOfContactGovt",
"faxPointOfContactGovt",
"emailPointOfContactGovt",
"nameFirstPointOfContactGovtAlt",
"nameMiddlePointOfContactGovtAlt",
"nameLastPointOfContactGovtAlt",
"titlePointOfContactGovtAlt",
"addressStreet1PointOfContactGovtAlt",
"addressStreet2PointOfContactGovtAlt",
"cityPointOfContactGovtAlt",
"zipcodePointOfContactGovtAlt",
"zip4PointOfContactGovtAlt",
"codeCountryPointOfContactGovtAlt",
"statePointOfContactGovtAlt",
"telephonePointOfContactGovtAlt",
"telephoneExtensionPointOfContactGovtAlt",
"telephoneNonUSPointOfContactGovtAlt",
"faxPointOfContactGovtAlt",
"emailPointOfContactGovtAlt",
"nameFirstPointOfContactPastPerformance",
"nameMiddlePointOfContactPastPerformance",
"nameLastPointOfContactPastPerformance",
"titlePointOfContactPastPerformance",
"addressStreet1PointOfContactPastPerformance",
"addressStreet2PointOfContactPastPerformance",
"cityPointOfContactPastPerformance",
"zipcodePointOfContactPastPerformance",
"zip4PointOfContactPastPerformance",
"codeCountryPointOfContactPastPerformance",
"statePointOfContactPastPerformance",
"telephonePointOfContactPastPerformance",
"telephoneExtensionPointOfContactPastPerformance",
"telephoneNonUSPointOfContactPastPerformance",
"faxPointOfContactPastPerformance",
"emailPointOfContactPastPerformance",
"nameFirstPointOfContactPastPerformanceAlt",
"nameMiddlePointOfContactPastPerformanceAlt",
"nameLastPointOfContactPastPerformanceAlt",
"titlePointOfContactPastPerformanceAlt",
"addressStreet1PointOfContactPastPerformanceAlt",
"addressStreet2PointOfContactPastPerformanceAlt",
"cityPointOfContactPastPerformanceAlt",
"zipcodePointOfContactPastPerformanceAlt",
"zip4PointOfContactPastPerformanceAlt",
"codeCountryPointOfContactPastPerformanceAlt",
"statePointOfContactPastPerformanceAlt",
"telephonePointOfContactPastPerformanceAlt",
"telephoneExtensionPointOfContactPastPerformanceAlt",
"telephoneNonUSPointOfContactPastPerformanceAlt",
"faxPointOfContactPastPerformanceAlt",
"emailPointOfContactPastPerformanceAlt",
"nameFirstPointOfContactElectronicBusiness",
"nameMiddlePointOfContactElectronicBusiness",
"nameLastPointOfContactElectronicBusiness",
"titlePointOfContactElectronicBusiness",
"addressStreet1PointOfContactElectronicBusiness",
"addressStreet2PointOfContactElectronicBusiness",
"cityPointOfContactElectronicBusiness",
"zipcodePointOfContactElectronicBusiness",
"zip4PointOfContactElectronicBusiness",
"codeCountryPointOfContactElectronicBusiness",
"statePointOfContactElectronicBusiness",
"telephonePointOfContactElectronicBusiness",
"telephoneExtensionPointOfContactElectronicBusiness",
"telephoneNonUSPointOfContactElectronicBusiness",
"faxPointOfContactElectronicBusiness",
"emailPointOfContactElectronicBusiness",
"nameFirstPointOfContactElectronicBusinessAlternate",
"nameMiddlePointOfContactElectronicBusinessAlternate",
"nameLastPointOfContactElectronicBusinessAlternate",
"titlePointOfContactElectronicBusinessAlternate",
"addressStreet1PointOfContactElectronicBusinessAlternate",
"addressStreet2PointOfContactElectronicBusinessAlternate",
"cityPointOfContactElectronicBusinessAlternate",
"zipcodePointOfContactElectronicBusinessAlternate",
"zip4PointOfContactElectronicBusinessAlternate",
"codeCountryPointOfContactElectronicBusinessAlternate",
"statePointOfContactElectronicBusinessAlternate",
"telephonePointOfContactElectronicBusinessAlternate",
"telephoneExtensionPointOfContactElectronicBusinessAlternate",
"telephoneNonUSPointOfContactElectronicBusinessAlternate",
"faxPointOfContactElectronicBusinessAlternate",
"emailPointOfContactElectronicBusinessAlternate",
"countNAICSExceptions",
"dataNAICSExceptions",
"hasDebtSubjectToOffsetClause",
"slugExclusionStatusFlag",
"countSBATypes",
"dataSBACodes",
"slugPublicDisplay",
"countDisasterResponseCodes",
"dataDisasterResponse"
)
)
}
#' SAM Business Type Codes
#'
#' Dictionary of the SAM business types
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_business_types()
dictionary_sam_business_types <- function() {
data <-
structure(
list(
c(
"2R",
"2F",
"12",
"3I",
"CY",
"A7",
"20",
"1D",
"LJ",
"XS",
"MF",
"2X",
"A8",
"2U",
"HK",
"A3",
"A5",
"QF",
"A2",
"23",
"FR",
"QZ",
"OY",
"PI",
"NB",
"8W",
"27",
"8E",
"8C",
"8D",
"NG",
"QW",
"C8",
"C7",
"ZR",
"MG",
"C6",
"H6",
"TW",
"UD",
"8B",
"86",
"KM",
"T4",
"H2",
"6D",
"M8",
"G6",
"G7",
"G8",
"HB",
"1A",
"1R",
"ZW",
"GW",
"OH",
"HS",
"QU",
"G3",
"G5",
"BZ",
"80",
"FY",
"HQ",
"05",
"OW",
"XY",
"8U",
"1B",
"FO",
"TR",
"G9",
"JX",
"V2",
"VW"
),
c(
"UNITED STATES FEDERAL GOVERNMENT",
"UNITED STATES STATE GOVERNMENT",
"UNITED STATES LOCAL GOVERNMENT",
"TRIBAL GOVERNMENT",
"FOREIGN GOVERNMENT",
"ABILITYONE NON PROFIT AGENCY",
"FOREIGN OWNED",
"SMALL AGRICULTURAL COOPERATIVE",
"LIMITED LIABILITY COMPANY",
"SUBCHAPTER S CORPORATION",
"MANUFACTURER OF GOODS",
"FOR PROFIT ORGANIZATION",
"NON-PROFIT ORGANIZATION",
"OTHER NOT FOR PROFIT ORGANIZATION",
"COMMUNITY DEVELOPMENT CORPORATION OWNED FIRM",
"LABOR SURPLUS AREA FIRM",
"VETERAN OWNED BUSINESS",
"SERVICE DISABLED VETERAN OWNED BUSINESS",
"WOMAN OWNED BUSINESS",
"MINORITY OWNED BUSINESS",
"ASIAN-PACIFIC AMERICAN OWNED",
"SUBCONTINENT ASIAN (ASIAN-INDIAN) AMERICAN OWNED",
"BLACK AMERICAN OWNED",
"HISPANIC AMERICAN OWNED",
"NATIVE AMERICAN OWNED",
"WOMAN OWNED SMALL BUSINESS",
"SELF CERTIFIED SMALL DISADVANTAGED BUSINESS",
"ECONOMICALLY DISADVANTAGED WOMEN SMALL OWNED BUSINESS",
"JOINT VENTURE WOMEN OWNED SMALL BUSINESS",
"JOINT VENTURE ECONOMICALLY DISADVANTAGED WOMEN SMALL OWNED BUSINESS",
"FEDERAL AGENCY",
"FEDERALLY FUNDED RESEARCH AND DEVELOPMENT CENTER",
"CITY",
"COUNTY",
"INTER-MUNICIPAL",
"LOCAL GOVERNMENT OWNED",
"MUNICIPALITY",
"SCHOOL DISTRICT",
"TRANSIT AUTHORITY",
"COUNCIL OF GOVERNMENTS",
"HOUSING AUTHORITIES PUBLIC/TRIBAL",
"INTERSTATE ENTITY",
"PLANNING COMMISSION",
"PORT AUTHORITY",
"COMMUNITY DEVELOPMENT CORPORATION",
"DOMESTIC SHELTER",
"EDUCATIONAL INSTITUTION",
"1862 LAND GRANT COLLEGE",
"1890 LAND GRANT COLLEGE",
"1994 LAND GRANT COLLEGE",
"HISTORICALLY BLACK COLLEGE OR UNIVERSITY",
"MINORITY INSTITUTION",
"PRIVATE UNIVERSITY OR COLLEGE",
"SCHOOL OF FORESTRY",
"HISPANIC SERVICING INSTITUTION",
"STATE CONTROLLED INSTITUTION OF HIGHER LEARNING",
"TRIBAL COLLEGE",
"VETERINARY COLLEGE",
"ALASKAN NATIVE SERVICING INSTITUTION",
"NATIVE HAWAIIAN SERVICING INSTITUTION",
"FOUNDATION",
"HOSPITAL",
"VETERINARY HOSPITAL",
"DOT CERTIFIED DBE",
"ALASKAN NATIVE CORPORATION OWNED FIRM",
"AMERICAN INDIAN OWNED",
"INDIAN TRIBE - FEDERALLY RECOGNIZED",
"NATIVE HAWAIIAN ORGANIZATION OWNED FIRM",
"TRIBALLY OWNED FIRM",
"TOWNSHIP",
"AIRPORT AUTHORITY",
"OTHER THAN ONE OF THE PROCEEDING",
"SELF-CERTIFIED HUBZONE JOINT VENTURE",
"GRANTS",
"CONTRACTS AND GRANTS"
)
),
.Names = c("slugBusinessType", "typeBusinessSAM"),
class = c("tbl_df",
"tbl", "data.frame"),
row.names = c(NA, -75L)
)
### add types
data
}
#' SAM Extract Codes
#'
#' Returns information about
#' SAM extract fields, ie, what type of
#' record is most recent.
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_extract_types()
dictionary_sam_extract_types <-
function() {
tibble(
slugSAMExtract = c("A", "E", "1", "2", "3", "4"),
typeSAMExtract = c("Active", "Expired", "Deleted", "New", "Update", "Expired") %>% str_to_upper()
)
}
#' SAM Registration Code Dictionary
#'
#' Information about the reasons for
#' the SAM entity registration
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_registration_purpose_codes()
dictionary_sam_registration_purpose_codes <-
function() {
tibble(
slugRegistrationPurpose = c("Z1", "Z2", "Z3", "Z4", "Z5"),
typeRegistrationPurpose = c(
"Federal Assistance",
"All Awards",
"IGT-Only",
"IGT | Federal Assistance",
"All Awards | IGT"
) %>% str_to_upper()
)
}
#' SBA Setaside Codes for SAM
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_sba_set_asides()
dictionary_sam_sba_set_asides <-
function() {
structure(
list(
c("HUBZONE CERTIFIED", "8A PARTICPANT", "8A JOINT VENTURE"),
c("XX", "A6", "JT")
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -3L),
.Names = c("typeSetAside", "slugSetAside")
)
}
#' Dictionary SAM Bulk Entity API URL
#'
#' @return
#' @export
#'
#' @examples
dictionary_sam_bulk_entity_urls <- function() {
json <-
"https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=&domain=Entity%20Information" %>% fromJSON(simplifyDataFrame = T)
data <- json[[1]]$customS3ObjectSummaryList
urls <- data$`_links` %>% unlist() %>% as.character()
data <-
data[, 1:(ncol(data) - 1)] %>%
as_tibble() %>%
mutate(urlFile = urls)
data %>%
mutate(
dateModified = mdy(dateModified),
typeEntitiesFile = case_when(
displayKey %>% str_detect("SAM_PUBLIC") ~ "SAM",
TRUE ~ "ENTITY"
),
dateData = case_when(
typeEntitiesFile == "ENTITY" ~
displayKey %>% str_remove_all(".csv|entityinformation") %>% mdy(),
TRUE ~
displayKey %>% str_remove_all(".ZIP|SAM_PUBLIC_MONTHLY_") %>%
ymd()
)
) %>%
select(dateData, dateModified, urlFile)
}
# general -----------------------------------------------------------------
# # https://open.gsa.gov/api/
# entities ----------------------------------------------------------------
### https://beta.sam.gov/data-services?domain=none
.dl_zip <-
function(url) {
tmp <-
tempfile()
file <- curl_download(URLencode(url), tmp)
unz_files <- unzip(file, exdir = "xml")
data <-
unz_files %>%
fread(header = F, verbose = F,showProgress = FALSE, quote = "")
unz_files %>% unlink()
file %>% unlink()
unlink("xml", recursive = T)
gc()
data <-
data %>%
as_tibble()
data
}
## https://open.gsa.gov/api/sam-entity-extracts-api/v1/public_extract_layout.pdf
.parse_bulk_sam_entities <-
function(url = "https://s3.amazonaws.com/falextracts/Entity Information/SAM_PUBLIC_MONTHLY_20191201.ZIP") {
options(scipen = 999999)
data <-
.dl_zip(url = url) %>%
select(-150)
data <- data %>%
setNames(.dictionary_sam_extract_names() %>% pull(nameColumn))
data <- data %>%
mutate(idDUNS = as.integer(idDUNS))
data <-
data %>% mutate_if(is.character,
list(function(x) {
x %>%
str_squish()
}))
data <-
data %>%
mutate_if(is.character,
list(function(x) {
case_when(x %in% c("", " ") ~ NA_character_,
TRUE ~ x)
}))
logical_cols <-
data %>% select(matches("^is|^has")) %>% names()
data <- data %>%
mutate_at(logical_cols,
list(function(x) {
case_when(x == "N" ~ FALSE,
x == "Y" ~ TRUE,
TRUE ~ NA)
}))
data <-
data %>%
left_join(dictionary_sam_parent_entity_types(), by = "slugEntityStructure")
data <-
data %>%
mutate(slugSAMExtract = as.character(slugSAMExtract)) %>%
left_join(dictionary_sam_extract_types(), by = "slugSAMExtract")
data <- data %>%
left_join(dictionary_sam_registration_purpose_codes(), by = "slugRegistrationPurpose")
data <- data %>%
mutate(isNonPublicSAMRegistered = case_when(slugPublicDisplay == "NPDY" ~ TRUE,
TRUE ~ FALSE)) %>%
select(-slugPublicDisplay)
data <-
data %>%
mutate(isExpiredSAMEntity = typeSAMExtract == "EXPIRED") %>%
select(idDUNS, isExpiredSAMEntity, everything())
df_types <-
data %>%
filter(!is.na(dataBusinessTypes)) %>%
select(idDUNS, dataBusinessTypes) %>%
separate_rows("dataBusinessTypes", sep = "~") %>%
rename(slugBusinessType = dataBusinessTypes) %>%
filter(!is.na(slugBusinessType))
df_types <-
df_types %>%
left_join(dictionary_sam_business_types(), by = "slugBusinessType") %>%
nest(dataBusinessTypes = c(slugBusinessType, typeBusinessSAM))
data <-
data %>%
select(-dataBusinessTypes) %>%
left_join(df_types, by = "idDUNS") %>%
select(one_of(names(data)), everything())
rm(df_types)
gc()
df_response_codes <-
data %>%
filter(!is.na(dataDisasterResponse)) %>%
select(idDUNS, dataDisasterResponse) %>%
rename(slugDisasterResponseCode = dataDisasterResponse) %>%
separate_rows(slugDisasterResponseCode, sep = "\\~") %>%
filter(!is.na(slugDisasterResponseCode)) %>%
mutate(
typeAreaResponse = case_when(
slugDisasterResponseCode == "ANY" ~ "ANY AREA",
slugDisasterResponseCode %>% str_detect("^STA") ~ "STATE",
slugDisasterResponseCode %>% str_detect("^CTY") ~ "COUNTY",
slugDisasterResponseCode %>% str_detect("^MSA") ~ "METROPOLITAN SERVICE AREA"
),
codeAreaResponse =
case_when(
slugDisasterResponseCode == "ANY" ~ NA_character_,
TRUE ~ substr(slugDisasterResponseCode,
4,
nchar(slugDisasterResponseCode))
)
)
df_response_codes <-
df_response_codes %>%
nest(
dataDisasterResponse = c(slugDisasterResponseCode,
typeAreaResponse,
codeAreaResponse)
)
data <-
data %>%
select(-dataDisasterResponse) %>%
left_join(df_response_codes, by = "idDUNS")
rm(df_response_codes)
gc()
dict_naics <-
dictionary_naics_codes()
data <-
data %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
data <-
data %>%
left_join(dict_naics %>% select(idNAICSPrimary = idNAICS, nameNAICSPrimary = nameNAICS),
by = "idNAICSPrimary") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics <-
data %>%
select(idDUNS, idNAICSPrimary, dataNAICS) %>%
filter(!is.na(dataNAICS)) %>%
separate_rows(dataNAICS, sep = "\\~") %>%
rename(codeNAICS = dataNAICS) %>%
filter(!is.na(codeNAICS)) %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics <-
df_naics %>%
mutate(hasSmallBusinessIndicator = codeNAICS %>% str_detect("Y$")) %>%
mutate(idNAICS = parse_number(codeNAICS),
isPrimaryNAICS = idNAICS == idNAICSPrimary) %>%
select(idDUNS, isPrimaryNAICS, idNAICS, hasSmallBusinessIndicator)
df_naics <-
df_naics %>%
left_join(dict_naics, by = "idNAICS")
df_naics <- df_naics %>% select(-yearCodeBookNAICS)
data <-
data %>% select(-dataNAICS) %>%
left_join(df_naics %>%
nest(
dataNAICS = c(
isPrimaryNAICS,
idNAICS,
hasSmallBusinessIndicator,
nameNAICS,
isPrimaryNAICS
)
), by = "idDUNS") %>%
distinct() %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics_exceptions <-
data %>%
select(idDUNS, dataNAICSExceptions) %>%
filter(!is.na(dataNAICSExceptions)) %>%
separate_rows(dataNAICSExceptions, sep = "\\~") %>%
rename(slugNAICS = dataNAICSExceptions) %>%
mutate_if(is.character, str_squish) %>%
filter(slugNAICS != "") %>%
mutate(idNAICS = slugNAICS %>% substr(1, 6) %>% as.integer()) %>%
mutate(isException = T) %>%
left_join(dict_naics %>% select(idNAICS, nameNAICS), by = "idNAICS")
df_naics_exceptions <-
df_naics_exceptions %>%
mutate(
slugExceptions = slugNAICS %>% str_remove_all("[0-9]"),
countExceptions = slugExceptions %>% nchar()
) %>%
nest(
dataNAICSExceptions = c(
slugNAICS,
idNAICS,
isException,
nameNAICS,
slugExceptions,
countExceptions
)
)
data <-
data %>%
select(-dataNAICSExceptions) %>%
left_join(df_naics_exceptions, by = "idDUNS") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
rm(df_naics)
rm(dict_naics)
gc()
dict_psc <-
dictionary_psc_active(only_active = T)
df_psc <-
data %>%
filter(!is.na(dataProductServiceCodes)) %>%
select(idDUNS, dataProductServiceCodes) %>%
separate_rows(dataProductServiceCodes, sep = "\\~") %>%
rename(codeProductService = dataProductServiceCodes)
df_psc <-
df_psc %>%
filter(!codeProductService == "") %>%
mutate(
letternumber = codeProductService %>% substr(1, 1),
isNumber = letternumber %>% str_detect("[0-9]")
) %>%
mutate(
idSolicitationGroup = case_when(
isNumber ~ codeProductService %>% substr(1, 2),
TRUE ~ codeProductService %>% substr(1, 1)
)
) %>%
select(-c(letternumber, isNumber))
dict_psc <-
dict_psc %>% select(typePSC,
codeProductService,
nameProductService)
df_psc <-
df_psc %>%
left_join(dict_psc, by = c("codeProductService"))
df_psc <-
df_psc %>%
nest(
dataProductServiceCodes = c(
typePSC,
nameProductService,
idSolicitationGroup,
codeProductService
)
)
data <-
data %>%
select(-dataProductServiceCodes) %>%
left_join(df_psc, by = "idDUNS") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
if (data %>% hasName("urlCompany")) {
data <- data %>%
mutate(
urlCompany = case_when(
urlCompany %>% str_detect("http:|https:") ~ urlCompany,
is.na(urlCompany) ~ NA_character_,
TRUE ~ glue("https://{urlCompany}") %>% as.character()
)
)
}
rm(df_psc)
gc()
df_sba <-
data %>%
filter(!is.na(dataSBACodes)) %>%
select(idDUNS, dataSBACodes) %>%
separate_rows(dataSBACodes, sep = "\\~") %>%
rename(codeSetAside = dataSBACodes)
df_sba <- df_sba %>%
mutate(
typeSetAside = case_when(
codeSetAside == "XX" ~ "HUBZONE CERTIFIED",
codeSetAside %>% str_detect("^A6") ~ "8A PARTICPANT",
codeSetAside %>% str_detect("^JT") ~ "8A JOINT VENTURE"
),
slugSetAside = codeSetAside %>% substr(1, 2)
)
df_sba <- df_sba %>%
mutate(
dateExpirationSBA = case_when(
slugSetAside == "XX" ~ NA_character_,
TRUE ~ codeSetAside %>% substr(3, nchar(codeSetAside))
) %>% as.numeric() %>% ymd()
) %>%
select(-codeSetAside)
df_sba <- df_sba %>%
nest(dataSBA = c(typeSetAside, slugSetAside, dateExpirationSBA))
data <-
data %>%
select(-dataSBACodes) %>%
left_join(df_sba, by = "idDUNS")
date_names <-
data %>%
select(matches("date")) %>%
select(-matches("dateCompanyStart")) %>%
names()
data <-
data %>%
mutate_at(date_names,
list(function(x) {
x %>% ymd()
}))
data <- data %>%
mutate(
charDate = nchar(dateCompanyStart),
ymdDate = case_when(
charDate == 7 ~ dateCompanyStart %>% substr(2, 7) %>% as.integer(),
TRUE ~ dateCompanyStart
),
date = ymd(ymdDate),
year = year(date),
ymdDate = case_when(
year <= 1120 ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
idDUNS %in% c(
942278250,
807634428,
81559260,
54766010,
64402055,
78443143,
967454070
) ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
TRUE ~ as.character(ymdDate)
),
date = ymd(ymdDate),
year = year(date)
) %>%
select(-c(dateCompanyStart, year, charDate, ymdDate)) %>%
rename(dateCompanyStart = date) %>%
select(one_of(names(data)), everything())
data <-
data %>%
mutate_at(c("nameCompanyLegal",
"nameCompanyDivision",
"nameCompanyDBA"),
list(function(x) {
x %>% str_replace_all("\\, INC.|\\, INC|\\,INCORPORATED", " INC") %>%
str_replace_all("\\, LLC|\\, L.L.C|\\, L.L.C.", " LLC") %>%
str_replace_all("\\, LLP|\\, L.L.P|\\, L.L.P.", " LLP") %>%
str_replace_all("\\, LP|\\, L.P|\\, L.P.", " LP") %>%
str_replace_all("\\, LTD.|\\, LTD", " LTD") %>%
str_replace_all("\\, COMPANY", " COMPANY") %>%
str_replace_all("\\,CORP", " CORP") %>%
str_remove_all("\\*") %>%
str_remove_all('\\"')
}))
data <-
data %>%
mutate(
countDaysPostFoundingToSAMRegistration = (dateActivation - dateCompanyStart) %>% as.integer(),
countDaysExistence = (Sys.Date()-dateCompanyStart) %>% as.integer()
)
lower_names <-
data %>% select_if(is.character) %>%
select(matches("^email|^url")) %>%
names()
data <- data %>%
mutate_at(lower_names, str_to_lower)
upper_names <-
data %>% select_if(is.character) %>% select(-one_of(lower_names)) %>% names()
data <-
data %>%
mutate_at(upper_names, str_to_upper)
data <-
data %>%
mutate(
nameContactGovernment = case_when(
!is.na(nameFirstPointOfContactGovt) &
!is.na(nameLastPointOfContactGovt) ~ str_c(
nameFirstPointOfContactGovt,
nameLastPointOfContactGovt,
sep = " "
),
TRUE ~ NA_character_
),
nameContactGovernmentAlt = case_when(
!is.na(nameFirstPointOfContactGovtAlt) &
!is.na(nameLastPointOfContactGovtAlt) ~ str_c(
nameFirstPointOfContactGovtAlt,
nameLastPointOfContactGovtAlt,
sep = " "
),
TRUE ~ NA_character_
),
nameContactPastPerformance = case_when(
!is.na(nameFirstPointOfContactPastPerformance) &
!is.na(nameLastPointOfContactPastPerformance) ~ str_c(
nameFirstPointOfContactPastPerformance,
nameLastPointOfContactPastPerformance,
sep = " "
),
TRUE ~ NA_character_
),
nameContactElectronic = case_when(
!is.na(nameFirstPointOfContactElectronicBusiness) &
!is.na(nameLastPointOfContactElectronicBusiness) ~ str_c(
nameFirstPointOfContactElectronicBusiness,
nameLastPointOfContactElectronicBusiness,
sep = " "
),
TRUE ~ NA_character_
)
)
data <- data %>%
mutate(
typeEntity =
case_when(
typeEntityStructure == "OTHER" ~ "OTHER",
typeEntityStructure %>% str_detect("GOVERNMENT") ~ "GOVERNMENT",
TRUE ~ "CORPORATION"
),
isTaxExemptEntity = typeEntityStructure %in% c(
"FOREIGN COUNTRY OR GOVERNMENT",
"TAX EXEMPT CORPORATE ENTITY",
"UNITED STATES GOVERNMENT ENTITY"
)
)
data <- data %>%
mutate(
isForeignLocatedCompany = codeCountryCompany != "USA",
isForeignIncorporatedcompany = codeCountryIncorporation != "USA"
)
data <- data %>%
mutate(
hasBusinessTypes = dataBusinessTypes %>% map_dbl(length) > 0,
hasSBA = dataSBA %>% map_dbl(length) > 0,
hasProductServiceCodes = dataProductServiceCodes %>% map_dbl(length) > 0,
hasNAICS = dataNAICS %>% map_dbl(length) > 0,
hasDisasterResponse = dataDisasterResponse %>% map_dbl(length) > 0,
hasNAICSExceptions = dataNAICSExceptions %>% map_dbl(length) > 0
)
gc()
data
}
.bulk_entities_api_old <-
function(file_type = "ENTITY",
sensitivity = "PUBLIC",
frequency = "daily",
date = Sys.Date() - 1,
api_key = "O4kzViWGVYNumPqhAzUhYGiZZZwW3RKUEYJOI6ii",
only_active = F,
sam_extract_filter = NULL,
psc_filter = NULL,
naics_filter = NULL,
incorporation_company_filter = NULL,
company_state_filter = NULL,
company_zipcode_filter = NULL,
company_country_filter = NULL,
set_aside_filter = NULL,
business_type_filter = NULL,
return_message = T) {
date <- ymd(date)
date_year <- year(date)
month_date <- month(date)
day_date <- day(date)
month_date <-
case_when(nchar(month_date) == 1 ~ str_c("0", month_date),
TRUE ~ as.character(month_date))
day_slug <-
case_when(nchar(day_date) == 1 ~ str_c("0", day_date),
TRUE ~ as.character(day_date))
period_slug <-
glue("{month_date}/{day_slug}/{year(date)}") %>% as.character()
url <-
glue(
"https://api.sam.gov/prod/dataservices/v1/extracts?api_key={api_key}&fileType={str_to_upper(file_type)}&sensitivity={str_to_upper(sensitivity)}&frequency={str_to_upper(frequency)}&date={period_slug}"
)
if (return_message) {
glue("Parsing {url}") %>% message()
}
data <- .parse_bulk_sam_entities(url = url)
data <-
data %>%
mutate(dateData = date,
frequencyData = str_to_upper(frequency)) %>%
select(frequencyData, dateData, everything())
if (length(sam_extract_filter) > 0 &&
str_to_upper(frequency) == "DAILY") {
if (return_message) {
glue(
"Including only {str_c(sam_extract_filter, collapse = ',')} SAM entity registration(s)"
)
}
data <- data %>%
filter(typeSAMExtract %in% str_to_upper(sam_extract_filter))
}
if (only_active) {
if (return_message) {
"Filtering for only active SAM companies" %>% message()
}
data <- data %>%
filter(!isExpiredSAMEntity)
}
if (length(psc_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a registered PSC:\n{str_c(psc_filters, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasProductServiceCodes) %>%
select(idDUNS, dataProductServiceCodes) %>%
unnest_legacy() %>%
filter(codeProductService %in% psc_filters) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(naics_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a registered naics:\n{str_c(naics_filters, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasNAICS) %>%
select(idDUNS, dataNAICS) %>%
unnest_legacy() %>%
filter(idNAICS %in% naics_filters) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(set_aside_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a set aside code of:\n{str_c(set_aside_filter, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasSBA) %>%
select(idDUNS, dataSBA) %>%
unnest_legacy() %>%
filter(slugSetAside %in% str_to_upper(set_aside_filter)) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(incorporation_company_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies domiciled in:\n{str_c(incorporation_company_filter, collapse = '\n')}"
) %>% message()
}
data <-
data %>%
filter(codeCountryIncorporation %in% str_to_upper(incorporation_company_filter))
}
if (length(company_country_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies head-quartered in:\n{str_c(company_country_filter, collapse = '\n')}"
) %>% message()
}
data <-
data %>%
filter(codeCountryCompany %in% str_to_upper(company_country_filter))
}
if (length(company_state_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies headquarters in:\n{str_c(company_state_filter, collapse = '\n')}"
) %>% message()
}
data <- data %>%
filter(stateCompany %in% str_to_upper(company_state_filter))
}
if (length(company_zipcode_filter)) {
if (return_message) {
glue(
"Filtering for companies located in zipcode(s):\n{str_c(company_zipcode_filter, collapse = '\n')}"
) %>% message()
}
data <- data %>%
filter(zipcodeCompany %in% str_to_upper(as.character(company_zipcode_filter)))
}
if (length(business_type_filter) > 0) {
if (return_message) {
glue(
"Filtering for business types:\n{str_c(business_type_filter, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasBusinessTypes) %>%
select(idDUNS, dataBusinessTypes) %>%
unnest_legacy() %>%
filter(slugBusinessType %in% str_to_upper(business_type_filter)) %>%
pull(idDUNS) %>%
unique()
data <-
.filter_duns(data = data, duns = filter_duns)
}
name_cols <- data %>% select(matches("name")) %>% names()
data <- data %>%
mutate_at(name_cols,
list(function(x) {
x %>% str_replace_all(" OF, ", " OF ")
}))
if (data %>% hasName("slugDUNS")) {
data <- data %>% mutate(slugDUNS = as.character(slugDUNS))
}
data <- data %>%
mutate_if(is.numeric, as.numeric)
data
}
.bulk_sam_entities_old <-
function(frequencies = c("monthly"),
dates = c(Sys.Date()),
api_key = "O4kzViWGVYNumPqhAzUhYGiZZZwW3RKUEYJOI6ii",
only_active = F,
sam_extract_filter = NULL,
psc_filter = NULL,
naics_filter = NULL,
incorporation_company_filter = NULL,
company_state_filter = NULL,
company_zipcode_filter = NULL,
company_country_filter = NULL,
set_aside_filter = NULL,
business_type_filter = NULL,
file_type = "ENTITY",
sensitivity = "PUBLIC",
return_message = T) {
df_input <-
expand_grid(date = dates,
frequency = str_to_upper(frequencies)) %>%
as_tibble()
df_input <-
df_input %>% filter(frequency == "MONTHLY") %>% filter(date == max(date)) %>%
bind_rows(df_input %>% filter(frequency == "DAILY"))
all_data <-
1:nrow(df_input) %>%
map_dfr(function(x) {
df_row <-
df_input %>% dplyr::slice(x)
.bulk_entities_api(
frequency = df_row$frequency,
date = df_row$date,
api_key = api_key,
only_active = only_active,
psc_filter = psc_filter,
sam_extract_filter = sam_extract_filter,
naics_filter = naics_filter,
file_type = file_type,
sensitivity = sensitivity,
incorporation_company_filter = incorporation_company_filter,
company_state_filter = company_state_filter,
company_zipcode_filter = company_zipcode_filter,
company_country_filter = company_country_filter,
set_aside_filter = set_aside_filter,
business_type_filter = business_type_filter,
return_message = return_message
)
})
if (return_message) {
"Generating FAR and DFAR link urls" %>% message()
}
df_duns <- all_data %>%
distinct(idDUNS) %>%
mutate(char = nchar(idDUNS),
zeros = 9 - char)
df_duns <- df_duns %>%
filter(zeros != 0) %>%
mutate(
slugZero = zeros %>% map_chr(function(x) {
rep("0", times = x) %>% str_c(collapse = "")
}),
slugDUNS = glue("{slugZero}{idDUNS}") %>% as.character()
) %>%
select(idDUNS, slugDUNS) %>%
bind_rows(df_duns %>% filter(zeros == 0) %>%
select(idDUNS) %>%
mutate(slugDUNS = as.character(idDUNS))) %>%
mutate(
urlFARPDF = glue(
"https://www.sam.gov/SAM/filedownload?pdfType=1&duns={slugDUNS}"
) %>% as.character(),
urlDFARPDF = glue(
"https://www.sam.gov/SAM/filedownload?pdfType=2&duns={slugDUNS}"
) %>% as.character()
) %>%
select(-slugDUNS)
all_data <-
all_data %>%
left_join(df_duns, by = "idDUNS")
all_data
}
#' SAM Active Contract Opportunities
#'
#' Returns all active contract information
#' from SAM contract opportunity data store.
#'
#' @param exclude_archived if \code{TRUE} excludes archived listings
#' @param exclude_awards if \code{TRUE} excludes awards
#' @param only_active if \code{TRUE} only
#' @param include_unknown_responses if \code{TRUE} keep opportunities with unknown response dates
#' @param snake_names
#' @param join_address
#'
#' @return
#' @export
#'
#' @examples
#' bulk_sam_contract_opportunities()
bulk_sam_contract_opportunities <-
function(exclude_archived = T,
only_active = T,
include_unknown_responses = F,
snake_names = F,
join_address = T,
exclude_awards = T) {
data <-
"https://s3.amazonaws.com/falextracts/Contract%20Opportunities/datagov/ContractOpportunitiesFullCSV.csv" %>%
read_csv(guess_max = 100000)
data <-
data %>%
.munge_biz_opps_names()
data <-
data %>%
filter(!idNotice %>% str_detect("<"))
data <-
data %>%
mutate(amountAward = currency(parse_number(as.character(amountAward)), digits = 0))
upper_cols <-
data %>% select_if(is.character) %>% select(-matches("email|idNotice|url")) %>% names()
lower_cols <-
data %>% select_if(is.character) %>% select(matches("email|url")) %>% names()
data <- data %>%
mutate_at(upper_cols, str_to_upper)
data <-
data %>% mutate_at(lower_cols, str_to_lower)
data <-
data %>% mutate_if(is.character, str_squish)
date_cols <-
data %>%
select_if(is.character) %>%
select(matches("^datetime[A-Z]|^date[A-Z]")) %>% names()
if (data %>% hasName("nameSolicitation")) {
data <-
data %>%
mutate(
nameSolicitation = nameSolicitation %>% str_remove_all("^[A-Z]--") %>%
str_remove_all("^[0-9][0-9]--") %>%
str_squish() %>% str_to_upper()
)
}
if (length(date_cols) > 0) {
data <-
data %>%
mutate_at(date_cols,
list(function(x) {
case_when(is.na(x) ~ NA_character_,
TRUE ~ x %>% substr(1, 10)) %>% ymd()
}))
}
date_cols <-
data %>% select(matches("^date")) %>% names()
df_years <-
data %>%
select(date_cols) %>%
transmute_at(date_cols, year)
names(df_years) <-
names(df_years) %>% str_replace_all("datetime", "year") %>% str_replace_all("date", "year")
data <-
data %>%
bind_cols(df_years)
data <-
data %>%
mutate(
hasNoticeUpdates = typeNotice != typeNoticeBase,
nameAwardee = case_when(
nameAwardee == "NULL" ~ NA_character_,
nameAwardee == "VARIOUS" ~ "MULTIPLE",
TRUE ~ nameAwardee
),
isAward = !is.na(nameAwardee),
hasAwardeeDUNS = nameAwardee %>% str_detect("DUNS:") %>% coalesce(FALSE),
isActive = case_when(isActive == "YES" ~ TRUE,
TRUE ~ FALSE)
)
data <- data %>%
separate(
typeSetAside,
sep = "\\(FAR",
into = c("typeSetAside", "codeFAR"),
extra = "drop",
fill = "right"
) %>%
mutate_at(c("typeSetAside", "codeFAR"),
list(function(x) {
x %>% str_remove_all("\\(|\\)") %>%
str_remove_all("\\HUBZONE|\\IEE|\\EDWOSB|\\SDVOSB") %>%
str_squish()
})) %>%
mutate(
hasSetAside = !is.na(typeSetAside) %>% coalesce(FALSE),
isSoleSource = typeSetAside %>% str_detect("SOLE SOURCE") %>% coalesce(FALSE)
)
df_solicitation_groups <-
data %>%
filter(!is.na(idSolicitationGroup)) %>%
count(idSolicitationGroup) %>%
mutate(
char = nchar(idSolicitationGroup),
hasPSCExact = char > 2,
typePSC = case_when(
idSolicitationGroup %>% str_detect("[A-Z]") ~ "SERVICE",
TRUE ~ "PRODUCT"
),
codeProductService = idSolicitationGroup,
idSolicitationGroupActual = case_when(
typePSC == "PRODUCT" ~ idSolicitationGroup %>% substr(1, 2),
TRUE ~ idSolicitationGroup %>% substr(1, 1)
)
) %>%
select(-c(n, char)) %>%
left_join(
dictionary_psc_active() %>%
select(
codeProductService,
nameSolicitationGroup,
nameProductService
),
by = "codeProductService"
)
data <- data %>%
left_join(df_solicitation_groups, by = "idSolicitationGroup") %>%
select(-idSolicitationGroup) %>%
rename(idSolicitationGroup = idSolicitationGroupActual) %>%
select(one_of(names(data)), everything()) %>%
select(idNotice:idSolicitation,
names(
df_solicitation_groups %>% select(-idSolicitationGroupActual)
),
everything())
data <-
data %>%
mutate(
isActive = as.Date(datetimeResponse) > Sys.Date(),
dateResponse = as.Date(datetimeResponse),
countDaysToRespond = (as.Date(datetimeResponse) - Sys.Date()) %>% as.numeric(),
countDaysToRespond = case_when(is.na(countDaysToRespond) ~ 0,
TRUE ~ countDaysToRespond),
countDaysToRespond = pmax(0, countDaysToRespond, na.rm = T),
countDaysOnline = (Sys.Date() - as.Date(datetimePublished)) %>% as.integer(),
countDaysOpenToRespond = (as.Date(datetimeResponse) - as.Date(datetimePublished)) %>% as.numeric(),
countDaysOpenToRespond = pmax(0, countDaysOpenToRespond, na.rm = T),
urlOpportunityAPI = glue(
"https://beta.sam.gov/api/prod/opps/v2/opportunities/{idNotice}"
) %>% as.character(),
urlOpportunityAttachmentAPI = glue(
"https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources?excludeDeleted=false&withScanResult=false"
) %>% as.character(),
urlOpportunityAttachmentZIP = glue(
"https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources/download/zip"
) %>% as.character()
)
if (data %>% hasName("hasPSCExact")) {
data <- data %>%
mutate(hasPSCExact = hasPSCExact %>% coalesce(FALSE))
}
data <-
data %>%
mutate_at(c("nameSolicitation", "descriptionSolicitation"),
list(function(x) {
x %>% stringi::stri_enc_toascii() %>%
stri_unescape_unicode() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish()
}))
# df_parents <-
# data %>%
# group_by(nameSolicitation) %>%
# filter(datetimePublished == min(datetimePublished)) %>%
# dplyr::slice(1) %>%
# ungroup() %>%
# select(
# nameSolicitation,
# datetimePublishedInitial = datetimePublished,
# idSolicitationParent = idSolicitation
# ) %>%
# group_by(idSolicitationParent) %>%
# dplyr::slice(1) %>%
# ungroup()
#
# data %>%
# select(countDaysOnline,datetimeResponse) %>%
# mutate(isPossibleActive = is.na(datetimeResponse) & countDaysOnline %>% between(1, 365))
data <-
.clean_usg_organizations(data = data, column = "nameDepartment")
data <-
.clean_usg_organizations(data = data, column = "nameCommandSub")
data <-
data %>% .clean_usg_organizations("nameOffice")
data <-
data %>% fix_usg_organization_col(org_col = "nameDepartment")
data <-
data %>% fix_usg_organization_col(org_col = "nameOffice")
data <-
data %>% fix_usg_organization_col(org_col = "nameCommandSub")
data <- distinct(data)
if (exclude_awards) {
data <-
data %>%
filter(is.na(nameAwardee)) %>%
filter(is.na(dateAward))
}
if (only_active) {
if (include_unknown_responses) {
data <-
data %>%
filter(isActive | is.na(isActive))
} else {
data <-
data %>%
filter(isActive)
}
}
if (exclude_archived) {
data <-
data %>%
filter(dateArchive >= Sys.Date() | is.na(dateArchive))
}
df_descriptions <-
data %>%
filter(!is.na(descriptionSolicitation)) %>%
distinct(descriptionSolicitation) %>%
mutate(html = glue("<html>{descriptionSolicitation}</html>") %>% as.character())
descriptions <-
seq_along(df_descriptions$html) %>%
map_chr(function(x) {
df_descriptions$html[[x]] %>%
stri_enc_toascii() %>%
read_html() %>%
html_text() %>%
stringi::stri_trans_general("Latin-ASCII") %>%
stri_enc_toascii() %>% str_replace_all(" \032 ", " ") %>% str_to_upper() %>%
str_squish()
})
df_descriptions <-
df_descriptions %>%
mutate(descriptionSolicitationActual = descriptions) %>%
select(-html)
data <-
data %>%
left_join(df_descriptions, by = "descriptionSolicitation") %>%
select(-descriptionSolicitation) %>%
rename(descriptionSolicitation = descriptionSolicitationActual)
rm(df_descriptions)
rm(descriptions)
gc()
data <-
data %>%
.remove_na()
data <- data %>%
mutate(
namePrimaryContact = namePrimaryContact %>% stringi::stri_enc_toascii() %>%
stri_unescape_unicode() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish() %>%
str_to_upper()
)
df_naics <-
data %>%
filter(!is.na(idNAICS)) %>%
distinct(idNAICS)
dict_naics <-
dictionary_naics_codes() %>%
group_by(idNAICS) %>%
slice(1) %>%
ungroup() %>%
mutate(idNAICS = as.numeric(idNAICS))
data <-
data %>%
mutate(idNAICS = as.numeric(idNAICS)) %>%
left_join(dict_naics, by = "idNAICS") %>%
group_by(idNotice) %>%
slice(1) %>%
ungroup()
if (join_address) {
data <-
data %>%
build_address()
}
data <- data %>%
mutate(hasResponseDate = !is.na(dateResponse),
hasAwardID = !is.na(idAward),
idSolicitationClean = idSolicitation %>% str_remove_all("\\-|\\_|\\(|\\)"))
if (snake_names) {
data <-
data %>%
clean_names()
}
data
}
# historic_contracts ------------------------------------------------------
#' Historic USA Spending extracts
#'
#' Links for annual
#' contract solicitations by
#' year.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dictionary_usa_spending_contract_archive()
dictionary_usa_spending_contract_archive <-
memoise::memoise(function() {
json <-
"https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1578940065681&domain=Contract%20Opportunities/Archived%20Data" %>%
fromJSON(simplifyDataFrame = T)
df <- json[[1]][[1]]
urlCSV <- df[["_links"]]$self$href %>% map_chr(URLencode)
data <-
df[, 1:4] %>%
as_tibble() %>%
select(1:2) %>%
setNames(c("slugFile", "dateModified")) %>%
mutate(dateModified = mdy(dateModified)) %>%
mutate(yearData = slugFile %>% substr(3, 6) %>% as.numeric()) %>%
select(yearData, everything()) %>%
mutate(urlCSV)
data
})
.parse_contract_archive_url <-
function(url = "https://s3.amazonaws.com/falextracts/Contract Opportunities/Archived Data/FY2012_archived_opportunities.csv",
join_address = T) {
data <-
url %>%
URLencode() %>%
vroom()
data <-
data %>%
.munge_biz_opps_names()
upper_cols <-
data %>% select_if(is.character) %>% select(-matches("email|idNotice|url")) %>% names()
lower_cols <-
data %>% select_if(is.character) %>% select(matches("email|url")) %>% names()
data <-
data %>%
mutate_at(upper_cols,
list(function(x) {
x %>%
stringi::stri_enc_toascii() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish() %>%
str_to_upper() %>%
str_remove_all("\\_|\\~|--") %>%
# stri_unescape_unicode() %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
}))
data <- data %>%
mutate(
namePrimaryContact = namePrimaryContact %>%
stringi::stri_enc_toascii() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish() %>%
str_to_upper() %>%
str_remove_all("\\_|\\~|--") %>%
stri_unescape_unicode() %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
)
data <-
data %>%
mutate_at(lower_cols,
list(function(x) {
x %>%
stringi::stri_enc_toascii() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish() %>%
str_to_upper() %>%
str_remove_all("\\_|\\~|--") %>%
stri_unescape_unicode() %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
str_to_lower()
}))
data <-
data %>% mutate_if(is.character, str_squish)
if (data %>% hasName("zipcodeOrganization")) {
data <- data %>%
mutate(zipcodeOrganization = as.character(zipcodeOrganization))
}
if (data %>% hasName("nameSolicitation")) {
data <-
data %>%
mutate(
nameSolicitation = nameSolicitation %>% str_remove_all("^[A-Z]--") %>%
str_remove_all("^[0-9][0-9]--") %>%
str_squish() %>% str_to_upper()
)
}
date_cols <-
data %>%
select_if(is.character) %>%
select(matches("^datetime[A-Z]|^date[A-Z]")) %>% names()
data <- data %>%
mutate(amountAward = parse_number(as.character(amountAward)))
if (length(date_cols) > 0) {
data <-
data %>%
mutate_at(date_cols,
list(function(x) {
case_when(is.na(x) ~ NA_character_,
TRUE ~ x %>% substr(1, 10)) %>% ymd()
}))
}
date_cols <-
data %>% select(matches("^date")) %>% names()
df_years <-
data %>%
select(date_cols) %>%
transmute_at(tidyselect::all_of(date_cols), year)
names(df_years) <-
names(df_years) %>% str_replace_all("datetime", "year") %>% str_replace_all("date", "year")
data <-
data %>%
bind_cols(df_years)
data <-
data %>%
mutate(
hasNoticeUpdates = typeNotice != typeNoticeBase,
isActive = case_when(isActive == "YES" ~ TRUE,
TRUE ~ FALSE)
) %>%
.remove_na()
if (data %>% hasName("nameAwardee")) {
data <- data %>%
mutate(
nameAwardee = case_when(
nameAwardee == "NULL" ~ NA_character_,
nameAwardee == "VARIOUS" ~ "MULTIPLE",
TRUE ~ nameAwardee
),
isAward = !is.na(nameAwardee),
hasAwardeeDUNS = nameAwardee %>% str_detect("DUNS:") %>% coalesce(FALSE)
)
}
if (data %>% hasName("typeSetAside")) {
data <-
data %>%
separate(
typeSetAside,
sep = "\\(FAR",
into = c("typeSetAside", "codeFAR"),
extra = "drop",
fill = "right"
) %>%
mutate_at(c("typeSetAside", "codeFAR"),
list(function(x) {
x %>% str_remove_all("\\(|\\)") %>%
str_remove_all("\\HUBZONE|\\IEE|\\EDWOSB|\\SDVOSB") %>%
str_squish()
})) %>%
mutate(
hasSetAside = !is.na(typeSetAside) %>% coalesce(FALSE),
isSoleSource = typeSetAside %>% str_detect("SOLE SOURCE") %>% coalesce(FALSE)
)
}
df_solicitation_groups <-
data %>%
filter(!is.na(idSolicitationGroup)) %>%
count(idSolicitationGroup) %>%
mutate(
char = nchar(idSolicitationGroup),
hasPSCExact = char > 2,
typePSC = case_when(
idSolicitationGroup %>% str_detect("[A-Z]") ~ "SERVICE",
TRUE ~ "PRODUCT"
),
codeProductService = idSolicitationGroup,
idSolicitationGroupActual = case_when(
typePSC == "PRODUCT" ~ idSolicitationGroup %>% substr(1, 2),
TRUE ~ idSolicitationGroup %>% substr(1, 1)
)
) %>%
select(-c(n, char)) %>%
mutate_at(c("codeProductService", "idSolicitationGroup"),
as.character) %>%
left_join(
dictionary_psc_active() %>%
select(
codeProductService,
nameSolicitationGroup,
nameProductService
),
by = "codeProductService"
)
data <-
data %>%
mutate_at(c("idSolicitationGroup"),
as.character) %>%
left_join(df_solicitation_groups, by = "idSolicitationGroup") %>%
select(-idSolicitationGroup) %>%
rename(idSolicitationGroup = idSolicitationGroupActual) %>%
select(one_of(names(data)), everything()) %>%
select(idNotice:idSolicitation,
names(
df_solicitation_groups %>% select(-idSolicitationGroupActual)
),
everything())
data <-
data %>%
mutate(
isActive = as.Date(datetimeResponse) > Sys.Date(),
dateResponse = as.Date(datetimeResponse),
countDaysToRespond = (as.Date(datetimeResponse) - Sys.Date()) %>% as.numeric(),
countDaysToRespond = case_when(is.na(countDaysToRespond) ~ 0,
TRUE ~ countDaysToRespond),
countDaysToRespond = pmax(0, countDaysToRespond, na.rm = T),
countDaysOnline = (Sys.Date() - as.Date(datetimePublished)) %>% as.integer(),
countDaysOpenToRespond = (as.Date(datetimeResponse) - as.Date(datetimePublished)) %>% as.numeric(),
countDaysOpenToRespond = pmax(0, countDaysOpenToRespond, na.rm = T),
urlOpportunityAPI = glue(
"https://beta.sam.gov/api/prod/opps/v2/opportunities/{idNotice}"
) %>% as.character(),
urlOpportunityAttachmentAPI = glue(
"https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources?excludeDeleted=false&withScanResult=false"
) %>% as.character(),
urlOpportunityAttachmentZIP = glue(
"https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources/download/zip"
) %>% as.character()
)
if (data %>% hasName("hasPSCExact")) {
data <- data %>%
mutate(hasPSCExact = hasPSCExact %>% coalesce(FALSE))
}
if (data %>% hasName("descriptionSolicitation")) {
data <-
data %>%
filter(!descriptionSolicitation %>% str_detect("New Action IEJ Amendment")) %>%
mutate_at(c("nameSolicitation", "descriptionSolicitation"),
list(function(x) {
x %>%
stringi::stri_enc_toascii() %>%
str_remove_all("\032") %>%
str_replace_all("\\|", " ") %>%
str_squish() %>%
str_remove_all("\\_|\\~|--") %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
}))
df_descriptions <-
data %>%
filter(!is.na(descriptionSolicitation)) %>%
distinct(descriptionSolicitation) %>%
mutate(html = glue("<html>{descriptionSolicitation}</html>") %>% as.character())
.parse_html_description_safe <-
possibly(.parse_html_description, NA)
descriptions <-
seq_along(df_descriptions$html) %>%
map_chr(function(x) {
x %>% message()
description <-
df_descriptions$html[[x]] %>%
.parse_html_description_safe()
description
})
df_descriptions <-
df_descriptions %>%
mutate(descriptionSolicitationActual = descriptions) %>%
select(-html)
data <-
data %>%
left_join(df_descriptions, by = "descriptionSolicitation") %>%
select(-descriptionSolicitation) %>%
rename(descriptionSolicitation = descriptionSolicitationActual) %>%
mutate(urlCSV = url)
rm(df_descriptions)
rm(descriptions)
}
data <-
.clean_usg_organizations(data = data, column = "nameDepartment")
data <-
.clean_usg_organizations(data = data, column = "nameCommandSub")
data <-
data %>% .clean_usg_organizations("nameOffice")
data <- distinct(data) %>%
.remove_na()
if (data %>% hasName("idNAICS")) {
dict_naics <-
dictionary_naics_codes() %>%
group_by(idNAICS) %>%
slice(1) %>%
ungroup()
data <-
data %>%
left_join(dict_naics, by = "idNAICS") %>%
group_by(idNotice) %>%
slice(1) %>%
ungroup()
}
if (join_address) {
data <- data %>%
build_address()
}
data <- data %>%
mutate(
hasResponseDate = !is.na(dateResponse),
hasAwardID = !is.na(idAward),
idSolicitationClean = idSolicitation %>% str_remove_all("\\-|\\_|\\(|\\)")
)
gc()
data
}
#' Parse vector of contract
#' archive urls
#'
#' @param urls vector of SAM bulk contract urls
#' @param snake_names if \code{TRUE} returns snaked names
#'
#' @return
#' @export
#'
#' @examples
parse_contract_archive_urls <-
function(urls = c(
"https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1970_archived_opportunities.csv",
"https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1980_archived_opportunities.csv",
"https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1998_archived_opportunities.csv"
),snake_names = F,
join_address = T) {
options(future.globals.maxSize = 500 * 1024 ^ 10)
all_data <-
urls %>%
map_dfr(function(url) {
parts <- url %>% str_split("/") %>% flatten_chr()
yearData <-
parts[length(parts)] %>% substr(3,6) %>%
as.numeric()
.parse_contract_archive_url(url = url, join_address = join_address) %>%
mutate(yearData) %>%
select(yearData, everything())
})
if (snake_names) {
all_data <-
all_data %>%
clean_names()
}
all_data
}
#' Bulk Download of SAM Bulk Contract Opportunities
#'
#' @param contract_years vector of contract years available from \link{dictionary_usa_spending_contract_archive}
#' @param snake_names if `true` snake names
#' @param join_address if true joins address
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
bulk_sam_contract_archives <-
function(contract_years = NULL,
snake_names = F,
join_address = T
) {
dict_archives <- dictionary_usa_spending_contract_archive()
if (length(contract_years) == 0) {
stop("Please Enter contract years")
}
if (contract_years %>% length() > 0) {
dict_archives <-
dict_archives %>%
filter(yearData %in% contract_years)
}
data <-
dict_archives$urlCSV %>%
parse_contract_archive_urls(snake_names = snake_names, join_address = join_address)
data
}
#' Cached SAM Contracts
#'
#' @param munge
#' @param snake_names
#'
#' @return
#' @export
#'
#' @examples
cached_active_sam_contracts <-
function(munge = T, snake_names = T) {
data <-
read_rda("https://asbcllc.com/r_packages/govtrackR/sheldon/data/active_contracts.rda")
if (munge) {
data <- data %>%
munge_lite(snake_names = snake_names)
}
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.