# https://www.export.gov/article?id=Consolidated-Screening-List
dictionary_bis_names <-
function(){
tibble(nameBIS = c("source", "entity_number", "type", "programs", "name", "title",
"addresses", "federal_register_notice", "start_date", "end_date",
"standard_order", "license_requirement", "license_policy", "call_sign",
"vessel_type", "gross_tonnage", "gross_registered_tonnage", "vessel_flag",
"vessel_owner", "remarks", "source_list_url", "alt_names", "citizenships",
"dates_of_birth", "nationalities", "places_of_birth", "source_information_url",
"ids",
"Classification",
"Name",
"Prefix",
"First",
"Middle",
"Last",
"Suffix",
"Address 1",
"Address 2",
"Address 3",
"Address 4",
"City",
"State / Province",
"Country",
"Zip Code",
"DUNS",
"Exclusion Program",
"Excluding Agency",
"CT Code",
"Exclusion Type",
"Additional Comments",
"Active Date",
"Termination Date",
"Record Status",
"Cross-Reference",
"SAM Number",
"CAGE",
"NPI",
"Creation_Date"
),
nameActual = c("nameSource", "idEntity", "typeEntity", "programBIS", "nameParty", "titleParty",
"addressesParty", "detailsFederalRegisterNotice",
"dateStart", "dateEnd",
"isStandardOrder", "detailsLicenseRequirement", "detailsLicensePolicy",
"slugCallSign",
"typeVesel", "amountGrossTonnage", "amountGrossTonnageRegistered", "flagVessel",
"ownerVessel", "detailsRemarks", "urlSources", "namesAlternative", "countriesCitizenship",
"detailsDatesOfBirth", "detailsNationalities", "detailsPlacesOfBirth", "urlSourceInformation",
"detailsIdentifiers",
"typeClassification",
"nameEntity",
"prefixEntity",
"nameFirst",
"nameMiddle",
"nameLast",
"suffixEntity",
"address1",
"address2",
"address3",
"address4",
"city",
"state",
"country",
"zipcode",
"idDUNS",
"typeExclusionProgram",
"slugAgencyExcluding",
"codeCauseTreatment",
"typeExclusion",
"commentsExclusion",
"dateActive",
"dateTermination",
"statusRecord",
"descriptionCrossReference",
"idSAM",
"slugCAGE",
"idNationalProvider",
"dateCreated"
)
)
}
.munge_bis_names <-
function(data) {
dict_names <- dictionary_bis_names()
fdps_names <-
names(data)
actual_names <-
fdps_names %>%
map_chr(function(name) {
df_row <-
dict_names %>% filter(nameBIS == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
data %>%
set_names(actual_names)
}
# entities ----------------------------------------------------------------
.dl_bis <-
memoise::memoise(function() {
data <-
"https://api.trade.gov/consolidated_screening_list/search.csv?api_key=OHZYuksFHSFao8jDXTkfiypO" %>%
data.table::fread(verbose = F, showProgress = FALSE) %>%
as_tibble()
data
})
#' Bureau of Industry and Security Entity List
#'
#' Consolidated list of entities and individuals that
#' may be banned from doing business in the United States or
#' with U.S. parties.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
bis_entities <-
function(snake_names = F){
data <- .dl_bis()
data <-.munge_bis_names(data)
data <- data %>%
separate(nameSource, into = c("nameSource", "nameAgencyResponsible"), sep = "\\ - ",extra = "merge", fill = "right") %>%
separate(nameSource, into = c("nameSource", "slugSource"), sep = "\\(",extra = "merge", fill = "right") %>%
mutate(slugSource = slugSource %>% str_remove_all("\\)")) %>%
mutate_if(is.character, str_trim)
data <-
data %>%
mutate_if(is.character,
list(function(x) {
ifelse(x == "", NA, x)
})) %>%
mutate(
dateStart = dateStart %>% str_replace_all("0097", "1997") %>% ymd(),
dateEnd = dateEnd %>% ymd()
)
data <- data %>%
.munge_data(clean_address = F)
data <- data %>%
mutate_at(c("amountGrossTonnage", "amountGrossTonnageRegistered"),list(function(x){
x %>% as.numeric() %>% formattable::comma(digits = 0)
}))
data <- data %>%
mutate(idRow = 1:n())
df_people <- data %>% filter(typeEntity == "INDIVIDUAL")
df_people <-
df_people %>% distinct(nameParty) %>%
separate(
nameParty,
sep = "\\, ",
into = c("nameLast", "nameFirst"),
remove = F,
extra = "merge"
) %>%
mutate(namePartyClean = case_when(
is.na(nameFirst) ~ nameLast,
TRUE ~ str_c(nameFirst, nameLast, sep = " ")
)) %>%
select(-c(nameFirst, nameLast)) %>%
left_join(df_people, by = "nameParty") %>%
select(-nameParty) %>%
rename(nameParty = namePartyClean) %>%
select(one_of(names(df_people)))
data <-
data %>%
filter(typeEntity != "INDIVIDUAL" | is.na(typeEntity)) %>%
bind_rows(df_people) %>%
arrange(idRow)
data <- data %>%
mutate_if(is.character,
list(function(x) {
x %>% str_replace_all("\\; ", "\\ | ")
}))
df_birthdays <- data %>%
distinct(idRow, detailsDatesOfBirth) %>%
separate_rows(detailsDatesOfBirth, sep = "\\| ") %>%
mutate_if(is.character, str_trim) %>%
mutate(detailsDatesOfBirth = case_when(
nchar(detailsDatesOfBirth) == 4 ~ glue("{detailsDatesOfBirth}-01-01"),
TRUE ~ detailsDatesOfBirth
)) %>%
filter(!is.na(detailsDatesOfBirth)) %>%
group_by(idRow) %>%
summarise(detailsDatesOfBirth = as.character(detailsDatesOfBirth) %>% str_c(collapse = " | ")) %>%
ungroup()
data <- data %>%
select(-detailsDatesOfBirth) %>%
left_join(df_birthdays, by = "idRow") %>%
select(names(data), everything())
data <- data %>%
mutate(
detailsLicensePolicy = case_when(
detailsLicensePolicy %>% str_detect("PRESUMPTION OF DENIAL") ~ "PRESUMPTION OF DENIAL",
detailsLicensePolicy %>% str_detect("CASE-BY-CASE BASIS") ~ "CASE-BY-CASE BASIS",
detailsLicensePolicy %>% str_detect("032746.5 OF THE EAR") ~ "032746.5 OF THE EAR",
TRUE ~ detailsLicensePolicy
)
)
data <- data %>%
mutate(
detailsLicenseRequirement =
case_when(
detailsLicenseRequirement %in% c(
"FOR ALL ITEMS SUBJECT TO THE EAR .",
"FOR ALL ITEMS SUBJECT TO THE EAR. .",
"FOR ALL ITEMS SUBJECT TO THE EAR",
"FOR ALL ITEMS SUBJECT TO THE EAR."
) ~ "ITEMS SUBJECT TO EAR",
TRUE ~ detailsLicenseRequirement
)
)
if (snake_names){
data <- clean_names(data)
}
data
}
#' BIS Denied Persons List
#'
#' @param clean_address
#'
#' @return
#' @export
#'
#' @examples
bis_denied_persons <-
memoise::memoise(function(clean_address = T) {
data <- "https://www.bis.doc.gov/dpl/dpl.txt" %>% read_tsv()
data <- data %>% clean_names()
data <- data %>%
rename(
party = name,
is_standard_order = standard_order,
type_action = action,
zipcode = postal_code,
date_last_updated = last_update,
date_effective = effective_date,
date_expiration = expiration_date
) %>%
munge_data(clean_address = clean_address) %>%
mutate(date_data = Sys.Date()) %>%
clean_names()
data
})
# sam_entities ------------------------------------------------------------
#' SAM Entitiy URLS
#'
#' @return
#' @export
#'
#' @examples
dictionary_sam_entities <-
memoise::memoise(function() {
page <- "https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf" %>%
read_html()
nodes <- page %>% html_nodes(".overrideResetCSS+ .overrideResetCSS b~ a")
urls <-
nodes %>% html_attr("href") %>%
str_c("https://www.sam.gov",.)
files <-
nodes %>% html_text() %>% str_to_upper()
julian_dates <-
urls %>%
str_remove_all(".ZIP") %>%
str_split("MONTHLY_") %>%
map_chr(function(x){
x[[2]]
}) %>%
as.numeric()
tibble(
dateData = ymd(julian_dates),
nameFile = files,
urlZIP = urls
) %>%
mutate(typeFile = "SAM BULK",
isASCII = nameFile %>% str_detect("ASCII"))
})
.parse_sam_entity_zip <- function(url = "https://www.sam.gov/SAM/extractfiledownload?role=SAM-PUBLIC-UTF8&version=SAM&filename=SAM_PUBLIC_UTF-8_MONTHLY_20200607.ZIP") {
outfile <- tempfile("download", fileext = ".zip")
file <- download.file(url, outfile)
unz_files <- outfile %>% unzip(exdir = "zip")
data <-
unz_files %>%
fread()
unz_files %>% unlink()
file %>% unlink()
outfile %>% unlink()
unlink("zip", recursive = T, force = T)
data <-
data %>%
select(-150)
actual_names <- .dictionary_sam_extract_names() %>% pull(nameColumn)
data <- data %>% setNames(actual_names)
data
}
.sam_bulk_entities <-
function(clean_address = T, return_message =T) {
df_urls <-
dictionary_sam_entities()
df_urls <- df_urls %>% filter(!isASCII)
url <- df_urls$urlZIP
dateData <- df_urls$dateData
data <- .parse_sam_entity_zip(url = url)
data <- data %>%
mutate(idDUNS = as.integer(idDUNS))
data <-
data %>% mutate_if(is.character,
list(function(x) {
x %>%
str_squish()
}))
data <-
data %>%
mutate_if(is.character,
list(function(x) {
case_when(x %in% c("", " ") ~ NA_character_,
TRUE ~ x)
}))
logical_cols <-
data %>% select(matches("^is|^has")) %>% names()
data <- data %>%
mutate_at(logical_cols,
list(function(x) {
case_when(x == "N" ~ FALSE,
x == "Y" ~ TRUE,
TRUE ~ NA)
}))
data <-
data %>%
left_join(dictionary_sam_parent_entity_types(), by = "slugEntityStructure")
data <-
data %>%
mutate(slugSAMExtract = as.character(slugSAMExtract)) %>%
left_join(dictionary_sam_extract_types(), by = "slugSAMExtract")
data <- data %>%
left_join(dictionary_sam_registration_purpose_codes(), by = "slugRegistrationPurpose")
data <- data %>%
mutate(isNonPublicSAMRegistered = case_when(slugPublicDisplay == "NPDY" ~ TRUE,
TRUE ~ FALSE)) %>%
select(-slugPublicDisplay)
data <-
data %>%
mutate(isExpiredSAMEntity = typeSAMExtract == "EXPIRED") %>%
select(idDUNS, isExpiredSAMEntity, everything())
df_types <-
data %>%
filter(!is.na(dataBusinessTypes)) %>%
select(idDUNS, dataBusinessTypes) %>%
separate_rows("dataBusinessTypes", sep = "~") %>%
rename(slugBusinessType = dataBusinessTypes) %>%
filter(!is.na(slugBusinessType))
df_types <-
df_types %>%
left_join(dictionary_sam_business_types(), by = "slugBusinessType") %>%
nest(dataBusinessTypes = c(slugBusinessType, typeBusinessSAM))
data <-
data %>%
select(-dataBusinessTypes) %>%
left_join(df_types, by = "idDUNS") %>%
select(one_of(names(data)), everything())
rm(df_types)
gc()
df_response_codes <-
data %>%
filter(!is.na(dataDisasterResponse)) %>%
select(idDUNS, dataDisasterResponse) %>%
rename(slugDisasterResponseCode = dataDisasterResponse) %>%
separate_rows(slugDisasterResponseCode, sep = "\\~") %>%
filter(!is.na(slugDisasterResponseCode)) %>%
mutate(
typeAreaResponse = case_when(
slugDisasterResponseCode == "ANY" ~ "ANY AREA",
slugDisasterResponseCode %>% str_detect("^STA") ~ "STATE",
slugDisasterResponseCode %>% str_detect("^CTY") ~ "COUNTY",
slugDisasterResponseCode %>% str_detect("^MSA") ~ "METROPOLITAN SERVICE AREA"
),
codeAreaResponse =
case_when(
slugDisasterResponseCode == "ANY" ~ NA_character_,
TRUE ~ substr(slugDisasterResponseCode,
4,
nchar(slugDisasterResponseCode))
)
)
df_response_codes <-
df_response_codes %>%
nest(
dataDisasterResponse = c(slugDisasterResponseCode,
typeAreaResponse,
codeAreaResponse)
)
data <-
data %>%
select(-dataDisasterResponse) %>%
left_join(df_response_codes, by = "idDUNS")
rm(df_response_codes)
gc()
dict_naics <-
dictionary_naics_codes()
data <-
data %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
data <-
data %>%
left_join(dict_naics %>% select(idNAICSPrimary = idNAICS, nameNAICSPrimary = nameNAICS),
by = "idNAICSPrimary") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics <-
data %>%
select(idDUNS, idNAICSPrimary, dataNAICS) %>%
filter(!is.na(dataNAICS)) %>%
separate_rows(dataNAICS, sep = "\\~") %>%
rename(codeNAICS = dataNAICS) %>%
filter(!is.na(codeNAICS)) %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics <-
df_naics %>%
mutate(hasSmallBusinessIndicator = codeNAICS %>% str_detect("Y$")) %>%
mutate(idNAICS = parse_number(codeNAICS),
isPrimaryNAICS = idNAICS == idNAICSPrimary) %>%
select(idDUNS, isPrimaryNAICS, idNAICS, hasSmallBusinessIndicator)
df_naics <-
df_naics %>%
left_join(dict_naics, by = "idNAICS")
df_naics <- df_naics %>% select(-yearCodeBookNAICS)
data <-
data %>% select(-dataNAICS) %>%
left_join(df_naics %>%
nest(
dataNAICS = c(
isPrimaryNAICS,
idNAICS,
hasSmallBusinessIndicator,
nameNAICS,
isPrimaryNAICS
)
), by = "idDUNS") %>%
distinct() %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
df_naics_exceptions <-
data %>%
select(idDUNS, dataNAICSExceptions) %>%
filter(!is.na(dataNAICSExceptions)) %>%
separate_rows(dataNAICSExceptions, sep = "\\~") %>%
rename(slugNAICS = dataNAICSExceptions) %>%
mutate_if(is.character, str_squish) %>%
filter(slugNAICS != "") %>%
mutate(idNAICS = slugNAICS %>% substr(1, 6) %>% as.integer()) %>%
mutate(isException = T) %>%
left_join(dict_naics %>% select(idNAICS, nameNAICS), by = "idNAICS")
df_naics_exceptions <-
df_naics_exceptions %>%
mutate(
slugExceptions = slugNAICS %>% str_remove_all("[0-9]"),
countExceptions = slugExceptions %>% nchar()
) %>%
nest(
dataNAICSExceptions = c(
slugNAICS,
idNAICS,
isException,
nameNAICS,
slugExceptions,
countExceptions
)
)
data <-
data %>%
select(-dataNAICSExceptions) %>%
left_join(df_naics_exceptions, by = "idDUNS") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
rm(df_naics)
rm(dict_naics)
gc()
dict_psc <-
dictionary_psc_active(only_active = T)
df_psc <-
data %>%
filter(!is.na(dataProductServiceCodes)) %>%
select(idDUNS, dataProductServiceCodes) %>%
separate_rows(dataProductServiceCodes, sep = "\\~") %>%
rename(codeProductService = dataProductServiceCodes)
df_psc <-
df_psc %>%
filter(!codeProductService == "") %>%
mutate(
letternumber = codeProductService %>% substr(1, 1),
isNumber = letternumber %>% str_detect("[0-9]")
) %>%
mutate(
idSolicitationGroup = case_when(
isNumber ~ codeProductService %>% substr(1, 2),
TRUE ~ codeProductService %>% substr(1, 1)
)
) %>%
select(-c(letternumber, isNumber))
dict_psc <-
dict_psc %>% select(typePSC,
codeProductService,
nameProductService)
df_psc <-
df_psc %>%
left_join(dict_psc, by = c("codeProductService"))
df_psc <-
df_psc %>%
nest(
dataProductServiceCodes = c(
typePSC,
nameProductService,
idSolicitationGroup,
codeProductService
)
)
data <-
data %>%
select(-dataProductServiceCodes) %>%
left_join(df_psc, by = "idDUNS") %>%
group_by(idDUNS) %>%
slice(1) %>%
ungroup()
if (data %>% hasName("urlCompany")) {
data <- data %>%
mutate(
urlCompany = case_when(
urlCompany %>% str_detect("http:|https:") ~ urlCompany,
is.na(urlCompany) ~ NA_character_,
TRUE ~ glue("https://{urlCompany}") %>% as.character()
)
)
}
rm(df_psc)
gc()
df_sba <-
data %>%
filter(!is.na(dataSBACodes)) %>%
select(idDUNS, dataSBACodes) %>%
separate_rows(dataSBACodes, sep = "\\~") %>%
rename(codeSetAside = dataSBACodes)
df_sba <- df_sba %>%
mutate(
typeSetAside = case_when(
codeSetAside == "XX" ~ "HUBZONE CERTIFIED",
codeSetAside %>% str_detect("^A6") ~ "8A PARTICPANT",
codeSetAside %>% str_detect("^JT") ~ "8A JOINT VENTURE"
),
slugSetAside = codeSetAside %>% substr(1, 2)
)
df_sba <- df_sba %>%
mutate(
dateExpirationSBA = case_when(
slugSetAside == "XX" ~ NA_character_,
TRUE ~ codeSetAside %>% substr(3, nchar(codeSetAside))
) %>% as.numeric() %>% ymd()
) %>%
select(-codeSetAside)
df_sba <- df_sba %>%
nest(dataSBA = c(typeSetAside, slugSetAside, dateExpirationSBA))
data <-
data %>%
select(-dataSBACodes) %>%
left_join(df_sba, by = "idDUNS")
date_names <-
data %>%
select(matches("date")) %>%
select(-matches("dateCompanyStart")) %>%
names()
data <-
data %>%
mutate_at(date_names,
list(function(x) {
x %>% ymd()
}))
data <-
data %>%
mutate(
charDate = nchar(dateCompanyStart),
dateCompanyStart = as.integer(dateCompanyStart),
ymdDate = case_when(
charDate == 7 ~ dateCompanyStart %>% substr(2, 7) %>% as.integer(),
TRUE ~ dateCompanyStart
),
date = ymd(ymdDate),
year = year(date),
ymdDate = case_when(
year <= 1120 ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
idDUNS %in% c(
942278250,
807634428,
81559260,
54766010,
64402055,
78443143,
967454070
) ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
TRUE ~ as.character(ymdDate)
),
date = ymd(ymdDate),
year = year(date)
) %>%
select(-c(dateCompanyStart, year, charDate, ymdDate)) %>%
rename(dateCompanyStart = date) %>%
select(one_of(names(data)), everything())
data <-
data %>%
mutate_at(c("nameCompanyLegal",
"nameCompanyDivision",
"nameCompanyDBA"),
list(function(x) {
x %>% str_replace_all("\\, INC.|\\, INC|\\,INCORPORATED", " INC") %>%
str_replace_all("\\, LLC|\\, L.L.C|\\, L.L.C.", " LLC") %>%
str_replace_all("\\, LLP|\\, L.L.P|\\, L.L.P.", " LLP") %>%
str_replace_all("\\, LP|\\, L.P|\\, L.P.", " LP") %>%
str_replace_all("\\, LTD.|\\, LTD", " LTD") %>%
str_replace_all("\\, COMPANY", " COMPANY") %>%
str_replace_all("\\,CORP", " CORP") %>%
str_remove_all("\\*") %>%
str_remove_all('\\"')
}))
data <-
data %>%
mutate(
countDaysPostFoundingToSAMRegistration = (dateActivation - dateCompanyStart) %>% as.integer(),
countDaysExistence = (Sys.Date()-dateCompanyStart) %>% as.integer()
)
lower_names <-
data %>% select_if(is.character) %>%
select(matches("^email|^url")) %>%
names()
data <- data %>%
mutate_at(lower_names, str_to_lower)
upper_names <-
data %>% select_if(is.character) %>% select(-one_of(lower_names)) %>% names()
data <-
data %>%
mutate_at(upper_names, str_to_upper)
data <-
data %>%
mutate(
nameContactGovernment = case_when(
!is.na(nameFirstPointOfContactGovt) &
!is.na(nameLastPointOfContactGovt) ~ str_c(
nameFirstPointOfContactGovt,
nameLastPointOfContactGovt,
sep = " "
),
TRUE ~ NA_character_
),
nameContactGovernmentAlt = case_when(
!is.na(nameFirstPointOfContactGovtAlt) &
!is.na(nameLastPointOfContactGovtAlt) ~ str_c(
nameFirstPointOfContactGovtAlt,
nameLastPointOfContactGovtAlt,
sep = " "
),
TRUE ~ NA_character_
),
nameContactPastPerformance = case_when(
!is.na(nameFirstPointOfContactPastPerformance) &
!is.na(nameLastPointOfContactPastPerformance) ~ str_c(
nameFirstPointOfContactPastPerformance,
nameLastPointOfContactPastPerformance,
sep = " "
),
TRUE ~ NA_character_
),
nameContactElectronic = case_when(
!is.na(nameFirstPointOfContactElectronicBusiness) &
!is.na(nameLastPointOfContactElectronicBusiness) ~ str_c(
nameFirstPointOfContactElectronicBusiness,
nameLastPointOfContactElectronicBusiness,
sep = " "
),
TRUE ~ NA_character_
)
)
data <- data %>%
mutate(
typeEntity =
case_when(
typeEntityStructure == "OTHER" ~ "OTHER",
typeEntityStructure %>% str_detect("GOVERNMENT") ~ "GOVERNMENT",
TRUE ~ "CORPORATION"
),
isTaxExemptEntity = typeEntityStructure %in% c(
"FOREIGN COUNTRY OR GOVERNMENT",
"TAX EXEMPT CORPORATE ENTITY",
"UNITED STATES GOVERNMENT ENTITY"
)
)
data <- data %>%
mutate(
isForeignLocatedCompany = codeCountryCompany != "USA",
isForeignIncorporatedcompany = codeCountryIncorporation != "USA"
)
data <- data %>%
mutate(
hasBusinessTypes = dataBusinessTypes %>% map_dbl(length) > 0,
hasSBA = dataSBA %>% map_dbl(length) > 0,
hasProductServiceCodes = dataProductServiceCodes %>% map_dbl(length) > 0,
hasNAICS = dataNAICS %>% map_dbl(length) > 0,
hasDisasterResponse = dataDisasterResponse %>% map_dbl(length) > 0,
hasNAICSExceptions = dataNAICSExceptions %>% map_dbl(length) > 0
)
data <-
data %>%
mutate(dateSAMData = dateData) %>%
select(dateSAMData, everything())
if (clean_address) {
data <- data %>%
build_address(return_message = return_message)
}
data
}
#' Bulk SAM Entity Registration Data
#'
#' Downloads bulk SAM registered entities based on
#' user specifications and filters from SAM monthly release
#'
#'
#' @param only_active
#' @param sam_extract_filter
#' @param psc_filter
#' @param naics_filter
#' @param incorporation_company_filter
#' @param company_state_filter
#' @param company_zipcode_filter
#' @param company_country_filter
#' @param set_aside_filter
#' @param business_type_filter
#' @param return_message
#' @param snake_names
#' @param clean_address
#'
#' @return `tibble`
#' @export
#'
#' @examples
bulk_sam_entities <-
function(only_active = F,
sam_extract_filter = NULL,
psc_filter = NULL,
naics_filter = NULL,
incorporation_company_filter = NULL,
company_state_filter = NULL,
company_zipcode_filter = NULL,
company_country_filter = NULL,
set_aside_filter = NULL,
business_type_filter = NULL,
snake_names = F,
clean_address = T,
return_message = T) {
data <-
.sam_bulk_entities(clean_address = clean_address, return_message = return_message)
if (only_active) {
if (return_message) {
"Filtering for only active SAM companies" %>% message()
}
data <- data %>%
filter(!isExpiredSAMEntity)
}
if (length(psc_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a registered PSC:\n{str_c(psc_filters, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasProductServiceCodes) %>%
select(idDUNS, dataProductServiceCodes) %>%
unnest_legacy() %>%
filter(codeProductService %in% psc_filters) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(naics_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a registered naics:\n{str_c(naics_filters, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasNAICS) %>%
select(idDUNS, dataNAICS) %>%
unnest_legacy() %>%
filter(idNAICS %in% naics_filters) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(set_aside_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies with a set aside code of:\n{str_c(set_aside_filter, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasSBA) %>%
select(idDUNS, dataSBA) %>%
unnest_legacy() %>%
filter(slugSetAside %in% str_to_upper(set_aside_filter)) %>%
pull(idDUNS)
data <- .filter_duns(data = data, duns = filter_duns)
}
if (length(incorporation_company_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies domiciled in:\n{str_c(incorporation_company_filter, collapse = '\n')}"
) %>% message()
}
data <-
data %>%
filter(codeCountryIncorporation %in% str_to_upper(incorporation_company_filter))
}
if (length(company_country_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies head-quartered in:\n{str_c(company_country_filter, collapse = '\n')}"
) %>% message()
}
data <-
data %>%
filter(codeCountryCompany %in% str_to_upper(company_country_filter))
}
if (length(company_state_filter) > 0) {
if (return_message) {
glue(
"Filtering for companies headquarters in:\n{str_c(company_state_filter, collapse = '\n')}"
) %>% message()
}
data <- data %>%
filter(stateCompany %in% str_to_upper(company_state_filter))
}
if (length(company_zipcode_filter)) {
if (return_message) {
glue(
"Filtering for companies located in zipcode(s):\n{str_c(company_zipcode_filter, collapse = '\n')}"
) %>% message()
}
data <- data %>%
filter(zipcodeCompany %in% str_to_upper(as.character(company_zipcode_filter)))
}
if (length(business_type_filter) > 0) {
if (return_message) {
glue(
"Filtering for business types:\n{str_c(business_type_filter, collapse = '\n')}"
) %>% message()
}
filter_duns <-
data %>%
filter(hasBusinessTypes) %>%
select(idDUNS, dataBusinessTypes) %>%
unnest_legacy() %>%
filter(slugBusinessType %in% str_to_upper(business_type_filter)) %>%
pull(idDUNS) %>%
unique()
data <-
.filter_duns(data = data, duns = filter_duns)
}
name_cols <- data %>% select(matches("name")) %>% names()
data <- data %>%
mutate_at(name_cols,
list(function(x) {
x %>% str_replace_all(" OF, ", " OF ")
}))
if (data %>% hasName("slugDUNS")) {
data <- data %>% mutate(slugDUNS = as.character(slugDUNS))
}
data <- data %>%
mutate_if(is.numeric, as.numeric)
if (return_message) {
"Generating FAR and DFAR link urls" %>% message()
}
df_duns <- data %>%
distinct(idDUNS) %>%
mutate(char = nchar(idDUNS),
zeros = 9 - char)
df_duns <- df_duns %>%
filter(zeros != 0) %>%
mutate(
slugZero = zeros %>% map_chr(function(x) {
rep("0", times = x) %>% str_c(collapse = "")
}),
slugDUNS = glue("{slugZero}{idDUNS}") %>% as.character()
) %>%
select(idDUNS, slugDUNS) %>%
bind_rows(df_duns %>% filter(zeros == 0) %>%
select(idDUNS) %>%
mutate(slugDUNS = as.character(idDUNS))) %>%
mutate(
urlFARPDF = glue(
"https://www.sam.gov/SAM/filedownload?pdfType=1&duns={slugDUNS}"
) %>% as.character(),
urlDFARPDF = glue(
"https://www.sam.gov/SAM/filedownload?pdfType=2&duns={slugDUNS}"
) %>% as.character()
) %>%
select(-slugDUNS)
data <-
data %>%
left_join(df_duns, by = "idDUNS")
if (snake_names) {
data <-
janitor::clean_names(data)
}
data
}
# sam_exclusions ----------------------------------------------------------
### https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf
###
#' SAM Exclusion URLs
#'
#' List of the most recent SAM exclusion URLS
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_exclusion_urls()
dictionary_sam_exclusion_urls <-
function() {
page <- "https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf" %>%
read_html()
urls <- page %>% html_nodes("tr~ tr+ tr a") %>% html_attr("href") %>%
str_c("https://www.sam.gov",.)
julian_dates <- urls %>%
str_remove_all(".ZIP") %>%
str_split("Extract_") %>%
map_chr(function(x){
x[[2]]
}) %>%
as.numeric()
year <-
substr(julian_dates,1,2) %>% str_c("20") %>% as.numeric()
day_of_year <-
julian_dates %>% substr(3, nchar(julian_dates)) %>% as.numeric()
start_date <- glue("{year}-01-01") %>% ymd() %>% unique()
dates <- start_date + day_of_year
tibble(dateData = dates, urlZIP = urls) %>%
mutate(typeFile = "SAM EXCLUSIONS")
}
.parse_sam_exclusion_zip <- function(url = "https://www.sam.gov/SAM/extractfiledownload?role=WW&version=EPLSPUB&filename=SAM_Exclusions_Public_Extract_20050.ZIP") {
outfile <- tempfile("download", fileext = ".zip")
file <- download.file(url, outfile)
unz_files <- outfile %>% unzip(exdir = "zip")
data <-
unz_files %>% fread(showProgress = FALSE) %>% as_tibble()
data <-
data %>%
.munge_bis_names()
unz_files %>% unlink()
file %>% unlink()
unlink("attachments", recursive = T, force = T)
data %>% select(matches("date"))
data <- data %>%
mutate_at(c("dateTermination", "dateActive"),
mdy) %>%
mutate_at("dateCreated", ymd)
data <- data %>%
.munge_data(parse_dates = F, clean_address = F)
data <- data %>%
mutate(
idNationalProvider = as.numeric(idNationalProvider),
idNationalProvider = case_when(idNationalProvider == 0 ~ NA_real_,
TRUE ~ idNationalProvider),
namePerson = case_when(
namePerson == "" ~ NA_character_,
TRUE ~ namePerson
),
nameParty = case_when(
is.na(nameEntity) ~ namePerson,
TRUE ~ nameEntity
)
) %>%
select(typeClassification, nameParty, everything()) %>%
.remove_na()
data <-
data %>%
mutate(descriptionCrossReference = descriptionCrossReference %>% str_remove_all("^ALSO"),
urlZIP = url)
data
}
#' SAM Entity Exclusion data
#'
#' Returns information about people and entities
#' on the most recent SAM exclusion list
#'
#' @param snake_names
#'
#' @return
#' @export
#'
#' @examples
#' sam_exclusions()
sam_exclusions <-
function(snake_names = T, clean_address = T) {
df_urls <-
dictionary_sam_exclusion_urls()
df_today <-
df_urls %>%
filter(dateData == max(dateData))
data <-
df_today$urlZIP %>% .parse_sam_exclusion_zip()
data <- data %>%
left_join(df_today, by = "urlZIP")
data <- data %>% munge_data(snake_names = snake_names, clean_address = clean_address)
data
}
#' Join SAM data
#'
#' @param data
#' @param duns_column
#' @param snake_names
#' @param sam_select_columns
#'
#' @return
#' @export
#'
#' @examples
tbl_sam_data <-
function(data, duns_column = "id_duns",
sam_select_columns = c(
"dateSAMData",
"idDUNS",
"slugCAGE",
"nameCompanyLegalClean",
"nameCompanyLegal",
"isExpiredSAMEntity",
"slugDeptDefenseAddressCode",
"dateRegistrationInitial",
"dateExpiration",
"dateLastUpdate",
"dateActivation",
"dateCompanyStart",
"monthdayFiscalYearEnd",
"urlCompany",
"nameCompanyDBA",
"nameCompanyDivision",
"countSBATypes",
"typeEntityStructure",
"typeSAMExtract",
"typeRegistrationPurpose",
"isNonPublicSAMRegistered",
"nameNAICSPrimary",
"nameSectorNAICS",
"nameSubSectorNAICS",
"nameIndustryGroupNAICS",
"nameIndustryNAICS",
"nameContactGovernment",
"nameContactGovernmentAlt",
"nameContactPastPerformance",
"nameContactElectronic",
"typeEntity",
"isTaxExemptEntity",
"isForeignLocatedCompany",
"isForeignIncorporatedcompany",
"locationCompany",
"addressStreet1Company",
"addressStreet2Company",
"cityStateCompany",
"locationMailingCompany",
"cityStateMailingCompany",
"cityCompany",
"stateCompany",
"zipcodeCompany",
"zipcode4Company",
"codeCountryCompany",
"idCongressionalDistrictCompany",
"slugEntityStructure",
"stateIncorporation",
"codeCountryIncorporation",
"countBusinessTypes",
"idNAICSPrimary",
"countNAICS",
"countProductServiceCodes",
"emailPointOfContactGovt",
"dataBusinessTypes",
"dataDisasterResponse",
"dataNAICS",
"dataNAICSExceptions",
"dataProductServiceCodes",
"dataSBA",
"urlFARPDF",
"urlDFARPDF"
),
snake_names = T) {
if (snake_names) {
data <- data %>% clean_names()
duns_column <- make_clean_names(duns_column)
}
matched_duns <-
data %>%
distinct(!!sym(duns_column)) %>%
filter(!is.na(!!sym(duns_column))) %>%
pull()
df_sam <- bulk_sam_entities()
tbl_sam <-
df_sam %>% filter(idDUNS %in% matched_duns) %>%
select(one_of(sam_select_columns), matches("^data"))
if (snake_names) {
tbl_sam <- tbl_sam %>%
clean_names()
if (tbl_sam %>% hasName("data_naics")) {
df_naics <-
tbl_sam %>%
select(id_duns, data_naics) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_naics = data)
tbl_sam <- tbl_sam %>%
select(-data_naics) %>%
left_join(df_naics, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (tbl_sam %>% hasName("data_naics_exceptions")) {
df_naics_ex <-
tbl_sam %>%
select(id_duns, data_naics_exceptions) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_naics_exceptions = data)
tbl_sam <- tbl_sam %>%
select(-data_naics_exceptions) %>%
left_join(df_naics_ex, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (tbl_sam %>% hasName("data_disaster_response")) {
df_response <-
tbl_sam %>%
select(id_duns, data_disaster_response) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_disaster_response = data)
tbl_sam <- tbl_sam %>%
select(-data_disaster_response) %>%
left_join(df_response, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (tbl_sam %>% hasName("data_product_service_codes")) {
df_psc <- tbl_sam %>%
select(id_duns, data_product_service_codes) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_product_service_codes = data)
tbl_sam <- tbl_sam %>%
select(-data_product_service_codes) %>%
left_join(df_psc, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (tbl_sam %>% hasName("data_business_types")) {
df_bt <-
tbl_sam %>%
select(id_duns, data_business_types) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_business_types = data)
tbl_sam <-
tbl_sam %>%
select(-data_business_types) %>%
left_join(df_bt, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (tbl_sam %>% hasName("data_sba")) {
df_sba <-
tbl_sam %>%
select(id_duns, data_sba) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_sba = data)
tbl_sam <-
tbl_sam %>%
select(-data_sba) %>%
left_join(df_sba, by = "id_duns") %>%
select(-matches("data"), everything())
}
}
data_cols <-
data %>% select(-matches(duns_column)) %>% names()
if (length(data_cols) > 0) {
remove_cols <-
data_cols[data_cols %in% names(tbl_sam)]
if (length(remove_cols) > 0) {
data <-
data %>% select(-one_of(remove_cols))
}
}
data <-
data %>%
left_join(tbl_sam, by = duns_column)
data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.