.build_folder <-
function(path = "Desktop/abresler.github.io/trelliscopes/jinkie/otr/kaute") {
oldwd <- getwd()
setwd("~")
folder_exists <-
dir.exists(paths = path)
if (folder_exists) {
setwd(oldwd)
return(invisible())
}
parts <- path %>% str_split("/") %>% flatten_chr()
seq_along(parts) %>%
map(function(x) {
if (x == 1) {
directory <- parts[x]
if (!dir.exists(directory)) {
dir.create(directory)
}
return(invisible())
}
directory <- parts[1:x] %>% str_c(collapse = '/')
if (!dir.exists(directory)) {
dir.create(directory)
}
return(invisible())
})
setwd(oldwd)
return(invisible())
}
.fix_bad_sbir <-
function(data) {
data <- data %>%
mutate(
idContract = case_when(
idContract == "H9222217P0004" ~ "H9222218P0004",
idContract == "W911SR17P0022" ~ "W911SR18P0022",
TRUE ~ idContract
)
)
if (data %>% hasName("idContractResolved")) {
data <- data %>%
mutate(
idContractResolved = case_when(
idContractResolved == "H9222217P0004" ~ "H9222218P0004",
idContractResolved == "W911SR17P0022" ~ "W911SR18P0022",
TRUE ~ idContractResolved
)
)
}
data
}
.distinct_sbir_cols <-
function(data) {
cols <-
tibble(column = names(data)) %>%
mutate(idColumn = 1:n()) %>%
group_by(column) %>%
filter(idColumn == max(idColumn)) %>%
ungroup() %>%
pull(idColumn)
data <-
data[, cols]
data
}
.parse_soliciation_descriptions <-
function(description) {
tibble(description) %>%
separate(
col = "description",
extra = "merge",
into = c("item", "value"),
sep = "\\: "
) %>%
filter(!is.na(value)) %>%
mutate()
}
#' SBIR name dictionary
#'
#' @return
#' @export
#'
#' @examples
dictionary_sbir_names <-
function() {
tibble(
nameSBIR = c(
"Agency",
"Branch",
"Contract",
"Agency Tracking Number",
"Amount",
"Phase",
"Program",
"Awards Year",
"Solicitation Year",
"Solicitation Topic Code",
"Solicitation Number",
"DUNS",
"HUBZone Owned",
"Woman Owned",
"Socially and Economically Disadvantaged",
"title",
"duns",
"address1",
"address2",
"city",
"state",
"zip",
"hubzone_owned",
"minority_owned",
"woman_owned",
"link",
"abstract",
"agency",
"program",
"phase",
"year",
"firm",
"reseach institution",
"description",
"source_url",
"close_date",
"release_date",
"its_field_open_date",
"status",
"Program | Phase | Year",
"Solicitation",
"Topic Number",
"NOTE",
"FY",
"AGENCY",
"FIRM NAME",
"TOPIC NUMBER",
"THIRD PARTY INVESTOR",
"THIRD PARTY INVESTMENT",
"NEW TOTAL PH2 AWARD AMOUNT",
"wosb_flag",
"sdb_flag",
"topic_year",
"sttr_ri_city",
"ta_bio_medical_flag",
"org_ph2_enhanced_sbir_proposals",
"submission_id",
"sttr_ri_name",
"topic_background",
"topic_description",
"ta_chem_bio_defense_flag",
"aw_program_type_name",
"pi_benefit",
"topic_renew_energy_flag",
"award_end_date",
"org_award_count_ph2",
"topic_title",
"org_award_count_ph1",
"topic_aq_program_name",
"sttr_ri_url",
"sub_direct2_phase2_flag",
"agency_name_abbr",
"pi_equivalent_work_flag",
"topic_phase2",
"org_ph2_sttr_proposals",
"topic_phase1",
"ta_info_systems_flag",
"topic_itar_restricted_flag",
"topic_phase3",
"org_zip_code",
"aw_topic_number",
"org_total_investment",
"org_name",
"org_state_name_abbr",
"org_total_sales",
"ph2_prev_awarded_proposal_number",
"manufacturing_type_name",
"id",
"org_poc_fullname",
"org_addr2",
"award_amount",
"org_addr1",
"org_emp_count_current",
"topic_keywords",
"ta_air_platform_flag",
"sub_vet_owned_flag",
"aw_fiscal_year",
"hub_zone_flag",
"org_duns_number",
"org_firstaward_yrph2",
"sttr_ri_zip",
"org_firstaward_yrph1",
"award_flag",
"topic_objective",
"org_geolat",
"org_sbir_proposals",
"fed_facility_flag",
"pi_technical_abstract",
"ta_nuclear_flag",
"sttr_ri_pct_of_work_firm",
"sttr_ri_work_criteria_flag",
"sttr_ri_addr2",
"sttr_ri_addr1",
"foreign_national_flag",
"animal_research_flag",
"num_of_employees",
"co_name",
"org_full_address",
"pm_name",
"org_ipo_flag",
"small_business_flag",
"aw_proposal_number",
"org_cage_code",
"org_revenue_last_year",
"ta_ground_sea_flag",
"org_website",
"org_ph2_sbir_proposals",
"org_geolong",
"org_year_founded",
"org_city",
"ta_electronics_flag",
"pi_equivalent_work_details",
"sttr_ri_poc_fullname",
"ta_weapons_flag",
"org_patent_count",
"ta_sensors_flag",
"org_ceo_email",
"org_ph2_enhanced_sttr_proposals",
"solicitation_name",
"org_percentage_revenue_from_sbir",
"org_ceo_full_name",
"org_sttr_proposals",
"org_num_ccr",
"submission_title",
"org_geo_latlong",
"ta_human_systems_flag",
"fpds_vendor_id",
"contract_number",
"submission_type",
"ta_battlespace_flag",
"award_date",
"topic_aqp_statement_of_interest",
"topic_id",
"corp_entity_flag",
"org_total_award",
"aw_select_flag",
"ta_materials_flag",
"org_emp_count_firstaward_ph2",
"ta_space_platforms_flag",
"pi_keywords",
"topic_rationale",
"ph2_prev_awarded_submission_id",
"org_certify_name",
"sdvosb_flag",
"_version_",
"score",
"org_narrative",
"award_comments",
"sba_id",
"aw_field_office",
"sttr_ri_type",
"sttr_ri_state_name_abbr",
"sttr_ri_full_address",
"SolicitationDropdown_String",
"SolicitationPreReleaseDate_String",
"SolicitationOpenDate_String",
"SolicitationCloseDate_String",
"SelectionsDueDate_String",
"AwardsDueDate_String",
"TopicStage1StartDate_String",
"TopicStage2StartDate_String",
"TopicStage3aStartDate_String",
"TopicStage3bStartDate_String",
"TopicStage3cStartDate_String",
"TopicStage3dStartDate_String",
"TopicStage4StartDate_String",
"TopicStage5StartDate_String",
"TopicStage6StartDate_String",
"TopicStage7StartDate_String",
"TopicStage8StartDate_String",
"TopicStage1EndDate_String",
"TopicStage2EndDate_String",
"TopicStage3aEndDate_String",
"TopicStage3bEndDate_String",
"TopicStage3cEndDate_String",
"TopicStage3dEndDate_String",
"TopicStage4EndDate_String",
"TopicStage5EndDate_String",
"TopicStage6EndDate_String",
"TopicStage7EndDate_String",
"TopicStage8EndDate_String",
"IsTopic_Stage1",
"IsTopic_Stage2",
"IsTopic_Stage3a",
"IsTopic_Stage3b",
"IsTopic_Stage3c",
"IsTopic_Stage3d",
"IsTopic_Stage4",
"IsTopic_Stage5",
"IsTopic_Stage6",
"IsTopic_Stage7",
"IsTopic_Stage8",
"IsSolicitation_PreRelease",
"NumberOfSecondsBeforeSolicitationClose",
"IsSolicitation_Open",
"IsSolicitation_Closed",
"IsSolicitation_SelectionsDue",
"IsSolicitation_AwardsDue",
"IsSolicitation_SITUS",
"IsSolicitation_SITIS_Admin_Open",
"AreTopicsEditable_SBIRManager",
"AreTopicsEditable_Component",
"IsPmCertifiedUploadOpen",
"IsComponentInstructionUploadOpen",
"IsComponentTopicSubmissionOpen",
"IsDirty",
"SolicitationId",
"FiscalYear",
"ProgramType",
"SolicitationName",
"OldSolicitationId",
"CreationDtime",
"DeleteFlag",
"HistoricalFlag",
"PmViewLockFlag",
"PmViewLockDtime",
"SubmissionLockFlag",
"SubmissionLockDtime",
"SitisFlag",
"TopicStage1StartDate",
"TopicStage1EndDate",
"TopicStage2StartDate",
"TopicStage2EndDate",
"TopicStage3aStartDate",
"TopicStage3aEndDate",
"TopicStage3bStartDate",
"TopicStage3bEndDate",
"TopicStage3cStartDate",
"TopicStage3cEndDate",
"TopicStage3dStartDate",
"TopicStage3dEndDate",
"TopicStage4StartDate",
"TopicStage4EndDate",
"TopicStage5StartDate",
"TopicStage5EndDate",
"TopicStage6StartDate",
"TopicStage6EndDate",
"TopicStage7StartDate",
"TopicStage7EndDate",
"TopicStage8StartDate",
"TopicStage8EndDate",
"SolicitationPreReleaseDate",
"SolicitationOpenDate",
"SolicitationCloseDate",
"SelectionsDueDate",
"AwardsDueDate",
"OutOfCycle",
"Solicitation:",
"Award Year",
"Award Start Date (Proposal Award Date)",
"Award End Date (Contract End Date)",
"Proposal Number",
"Proposal Title",
"Business Official",
"Email",
"Firm",
"Award Amount",
"State",
"Mission Directorate",
"Center",
"Year",
"TopicId",
"TopicNumber",
"TopicTitle",
"AgencyId",
"CommandId",
"PublishedQuestionCount",
"PublishedAnswerCount",
"TechAreaAirPlatform",
"TechAreaBattlespace",
"TechAreaBioMedical",
"TechAreaChemBioDefense",
"TechAreaElectronics",
"TechAreaGroundSea",
"TechAreaHumanSystems",
"TechAreaInfoSystems",
"TechAreaMaterials",
"TechAreaNuclear",
"TechAreaSensors",
"TechAreaSpacePlatforms",
"TechAreaWeapons",
"SitisAdditionalInfo",
"RT_Area_Microelectronics",
"RT_Area_Cybersecurity_EW",
"RT_Area_Quantum_Science",
"RT_Area_Directed_Energy",
"RT_Area_MachineLearning_AI",
"AC_Fully_Networked_C3",
"AC_Space",
"AS_AOR_Autonomy",
"AS_AOR_Hypersonic",
"AS_AOR_Nuclear_Modernization",
"RTAreaMicroelectronics",
"RTAreaCybersecurityEW",
"RTAreaQuantumScience",
"RTAreaDirectedEnergy",
"RTAreaMachineLearningAI",
"ACFullyNetworkedC3",
"ACSpace",
"ASAORAutonomy",
"ASAORHypersonic",
"ASAORNuclearModernization",
"AgencyName",
"CommandName",
"ProgramTypeName",
"ShowTpocEmail",
"ShowTpocName",
"ShowTpocPhone",
"urlDODSBIRTopicAPI",
"ApPocEmail",
"ApPocNameFirst",
"ApPocNameLast",
"ApProgramName",
"ApStatementOfInterest",
"Background",
"Description",
"DirectToPhase2",
"ItarText",
"Keywords",
"LastUpdateDtime",
"ManufacturingTypeString",
"Objective",
"Phase1",
"Phase2",
"Poc2Email",
"Poc2NameFirst",
"Poc2NameLast",
"Poc2OfficeSymbol",
"Poc3Email",
"Poc3NameFirst",
"Poc3NameLast",
"Poc3OfficeSymbol",
"Poc4Email",
"Poc4NameFirst",
"Poc4NameLast",
"Poc4OfficeSymbol",
"PocEmail",
"PocNameFirst",
"PocNameLast",
"PocOfficeSymbol",
"PriorTopicNumber",
"ProgramTypeString",
"Rationale",
"SitusAdditionalInfo",
"AwardCommandId",
"CreationUserId",
"DirectToPhaseIiFlag",
"ImportFlag",
"ItarRestrictFlag",
"LastUpdateUserId",
"ManufacturingType",
"MeetTopicCriteriaFlag",
"PrevApproveReviewFlag",
"PrevSubmitToReviewFlag",
"PriorSolicitationFlag",
"PriorTopicId",
"RenewableEnergyFlag",
"TopicStatus",
"ReferenceUploadFlag",
"TopicReferenceId",
"ReferenceIndex",
"ReferenceText",
"ReferenceCreateDtime",
"PocPhoneInfo",
"Topic",
"SBIR or STTR?",
"Company Name",
"Company Website"
),
nameActual = c(
"nameAgency",
"nameBranch",
"idContract",
"idAgencyTracking",
"amountContract",
"idPhase",
"idProgram",
"yearAward",
"yearSolicitation",
"groupSolicitation",
"idSolicitation",
"idDUNS",
"isHUBZoneOwned",
"isWomanOwned",
"isSociallyAndEconomicallyDisadvantaged",
"title",
"slugDUNS",
"addressStreet1",
"addressStreet2",
"city",
"codeState",
"zipcode",
"isSBACertifiedHUBZone",
"isMinorityOwned",
"isWomanOwned",
"urlSBIRAward",
"descriptionAward",
"nameAgency",
"nameProgram",
"idPhase",
"yearAward",
"nameCompany",
"nameResearchInstitution",
"descriptionSBIR",
"urlSBIR",
"dateClosed",
"dateReleased",
"dateFieldOpen",
"statusSBIR",
"programPhaseYear",
"slugSolicitation",
"slugTopic",
"noteSolication",
"yearFiscal",
"slugAgency",
"nameCompany",
"slugTopic",
"namesInvestorsThirdParty",
"amountInvestmentThirdParty",
"amountPhase2TotalNew",
"isWomanOwned",
"isSmallDisadvantagedBusiness",
"yearTopic",
"citySTTR",
"isBioMedicalTopic",
"isPhase2Enhanced",
"idSubmission",
"nameResearchInstitution",
"backgroundTopic",
"descriptionTopic",
"isBioChemicalDefenseTopic",
"typeProgram",
"descriptionPrincipalBenefit",
"isRenewableEnergy",
"datetimeAwardEnd",
"countCompanyPhase2Awards",
"titleTopic",
"countCompanyPhase1Awards",
"slugAcquisitionProgramName",
"urlSTTR",
"isDirectToPhase2",
"slugAgency",
"hasEquivalentWorkFlag",
"topicPhase2",
"idPhase2STTRProposal",
"topicPhase1",
"isInformationSystemTopic",
"isITARRestrictedTopic",
"topicPhase3",
"zipcodeCompany",
"groupSolicitation",
"amountInvestmentTotal",
"nameCompany",
"slugStateCompany",
"amountRevenueCompany",
"countPhase2Prior",
"typeManufacturing",
"id",
"nameFullContact",
"addressStreet2Company",
"amountContract",
"adressStreet1Company",
"countEmployeesCurrent",
"keywordsTopicSBIR",
"isAirPlatformTopic",
"isVetSmallDisadvantagedBusiness",
"yearFiscalAwarwd",
"isHubZoneCompany",
"slugDUNS",
"yearFirstPhase2Company",
"zipcodeSTTR",
"yearFirstPhase1Company",
"isAwarded",
"topicObjective",
"latitudeCompany",
"countSBIRProposals",
"isFedFacility",
"descriptionAward",
"isNuclearTopic",
"pctWorkSTTR",
"hasRIWorkSTTR",
"addressStreet2STTR",
"addressStreet1STTR",
"isForeignNational",
"hasAnimalResearch",
"countEmployees",
"nameTitleBusinessContact",
"addressFullCompany",
"nameTitlePrincipalInvestigator",
"hasIPO",
"isSmallBusiness",
"groupProposal",
"slugCageCompany",
"amountRevenueCompanyPriorYear",
"isGroundSeaTopic",
"urlCompany",
"countSBIRPhase2ApplicationsCompany",
"longitudeCompany",
"yearCompanyFounded",
"cityCompany",
"isElectronicsTopic",
"detailsEquivalentWork",
"nameFullPointOfContactSTTR",
"isWeaponsTopic",
"countPatentsCompany",
"isSensorsTopic",
"emailCEO",
"countPhase2EnhancedSTTRProposals",
"nameSolicitation",
"pctRevenueSBIR",
"nameFullCEO",
"countSTTRApplications",
"idCCR",
"titleSBIR",
"companyLatLon",
"isHumanSystemsTopic",
"idVendorFPDS",
"idContract",
"idPhaseSBIR",
"isBattleSpaceTopic",
"datetimeAward",
"topicAQPStatementOfInterest",
"idTopic",
"isCorporateEntity",
"amountAwardsTotalCompany",
"isAwardSelected",
"isMaterialsTopic",
"countEmployeesPhase2Initial",
"isSpaceTopic",
"keywordsSBIRCompany",
"descriptionTopicRationale",
"idSubmissionPhase2Prior",
"nameCompanyCertifier",
"isServiceDisabledOwnedSmallBusiness",
"keyVersion",
"scoreMatch",
"descriptionNarrativeAgency",
"commentsAward",
"idSBA",
"nameOffice",
"idPhaseSTTR",
"slugStateSTTR",
"addressFullSTTR",
"groupSolicitation",
"dateSolicitationPreRelease",
"dateSolicitationOpen",
"dateSolicitationClose",
"dateSelectionsDue",
"dateAwardsDue",
"dateTopicStage1Start",
"dateTopicStage2Start",
"dateTopicStage3aStart",
"dateTopicStage3bStart",
"dateTopicStage3cStart",
"dateTopicStage3dStart",
"dateTopicStage4Start",
"dateTopicStage5Start",
"dateTopicStage6Start",
"dateTopicStage7Start",
"dateTopicStage8Start",
"dateTopicStage1End",
"dateTopicStage2End",
"dateTopicStage3aEnd",
"dateTopicStage3bEnd",
"dateTopicStage3cEnd",
"dateTopicStage3dEnd",
"dateTopicStage4End",
"dateTopicStage5End",
"dateTopicStage6End",
"dateTopicStage7End",
"dateTopicStage8End",
"isTopicStage1",
"isTopicStage2",
"isTopicStage3a",
"isTopicStage3b",
"isTopicStage3c",
"isTopicStage3d",
"isTopicStage4",
"isTopicStage5",
"isTopicStage6",
"isTopicStage7",
"isTopicStage8",
"isSolicitationPreRelease",
"countSecondsBeforeSolicitationClose",
"isSolicitationOpen",
"isSolicitationClosed",
"isSolicitationSelectionsDue",
"isSolicitationAwardsDue",
"isSolicitationSITUS",
"isSolicitationSITISAdminOpen",
"isTopicEditableSBIRManager",
"isTopicEditableComponent",
"isPmCertifiedUploadOpen",
"isComponentInstructionUploadOpen",
"isComponentTopicSubmissionOpen",
"isDirty",
"idSolicitation",
"yearTopic",
"typeProgram",
"nameSolicitation",
"idSolicitationOld",
"datetimeCreated",
"hasDeleteFlag",
"hasHistoricalFlag",
"hasPmViewLockFlag",
"datetimeRemoveViewLock",
"hasSubmissionLockFlag",
"datetimeRemoveSubmissionLock",
"hasSitisFlag",
"TopicStage1StartDate",
"TopicStage1EndDate",
"TopicStage2StartDate",
"TopicStage2EndDate",
"TopicStage3aStartDate",
"TopicStage3aEndDate",
"TopicStage3bStartDate",
"TopicStage3bEndDate",
"TopicStage3cStartDate",
"TopicStage3cEndDate",
"TopicStage3dStartDate",
"TopicStage3dEndDate",
"TopicStage4StartDate",
"TopicStage4EndDate",
"TopicStage5StartDate",
"TopicStage5EndDate",
"TopicStage6StartDate",
"TopicStage6EndDate",
"TopicStage7StartDate",
"TopicStage7EndDate",
"TopicStage8StartDate",
"TopicStage8EndDate",
"SolicitationPreReleaseDate",
"SolicitationOpenDate",
"SolicitationCloseDate",
"SelectionsDueDate",
"AwardsDueDate",
"isOutOfCycle",
"groupSolicitation",
"yearAward",
"dateAward",
"dateAwardEnd",
"idSolicitation",
"nameAward",
"namePrincipal",
"emailPrincipal",
"nameCompany",
"amountContract",
"nameState",
"desciptionMissionDirectorate",
"codeCenter",
"yearAward",
"idTopic",
"idSolicitation",
"nameAward",
"idAgencySBIR",
"idCommandSBIR",
"countPublishedQuestion",
"countPublishedAnswer",
"isTechAreaAirPlatform",
"isTechAreaBattlespace",
"isTechAreaBioMedical",
"isTechAreaChemBioDefense",
"isTechAreaElectronics",
"isTechAreaGroundSea",
"isTechAreaHumanSystems",
"isTechAreaInfoSystems",
"isTechAreaMaterials",
"isTechAreaNuclear",
"isTechAreaSensors",
"isTechAreaSpacePlatforms",
"isTechAreaWeapons",
"descriptionSitisAdditionalInfo",
"isRestrictedAreaMicroelectronics",
"isRestrictedAreaCybersecurity",
"isRestrictedAreaQuantumScience",
"isRestrictedAreaDirectedEnergy",
"isRestrictedAreaMachineLearning",
"isACFullyNetworkedC3",
"isACSpace",
"isASAORAutonomy",
"isASAORHypersonic",
"isASAORNuclearModernization",
"isRTAreaMicroelectronics",
"isRTAreaCybersecurityEW",
"isRTAreaQuantumScience",
"isRTAreaDirectedEnergy",
"isRTAreaMachineLearningAI",
"isACFullyNetworkedC3",
"isACSpace",
"isASAORAutonomy",
"isASAORHypersonic",
"isASAORNuclearModernization",
"nameAgency",
"nameCommand",
"typeProgram",
"hasTPOCEmail",
"hasTPOCName",
"hasTPOCNPhone",
"urlDODSBIRTopicAPI",
"emailContactAP",
"nameFirstAP",
"nameLastAP",
"nameProgramAP",
"descriptionStatementOfInterestAP",
"backgroundTopic",
"descriptionTopic",
"descriptionDirectToPhase2",
"descriptionITAR",
"keywordsTopicSBIR",
"datetimeLastUpdated",
"typeManufacturing",
"topicObjective",
"topicPhase1",
"topicPhase2",
"emailContact2",
"nameFirstContact2",
"nameLastContact2",
"slugOfficeContact2",
"emailContact3",
"nameFirstContact3",
"nameLastContact3",
"slugOfficeContact3",
"emailContact4",
"nameFirstContact4",
"nameLastContact4",
"slugOfficeContact4",
"emailContact",
"nameFirstContact",
"nameLastContact",
"slugOfficeContact",
"idSolicitationPrior",
"typeProgramString",
"topicRationale",
"descriptionSitisAdditionalInfo",
"idCommandAward",
"idUserCreation",
"hasDirectToPhase2",
"hasImportFlag",
"isITARRestrictedTopic",
"idUserLastUpdated",
"idManufacturing",
"hasMeetTopicCriteriaFlag",
"hasPrevApproveReviewFlag",
"hasPrevSubmitToReviewFlag",
"hasPriorSolicitationFlag",
"idTopicPrior",
"hasRenewableEnergyFlag",
"idTopicStatus",
"hasReferenceUploadFlag",
"idTopicReference",
"numberReference",
"descriptionReference",
"datetimeReferenceCreated",
"telephoneContact",
"descriptionTopic",
"idProgram",
"nameCompany",
"urlCompany"
)
)
}
.munge_sbir_names <-
function(data) {
dict_names <- dictionary_sbir_names()
fdps_names <-
names(data)
actual_names <-
fdps_names %>%
map_chr(function(name) {
df_row <-
dict_names %>% filter(nameSBIR == name)
if (nrow(df_row) == 0) {
glue::glue("Missing {name}") %>% message()
return(name)
}
df_row$nameActual
})
data %>%
set_names(actual_names)
}
# api
#' SBIR Solicitations
#'
#'
#'
#' @param only_open if \code{TRUE} only active solicitations
#'
#' @return
#' @export
#'
#' @examples
#' sbir_solicitations_api()
sbir_solicitations_api <-
function(only_open = F) {
data <-
"https://www.sbir.gov/api/solicitations.json?keyword=sbir" %>%
fromJSON(simplifyDataFrame = T) %>%
as_tibble() %>%
.munge_sbir_names() %>%
rename(nameSolicitation = title) %>%
mutate_if(is.character, list(function(x) {
x %>% stringi::stri_trans_general("Latin-ASCII") %>% str_squish() %>%
str_remove_all(" & ") %>%
str_to_upper() %>% gsub("\\s+", " ", .)
}))
data <-
data %>%
mutate(
idSBIR = urlSBIRAward %>% str_to_lower() %>%
str_remove_all("https://www.sbir.gov/sbirsearch/detail/") %>% as.integer()
)
data <-
data %>% mutate_at(c("urlSBIRAward", "urlSBIR"), str_to_lower)
if (data %>% hasName("nameAgency")) {
df_agency <-
data %>% select(idSBIR, nameAgency) %>% mutate(hasAgency = nameAgency %>% map_dbl(length) > 0) %>% filter(hasAgency) %>% unnest() %>% group_by(idSBIR) %>% summarise(nameAgencies = str_c(nameAgency, collapse = " | "))
data <-
data %>% select(-nameAgency) %>%
left_join(df_agency, by = "idSBIR") %>%
select(nameAgencies, everything())
}
date_cols <- data %>% select(matches("date")) %>% names()
if (length(date_cols) > 0) {
data <- data %>%
mutate_at(date_cols,
list(function(x) {
as.POSIXct(x, origin = "1970-01-01", tz = "UTC") %>% as.Date()
}))
}
data <-
data %>%
.munge_data()
data <-
data %>%
mutate(isOpen = statusSBIR == "OPEN")
if (only_open) {
data <-
data %>%
filter(isOpen)
}
data
}
#' SBIR Funded Companies
#'
#' @param return_message if \code{TRUE} returns message
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' sbir_companies()
sbir_companies <-
function(return_message = T) {
data <-
"https://www.sbir.gov/api/firm.json" %>%
jsonlite::fromJSON(simplifyDataFrame = T) %>% as_tibble()
data <- data %>%
set_names(
c(
"nameCompany",
"slugDUNS",
"addressStreet1",
"addressStreet2",
"city",
"codeState",
"zipcode",
"isSBACertifiedHUBZone",
"isMinorityOwned",
"isWomanOwned"
)
) %>%
mutate(zipcode = ifelse(zipcode == "-", "", zipcode))
data <-
data %>%
unite(addressStreet,
addressStreet1,
addressStreet2,
sep = " ",
remove = F) %>%
unite(cityState,
city,
codeState,
sep = ", ",
remove = F) %>%
unite(cityStateZip,
cityState,
zipcode,
sep = " ",
remove = F) %>%
unite(locationCompany,
addressStreet,
cityStateZip,
sep = ", ") %>%
mutate(locationCompany = str_squish(locationCompany) %>% str_replace_all("\\ , ", "\\, ")) %>%
mutate_if(is.character, list(function(x) {
ifelse(x == "", NA, x)
})) %>%
mutate_all(list(function(x) {
x %>% stringi::stri_trans_general("Latin-ASCII") %>% str_squish() %>%
str_remove_all(" & ") %>%
str_to_upper() %>% gsub("\\s+", " ", .)
})) %>%
.munge_data(clean_address = F)
data <-
data %>%
group_by(idDUNS, nameCompany) %>%
dplyr::slice(1) %>%
ungroup() %>%
select(idDUNS, nameCompany, everything()) %>%
mutate(idDUNS = case_when(nchar(idDUNS) <= 3 ~ NA_real_,
TRUE ~ idDUNS))
if (return_message) {
glue("Acquired {nrow(data) %>% formattable::comma(digits = 0)} SBIR funded companies") %>% message()
}
data
}
.parse_sbir_firm_json <-
function(url = "https://www.sbir.gov/api/firm.json?state=VA") {
data <- url %>% fromJSON(simplifyDataFrame = T) %>% as_tibble() %>%
.munge_sbir_names()
data <- data %>%
mutate_if(is.character,
list(function(x) {
x %>% stringi::stri_trans_general("Latin-ASCII") %>% str_squish() %>%
str_remove_all(" & ") %>%
str_to_upper() %>% gsub("\\s+", " ", .)
})) %>%
unite(
locationCompany,
addressStreet1,
addressStreet2,
city,
codeState,
zipcode,
sep = " ",
remove = F
) %>%
.munge_data(clean_address = F)
}
.parse_sbir_agency_json <-
function(url = "https://www.sbir.gov/api/awards.json?keyword=mars&agency=DOE") {
data <-
url %>% fromJSON(simplifyDataFrame = T) %>%
as_tibble() %>% .munge_sbir_names()
if (data %>% hasName("title")) {
data <-
data %>%
rename(nameSBIR = title)
}
data <-
data %>%
mutate(
idSBIR = urlSBIRAward %>% str_remove_all("https://www.sbir.gov/sbirsearch/detail/") %>% as.integer()
)
if (data %>% hasName("nameAgency")) {
df_agency <-
data %>%
select(idSBIR, nameAgency) %>%
unnest() %>%
group_by(idSBIR) %>%
summarise(nameAgency = nameAgency %>% str_c(collapse = " | ")) %>%
ungroup()
data <-
data %>% select(-nameAgency) %>%
left_join(df_agency, by = "idSBIR")
}
date_cols <- data %>% select(matches("date")) %>% names()
if (length(date_cols) > 0) {
data <- data %>%
mutate_at(date_cols,
list(function(x) {
as.POSIXct(x, origin = "1970-01-01", tz = "UTC") %>% as.Date()
}))
}
data <-
data %>%
.munge_data() %>%
mutate(urlSBIRAPI = url)
data
}
#' SBIR Award count
#'
#' Returns SBIR award counts for
#' each SBIR company
#'
#' @return
#' @export
#'
#' @examples
#' sbir_company_award_count()
sbir_company_award_count <-
function() {
url <-
"https://www.sbir.gov/sbirsearch/firm/all?fDOTtitle=&duns=&city=&zip=&page=1&print=xls&per_page=5000000"
data <-
download_excel_file(url = url, has_col_names = T) %>%
set_names(
c(
"rankCompany",
"nameCompany",
"slugDUNS",
"countAwards",
"addressStreet1",
"addressStreet2",
"city",
"codeState",
"zipcode",
"urlCompany"
)
)
data <- data %>%
mutate(zipcode = ifelse(zipcode == "-", "", zipcode)) %>%
mutate_if(is.character, list(function(x) {
ifelse(is.na(x), "", x)
}))
data <-
data %>%
unite(addressStreet,
addressStreet1,
addressStreet2,
sep = " ",
remove = F) %>%
unite(cityState,
city,
codeState,
sep = ", ",
remove = F) %>%
unite(cityStateZip,
cityState,
zipcode,
sep = " ",
remove = F) %>%
unite(locationCompany,
addressStreet,
cityStateZip,
sep = ", ") %>%
mutate(locationCompany = str_squish(locationCompany) %>% str_replace_all("\\ , ", "\\, ")) %>%
mutate_if(is.character, list(function(x) {
ifelse(x == "", NA, x)
})) %>%
mutate_all(list(function(x) {
x %>% stringi::stri_trans_general("Latin-ASCII") %>% str_squish() %>%
str_remove_all(" & ") %>%
str_to_upper() %>% gsub("\\s+", " ", .)
})) %>%
.munge_data(clean_address = F)
data <-
data %>%
select(nameCompany, idDUNS, everything()) %>%
arrange(desc(countAwards)) %>%
select(-one_of("rankCompany")) %>%
mutate(countAwards = countAwards %>% comma(digits = 0))
data <-
data %>%
separate(
nameCompany,
extra = "merge",
into = c("nameCompany", "nameCompanyDBA"),
sep = " DBA |\\(DBA| DBA,| DBA$|DBA: \\.DBA"
) %>%
separate(nameCompany,
into = c("nameCompany", "nameCompanyPrior"),
sep = "FORMERLY") %>%
mutate_at(c("nameCompany", "nameCompanyPrior", "nameCompanyDBA"),
list(function(x) {
x %>%
gsub("\\(", " ", .) %>%
gsub("\\)", " ", .) %>%
str_remove_all("\\,$") %>%
str_squish()
})) %>%
select(idDUNS,
nameCompany,
nameCompanyDBA,
nameCompanyPrior,
everything())
data <- data %>%
mutate(idDUNS = case_when(nchar(idDUNS) <= 3 ~ NA_real_,
TRUE ~ idDUNS))
data <- data %>%
select(-one_of("cityState"))
data
}
#' SBIR Annual awards
#'
#' @return
#' @export
#'
#' @examples
sbir_annual_awards <-
memoise::memoise(function() {
data <-
"https://www.sbir.gov/analytics-dashboard/xls?view_by=Year&xls_table=year&dataid=SbirAnalyticDashboardSqlYear" %>%
download_excel_file(has_col_names = F)
data <-
data %>% dplyr::slice(3:nrow(data)) %>%
set_names(c(
"year",
"countAwards",
"countFirms",
"amountAwarded",
"amountObligated"
)) %>%
.munge_data() %>%
mutate(amountAwarded = ifelse(is.na(amountAwarded), amountObligated, amountAwarded)) %>%
select(-amountObligated) %>%
.munge_data()
data
})
#' SBIR Agency Awards
#'
#' @return
#' @export
#'
#' @examples
#' sbir_agency_awards()
sbir_agency_awards <-
memoise::memoise(function() {
data <-
"https://www.sbir.gov/analytics-dashboard/xls?view_by=Year&xls_table=agency&dataid=SbirAnalyticDashboardSqlAgency" %>%
download_excel_file(has_col_names = F)
data <-
data %>% dplyr::slice(3:nrow(data)) %>%
set_names(c(
"slugAgency",
"countAwards",
"countFirms",
"amountAwarded",
"amountObligated"
)) %>%
.munge_data() %>%
mutate(amountObligated = ifelse(is.na(amountObligated), 0, amountAwarded)) %>%
mutate(amountSpent = amountAwarded + amountObligated) %>%
select(-c(amountAwarded, amountObligated))
data
})
#' SBIR obligations
#'
#' Data ends in 2016
#'
#' @return
#' @export
#'
#' @examples
#' sbir_obligations()
sbir_obligations <-
memoise::memoise(function() {
data <-
"https://www.sbir.gov/awards/annual-reports/xls?xls_table=SBIR_obligation&dataid=SbirAnnualReportsSummarySqlYearSbir" %>% download_excel_file()
data <- data %>% dplyr::slice(3:nrow(data)) %>%
setNames(
c(
"year",
"amountObligated",
"amountBudget",
"amountPhase1Obligation",
"amountPhase2Obligation"
)
) %>% .munge_data() %>%
mutate(year = as.numeric(year))
data
})
.parse_sbir_award_url <-
function(url = "https://www.sbir.gov/sbirsearch/detail/1547673",
return_message = T) {
if (return_message) {
glue::glue("Parsing: {url}") %>% message()
}
page <- url %>% read_html()
parts <- url %>% str_split("/") %>%
flatten_chr()
idSBIR <-
parts[length(parts)] %>% as.numeric()
labels <-
page %>% html_nodes(".open-label") %>% html_text() %>% str_remove_all("\\:")
values <-
page %>% html_nodes(".open-description") %>% html_text()
business <-
page %>% html_nodes(".sbc-name-wrapper a") %>% html_text() %>% str_to_upper() %>% str_squish()
sub_text <-
page %>% html_nodes(".award-sub-description")
timeline <-
page %>% html_node(".timeline-info-wrapper")
address_business <-
page %>% html_nodes(".sbc-address-wrapper") %>% html_text() %>% str_squish()
abstract <-
page %>% html_nodes(".abstract-wrapper") %>% html_text() %>% str_split("\n") %>%
flatten_chr() %>%
str_squish() %>%
purrr::discard(function(x) {
x %in% c("", "Abstract")
}) %>%
str_c(collapse = "") %>%
str_squish()
solicitation <-
page %>% html_nodes(".page-header") %>% html_text() %>% unique() %>% .[[1]] %>% str_remove_all("&amp;amp;") %>% str_squish()
if (length(abstract) == 0) {
abstract <- solicitation
}
data <-
tibble(nameSBIR = labels, value = values) %>%
left_join(dictionary_sbir_names(), by = "nameSBIR")
if (data %>% filter(is.na(nameActual)) %>% nrow() > 0) {
missing <- data %>% filter(is.na(nameActual)) %>% pull(nameSBIR)
missing %>%
walk(function(word) {
glue("Missing {word}") %>% message()
})
data <- data %>%
mutate(nameActual = ifelse(is.na(nameActual), nameSBIR, nameActual))
}
data <- data %>%
select(nameActual, value)
col_order <- data$nameActual
data <-
data %>%
spread(nameActual, value) %>%
select(one_of(col_order), everything())
data <-
data %>%
mutate_at(
c("amountContract", "yearAward", "idDUNS", "yearSolicitation"),
list(readr::parse_number)
)
logical_cols <- names(data)[names(data) %in% c(
"isHUBZoneOwned",
"isWomanOwned",
"isSociallyandEconomicallyDisadvantaged"
)]
if (length(logical_cols) > 0) {
data <-
data %>%
mutate_at(
logical_cols,
list(function(x) {
ifelse(x == "Y", TRUE, FALSE)
})
)
}
df_contact <-
seq_along(sub_text) %>%
map(function(x) {
if (x %in% c(1, 2)) {
type <-
case_when(x == 1 ~ "Principal",
TRUE ~ "BusinessContact")
values <-
sub_text[x] %>% html_text() %>% str_split("Name:|Phone:|Email:|\n|Title:") %>%
flatten_chr() %>%
str_squish() %>%
discard(function(x) {
x == ""
}) %>%
str_remove_all("^ ")
values <- values %>% unique()
if (length(values) == 0) {
return(tibble())
}
name =
values[[1]]
if (name %>% str_remove_all("\\)|\\(|\\-|\\ |[0-9]") == "" &
(length(values) == 1)) {
return(tibble())
}
if (values %>% str_remove_all("\\)|\\(|\\-|\\ ") %>% readr::parse_number() %>% discard(is.na) %>% length() > 0) {
vals <-
values %>% str_remove_all("\\)|\\(|\\-|\\ ") %>% readr::parse_number()
remove_val <- vals[!vals %>% is.na()] %>% max()
rows <- tibble(vals) %>%
mutate(idRow = 1:n()) %>%
mutate(vals = ifelse(is.na(vals), 0 , vals)) %>%
filter(vals != remove_val) %>%
pull(idRow)
values <- values[rows]
}
if (name %>% str_detect("\\(")) {
name <-
name %>% str_split("\\(") %>% flatten_chr() %>% str_split("\\)") %>% flatten_chr() %>% str_squish() %>% str_c(collapse = " ")
}
if (values %>% str_count("\\@") %>% sum(na.rm = T) > 0) {
email <-
values[values %>% str_detect("@")] %>% str_c(collapse = " | ")
} else {
email <- NA_character_
}
telephone_no <-
values %>% grep("[0-9][0-9][0-9]-[0-9]", .)
if (length(telephone_no) == 0) {
telephone_no <- 0
}
if (telephone_no > 0) {
telephone <-
values[telephone_no]
} else {
telephone <- NA_character_
}
if (values[!values %in% c(email, name, telephone)] %>% length() > 0) {
title <-
values[!values %in% c(email, name, telephone)] %>%
str_remove_all("\\(|\\)|\\ -") %>%
discard(function(x) {
x == ""
})
if (length(title) == 0) {
title <- NA_character_
} else {
title <-
title %>%
.[[1]]
}
} else {
title <- NA_character_
}
items <- c("name", "email", "telephone", "title")
values <- c(name, email, telephone, title)
items <- items %>% str_c(type)
data <-
tibble(items, values, idSBIR) %>%
spread(items, values)
return(data)
}
text <-
sub_text[x] %>% html_text() %>% str_split("Name:|Phone:|Email:|\n|Type:|Name:|Contact:|Address:") %>%
flatten_chr() %>%
str_squish() %>%
discard(function(x) {
x == ""
}) %>%
unique()
if (length(text) == 1) {
text <-
text %>%
.[[1]]
if (text == "N/A") {
text <- NA_character_
}
df <- tibble(idSBIR, nameResearchInstitution = text)
return(df)
}
items <- c(
"nameResearchInstitution",
"nameContactResearchInstitution",
"addressResearchInstitution",
"telephoneResearchInstitution",
"typeResearchInstitution"
)
tibble(item = items[1:length(text)], value = text, idSBIR) %>%
spread(item, value) %>%
select(idSBIR, one_of(items))
}) %>%
discard(function(x) {
x %>% nrow() == 0
})
df_contact <- df_contact %>%
reduce(left_join, by = "idSBIR")
if (length(business) == 0) {
business <- "UNKNOWN"
}
data <-
data %>%
mutate(
nameAward = solicitation,
nameCompany = business,
addressCompany = address_business,
idSBIR,
descriptionAward = abstract
) %>%
select(idSBIR, idSolicitation, nameCompany, nameAward, everything()) %>%
left_join(df_contact, by = "idSBIR")
char_names <-
data %>% select_if(is.character) %>% select(-matches("url")) %>% names()
data <-
data %>%
mutate_if(is.character,
list(function(x) {
ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .)
}))
data <-
data %>% mutate_at(char_names, list(function(x) {
x %>% str_squish() %>% str_to_upper()
})) %>%
mutate(urlSBIRAward = url)
data
}
.parse_sbir_award_urls <-
function(urls = "https://www.sbir.gov/sbirsearch/detail/1547673",
sleep_time = NULL,
return_message = T) {
df <-
tibble()
success <- function(res) {
url <-
res$url
.parse_sbir_award_url_safe <-
possibly(.parse_sbir_award_url, tibble())
all_data <-
.parse_sbir_award_url_safe(url = url, return_message = T)
if (length(sleep_time) > 0) {
Sys.sleep(time = sleep_time)
}
df <<-
df %>%
bind_rows(all_data)
}
failure <- function(msg) {
tibble()
}
urls %>%
map(function(x) {
curl_fetch_multi(url = x, success, failure)
})
multi_run()
df
}
#' Parse SBIR URLS
#'
#' @param urls vector of URLs
#' @param sleep_time if not \code{NULL} sleep time between scrapes
#' @param return_message if \code{TRUE} return message
#' @param clean_entities if \code{TRUE} cleans entity data
#' @param snake_names if \code{TRUE} snakes names
#'
#' @return \code{data_frame}
#' @export
#'
#' @examples
#' parse_sbir_award_urls(urls = "https://www.sbir.gov/sbirsearch/detail/1584295")
parse_sbir_award_urls <-
function(urls = NULL,
sleep_time = NULL,
clean_entities = T,
snake_names = F,
return_message = F) {
if (length(urls) == 0) {
"Enter urls" %>% message()
return(invisible())
}
.parse_sbir_award_urls_safe <-
possibly(.parse_sbir_award_url, tibble())
data <-
urls %>%
future_map_dfr(function(url) {
.parse_sbir_award_urls_safe(url = url, return_message = return_message)
})
data <-
data %>%
.fix_bad_sbir() %>%
mutate(idDUNS = case_when(idDUNS == 0 ~ NA_real_,
TRUE ~ idDUNS))
#
#
# data <-
# .parse_sbir_award_urls_safe(urls = urls,
# sleep_time = sleep_time,
# return_message = return_message)
phone_names <-
data %>%
select(matches("telephone")) %>% names()
if (data %>% hasName("dateAward")) {
data <- data %>%
mutate(
isAwardDateActual = !is.na(dateAward),
dateAward = case_when(
is.na(dateAward) ~ glue("{yearAward}-05-15") %>% as.character(),
TRUE ~ dateAward
) %>% ymd()
)
} else {
if (data %>% hasName("yearAward")) {
data <-
data %>%
mutate(
isAwardDateActual = F,
dateAward = glue("{yearAward}-05-15") %>% as.character() %>% ymd()
)
}
}
data <-
data %>%
mutate_at(phone_names,
list(function(x) {
x %>% str_remove_all("\\(|\\)") %>% str_replace_all("\\ ", "\\-")
})) %>%
mutate_at(phone_names,
list(function(x) {
ifelse(x == "--", NA_character_, x)
}))
id_names <-
data %>%
dplyr::select(one_of(c("idContract", "idSolicitation"))) %>%
names()
if (length(id_names) > 0) {
data <- data %>%
mutate_at(id_names,
list(function(x) {
x %>% str_remove_all("\\-")
}))
}
if (clean_entities) {
data <-
data %>%
entities::refine_columns(entity_columns = "nameCompany")
data <-
data %>%
entities::refine_columns(entity_columns = c(
"nameResearchInstiution",
numgram = 1,
weight = c(
d = 0,
i = 0,
s = 0,
t = 0
)
))
}
data <-
data %>%
.munge_data(clean_address = F)
if (data %>% hasName("amountContract")) {
data <- data %>%
mutate(amountContract = as.integer(amountContract))
}
if (data %>% hasName("idContract")) {
data <- data %>%
mutate(idContract = idContract %>% str_remove_all("\\ |\\,|\\-")) %>%
mutate(
idContractResolved = case_when(
idContract %>% nchar() > 13 ~ idContract %>% substr(1, 13),
TRUE ~ idContract
)
) %>%
select(idSBIR, matches("idContract"), everything())
}
data <- data %>%
mutate(
idContract = case_when(
nchar(idContract) == 12 &
idContract %>% str_detect("FA875119P") ~ idContract %>% str_replace_all("FA875119P", "FA875119PA"),
TRUE ~ idContract
),
idContractResolved = case_when(
nchar(idContractResolved) == 12 &
idContractResolved %>% str_detect("FA875119P") ~ idContractResolved %>% str_replace_all("FA875119P", "FA875119PA"),
TRUE ~ idContractResolved
),
) %>%
mutate(
idContract = idContract %>% str_replace_all("FA865219PHB", "FA865219P01"),
idContractResolved = idContractResolved %>% str_replace_all("FA865219PHB", "FA865219P01")
)
data <-
data %>% mutate(
dateAward = as.character(dateAward),
dateAward = dateAward %>% str_replace_all("2109", "2019") %>% lubridate::ymd()
)
data <-
data %>%
munge_lite(snake_names = snake_names) %>%
.remove_na()
data
}
.parse_sbir_award_page <-
function(page) {
solicit_nodes <-
page %>%
html_nodes(".title a")
solicitations <-
solicit_nodes %>%
html_text()
urls <-
solicit_nodes %>% html_attr('href') %>% str_c("https://www.sbir.gov/", .)
ids <-
solicit_nodes %>% html_attr("href") %>% str_remove_all("/sbirsearch/detail/") %>% readr::parse_number()
data <-
tibble(idSBIR = ids, nameAward = solicitations)
df_labels <-
seq_along(ids) %>%
map_df(function(x) {
labels <-
page %>% html_nodes(glue::glue(".search-result:nth-child({x}) .label-info")) %>%
html_text()
if (length(labels) > 0) {
data <- tibble(idSBIR = ids[x], listLabels = list(labels))
return(data)
}
tibble(idSBIR = ids[x])
})
df_company <-
seq_along(ids) %>%
map_df(function(x) {
details <-
page %>% html_nodes(glue::glue(
".search-result:nth-child({x}) .search-result-sub-title span"
)) %>%
html_text() %>%
str_squish() %>%
discard(list(function(x) {
x == ""
}))
if (length(details) > 0) {
data <-
tibble(item = details) %>%
separate(
item,
into = c("nameSBIR", "value"),
extra = "merge",
sep = "\\:"
) %>%
mutate_all(str_squish) %>%
left_join(tibble(
nameActual = c("nameCompany", "groupSolicitation"),
nameSBIR = c("SBC", "Topic")
), by = "nameSBIR") %>%
select(nameActual, value) %>%
mutate(idSBIR = ids[x]) %>%
spread(nameActual, value)
return(data)
}
tibble(idSBIR = ids[x])
})
data <-
data %>%
left_join(df_company, by = "idSBIR") %>%
left_join(df_labels, by = "idSBIR")
if (data %>% hasName("dateAward")) {
data <- data %>%
mutate(
isDateEstimated = case_when(is.na(dateAward) ~ TRUE,
TRUE ~ FALSE),
dateAward = case_when(
is.na(dateAward) ~ glue("{yearAward}-05-15") %>% as.character() %>% ymd(),
TRUE ~ dateAward
),
dateAwardEnd = case_when(is.na(dateAwardEnd) ~ dateAward + 360,
TRUE ~ dateAwardEnd)
)
}
data
}
.parse_sbir_list_url <-
function(url = "https://www.sbir.gov/sbirsearch/award/all?page=0",
return_message = T) {
if (return_message) {
glue::glue("Parsing {url}") %>% message()
}
page <- url %>% read_html()
numberPage <-
url %>% str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()
data <-
.parse_sbir_award_page(page = page) %>%
mutate(
urlSBIRPage = url,
numberPage,
urlSBIRAward = glue::glue("https://www.sbir.gov/sbirsearch/detail/{idSBIR}") %>% as.character()
)
char_names <-
data %>% select_if(is.character) %>% select(-matches("url|email")) %>% names()
data <-
data %>% mutate_at(char_names, list(function(x) {
x %>% str_squish() %>% str_to_upper()
}))
data
}
#' Parse Vector of Paged SBIR Award URLS
#'
#' @param urls
#' @param return_message
#'
#' @return
#' @export
#'
#' @examples
#' parse_sbir_award_list_urls(urls = "https://www.sbir.gov/sbirsearch/award/all?page=0")
parse_sbir_award_list_urls <-
function(urls = NULL,
return_message = T) {
if (length(urls) == 0) {
stop("Please enter SBIR award urls")
}
.parse_sbir_list_url_safe <-
possibly(.parse_sbir_list_url, tibble())
data <-
urls %>%
furrr::future_map_dfr(function(url) {
.parse_sbir_list_url_safe(url = url, return_message = return_message)
})
data <- data %>%
mutate_if(is.character,
list(function(x) {
case_when(x == "N/A" ~ NA_character_,
TRUE ~ as.character(x))
}))
data
}
#' SBIR Award Dictionary
#'
#' @return
#' @export
#'
#' @examples
#' sbir_award_urls()
sbir_award_urls <-
function() {
page <-
"https://www.sbir.gov/sbirsearch/award/all?page=0" %>% read_html()
max_page <-
page %>% html_nodes(".pager-last a") %>% html_attr("href") %>% str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()
pages <- 0:max_page
urls <-
glue::glue("https://www.sbir.gov/sbirsearch/award/all?page={pages}") %>% as.character()
data <- tibble(numberPage = pages + 1, urlSBIRPage = urls)
data
}
#' SBIR awards
#'
#' Parses all public sbirs
#'
#' @param return_message if \code{TRUE} returns message
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' \dontrun{
#' sbir_awards()
#' }
sbir_awards <-
function(return_message = F) {
d <- sbir_award_urls()
data <-
parse_sbir_award_list_urls(urls = d$urlSBIRPage, return_message = return_message) %>%
mutate(dateData = Sys.Date())
data
}
#' Acquires cached SBIR awards
#'
#' @param snake_names if \code{TRUE} returns snake names
#' @param create_text_description if \code{TRUE} creates a joined
#' text description field from the award name and award description fields
#' @param exclude_na_agencies if \code{TRUE} removes any awards with no known agencies
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' \dontrun{
#' sbirs_historic()
#' }
sbirs_historic <-
memoise::memoise(function(create_text_description = F,
exclude_na_agencies = T,
snake_names = F) {
data <-
"https://asbcllc.com/r_packages/govtrackR/data/all_sbir_sba.rda" %>%
read_rda()
if (create_text_description) {
data <-
data %>%
mutate_at(c("nameAward", "descriptionAward"),
list(function(x) {
x %>% coalesce("")
})) %>%
create_text_block(
id_column = "idSBIR",
text_columns = c("nameAward", "descriptionAward")
) %>%
mutate_at(c("nameAward", "descriptionAward", "textSBIR"),
list(function(x) {
case_when(x == "" ~ NA_character_,
TRUE ~ x)
})) %>%
select(
idSBIR:idContractResolved,
idContract,
idSolicitation,
groupSolicitation,
idDUNS,
nameCompanyClean,
nameCompany:nameAward,
textSBIR,
everything()
)
}
data <-
data %>%
filter(!is.na(nameAgency))
data <-
data %>%
munge_lite(
snake_names = snake_names,
no_extra = T,
exclude_bloat = T
)
data <- data %>%
mutate(idAgencyTrackingClean = idAgencyTracking %>% str_remove_all("\\-"))
if (snake_names) {
data <- data %>% janitor::clean_names()
}
data
})
import_sbir_awards <-
function(start_override = NULL,
end_override = NULL,
sleep_time = NULL,
return_message = T,
folder_path = "Desktop/data/sbir/pages") {
df_urls <- dictionary_sbir_award_urls()
if (length(start_override) > 0) {
df_urls <-
df_urls %>%
filter(numberPage >= start_override)
}
if (length(end_override) > 0) {
df_urls <-
df_urls %>%
filter(numberPage <= end_override)
}
urls <- df_urls$urlSBIRPage
.parse_sbir_list_url_safe <-
purrr::possibly(.parse_sbir_list_url, tibble())
oldwd <- getwd()
setwd("~")
.build_folder(folder_path)
setwd(folder_path)
urls %>%
walk(function(url) {
page_no <-
df_urls %>% filter(urlSBIRPage == url) %>% pull(numberPage)
data <-
.parse_sbir_list_url_safe(url = url, return_message = return_message)
file <- glue::glue("{page_no}.rda")
if (length(sleep_time) > 0) {
Sys.sleep(time = sleep_time)
}
data %>% save(file = file)
})
if (getwd() != oldwd) {
setwd(oldwd)
}
return(invisible())
}
# sbir_api ----------------------------------------------------------------
.parse_sbir_api_slor <-
function(url = "https://sbir.defensebusiness.org/search/SolrQuery?core=topic&method=query&q=data&start=0&AspxAutoDetectCookieSupport=1") {
url %>% fromJSON()
}
# solicitations -----------------------------------------------------------
sbir_solicitation_urls <-
function() {
page <-
"https://www.sbir.gov/sbirsearch/topic/current/" %>% read_html()
max_page <-
page %>% html_nodes(".pager-last a") %>% html_attr("href") %>% str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()
pages <- 0:max_page
urls <-
glue::glue("https://www.sbir.gov/sbirsearch/topic/current?page={pages}") %>% as.character()
data <- tibble(numberPage = pages + 1, urlSBIRPage = urls)
data
}
.parse_sbir_solicitation <-
function(page) {
solicit_nodes <-
page %>%
html_nodes(".title a")
titles <-
solicit_nodes %>% html_text() %>% str_squish()
slug <- solicit_nodes %>% html_attr("href")
urlSolicitation <- str_c("https://www.sbir.gov", slug)
idSolicitation <-
parse_number(slug)
descriptions <-
page %>% html_nodes(".search-snippet") %>% html_text() %>% str_squish() %>% str_to_upper()
result_nodes <- page %>% html_nodes(".search-result")
data <-
seq_along(result_nodes) %>%
map_dfr(function(x) {
tags <-
result_nodes[[x]] %>% html_nodes(".label-info") %>% html_text() %>% str_c(collapse = "|")
dates <-
result_nodes[[x]] %>% html_nodes(".solr-search-span") %>%
html_text()
d <-
tibble(dates) %>%
separate(
dates,
into = c("typeDate", "value"),
extra = "merge",
sep = "\\: "
) %>%
mutate(
item = case_when(
typeDate %>% str_detect("Release") ~ "dateRelease",
typeDate %>% str_detect("Open") ~ "dateOpen",
typeDate %>% str_detect("Due") ~ "dateDue",
TRUE ~ "dateClose"
)
) %>%
select(item, value) %>%
spread(item, value) %>%
mutate_all(mdy)
d %>%
mutate(idRow = x, tags) %>%
select(idRow, everything())
})
data <-
tibble(
idSolicitation,
nameSolicitation = titles,
descriptionSolicitationPage = descriptions,
urlSolicitation
) %>%
mutate(idRow = 1:n()) %>%
left_join(data, by = "idRow") %>%
select(-idRow)
data
}
.parse_sbir_solicitation_page_url <-
function(url = "https://www.sbir.gov/sbirsearch/topic/current?page=0",
return_message = T) {
if (return_message) {
glue("Parsing {url}") %>% message()
}
read_html(x = url) %>%
.parse_sbir_solicitation() %>%
mutate(urlPage = url)
}
#' SBIR Solications
#'
#' @param include_details if \code{TRUE} returns detailed solicitation descriptions
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
sbir_solicitations <-
function(include_details = T,
return_message = T) {
data <- sbir_solicitation_urls()
.parse_sbir_solicitation_page_url_safe <-
possibly(.parse_sbir_solicitation_page_url, tibble())
all_data <-
data$urlSBIRPage %>%
future_map_dfr(function(url) {
.parse_sbir_solicitation_page_url_safe(url = url, return_message = return_message)
})
all_data <-
all_data %>%
.munge_data()
if (!include_details) {
return(all_data)
}
df_details <-
parse_sbir_solicitation_detail_urls(urls = all_data$urlSolicitation,
return_message = return_message)
all_data <-
all_data %>%
left_join(df_details, by = c("idSolicitation", "urlSolicitation"))
all_data <-
all_data %>%
.munge_data() %>%
mutate(
isActive = dateClose >= Sys.Date(),
countDaysOpen = dateClose - dateOpen,
countDaysRemaining = pmax(0, dateClose - Sys.Date()),
nameBranch = case_when(nameBranch == "N/A" ~ NA_character_,
TRUE ~ nameBranch)
)
all_data
}
.parse_sbir_solicitation_detail_url <-
function(url = "https://www.sbir.gov/sbirsearch/detail/1620819",
return_message = T) {
if (return_message) {
glue("Parsing {url}") %>% message()
}
page <- read_html(url)
parts <- url %>% str_split("/") %>% flatten_chr()
idSolicitation <-
parts[length(parts)] %>% as.numeric()
parts <-
page %>% html_nodes(".col-sm-6") %>%
html_text() %>% str_split("\n\n") %>%
flatten_chr() %>%
str_squish() %>%
discard(function(x) {
x == ""
})
data <-
tibble(parts) %>%
separate(
parts,
into = c("nameSBIR", "value"),
extra = "merge",
sep = "\\: "
) %>%
mutate_all(str_squish)
data <-
data %>%
left_join(dictionary_sbir_names(), by = "nameSBIR") %>%
distinct() %>%
select(nameActual, value)
cols <-
data %>% pull(nameActual)
data <-
data %>%
spread(nameActual, value) %>%
select(one_of(cols))
if (data %>% hasName("programPhaseYear")) {
data <-
data %>%
separate(
programPhaseYear,
into = c("slugProgram", "idPhase", "yearSolicitation"),
extra = "merge",
sep = "\\|"
) %>%
mutate_all(str_squish) %>%
mutate(yearSolicitation = as.numeric(yearSolicitation))
}
description <-
page %>% html_nodes(".solicitation-description p") %>% html_text() %>% str_squish() %>%
stringi::stri_trans_general("Latin-ASCII") %>% str_squish() %>%
str_remove_all(" & ") %>%
str_to_upper() %>% gsub("\\s+", " ", .) %>%
str_split("\\•") %>%
flatten_chr() %>%
str_squish()
if (length(description) > 0) {
data <- data %>%
mutate(descriptionSolicitation = description %>% str_c(description, collapse = "|"))
}
data <- data %>%
mutate(idSolicitation,
urlSolicitation = url) %>%
select(idSolicitation, everything())
data
}
#' Parse list of SBIR solication URLs
#'
#' @param urls vector of urls
#' @param return_message if \code{TRUE} return a message
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
parse_sbir_solicitation_detail_urls <-
function(urls = NULL,
return_message = T) {
if (length(urls) == 0) {
stop("Enter SBIR Solicitation URLs")
}
.parse_sbir_solicitation_detail_url_safe <-
possibly(.parse_sbir_solicitation_detail_url, tibble())
urls %>%
future_map_dfr(function(url) {
.parse_sbir_solicitation_detail_url_safe(url = url, return_message = return_message)
})
}
# dod_sbir ----------------------------------------------------------------
#' DOD Phase II SBIR enhancements
#'
#' Third party investments from DOD
#' Phase II companies from 1999 to 2012
#'
#' @return
#' @export
#'
#' @examples
sbir_enhancements_dod <-
function() {
page <-
"https://www.acq.osd.mil/osbp/sbir/awards/enhancement-awards.shtml" %>%
read_html()
data <- page %>% html_table(fill = F) %>% .[[1]] %>% as_tibble()
data <- data %>% .munge_sbir_names()
data <-
data %>%
mutate(
yearFiscal =
case_when(
yearFiscal == 99 ~ 1999L,
nchar(yearFiscal) == 1 ~ glue("200{yearFiscal}") %>% as.integer(),
yearFiscal %in% c(10:20) ~ glue("20{yearFiscal}") %>% as.integer()
) %>% as.integer()
) %>%
.munge_data(clean_address = F)
data <-
data %>%
separate(
nameCompany,
into = c("nameCompany", "nameCompanyDBA"),
extra = "merge",
sep = "\\("
) %>%
mutate(nameCompanyDBA = nameCompanyDBA %>% str_remove_all("DBA|FORMERLY")) %>%
mutate_if(is.character, str_squish) %>%
select(yearFiscal, slugAgency, nameCompanyClean, everything())
data
}
.parse_year_dod_sbir_solicitation_url <-
function(url = "https://www.acq.osd.mil/osbp/sbir/solicitations/sbir20192/index.shtml",
return_message = T) {
if (return_message) {
glue("Parsing {url}") %>% message()
}
url_year <- url
page <- url %>% read_html()
base <-
url %>% str_remove_all(".shtml") %>% str_remove_all("index")
nameTopic <-
page %>% html_nodes("table+ table td:nth-child(1) strong") %>% html_text()
if (length(nameTopic) == 0) {
nameTopic <-
page %>% html_nodes("td:nth-child(1) strong") %>% html_text() %>% str_squish()
}
dateLastModified <-
page %>% html_nodes("table+ table td:nth-child(2)") %>% html_text() %>% mdy()
if (length(dateLastModified) == 0) {
dateLastModified <-
page %>% html_nodes("td:nth-child(2)") %>% html_text() %>% mdy()
}
if (length(dateLastModified) > length(nameTopic)) {
start <- length(dateLastModified) - length(nameTopic)
dateLastModified <-
dateLastModified[(start + 1):length(dateLastModified)]
}
all_data <-
tibble(nameTopic, dateLastModified) %>%
.munge_data()
links <- page %>% html_nodes(".docs a")
topics <- links %>% html_attr("title")
url <-
links %>% html_attr("href") %>% str_c(base, .)
df_urls <-
tibble(topics, url) %>%
separate(
topics,
sep = "\\[",
extra = "merge",
into = c("nameTopic", "type")
) %>%
mutate_all(list(function(x) {
x %>% str_remove_all("]") %>%
str_squish()
})) %>%
.munge_data()
df_topics <-
distinct(df_urls, nameTopic) %>%
mutate(
nameTopicActual = all_data$nameTopic,
dateLastModified = all_data$dateLastModified
)
all_data <-
df_urls %>%
left_join(df_topics, by = "nameTopic") %>%
select(-nameTopic) %>%
rename(nameTopic = nameTopicActual) %>%
mutate(type = case_when(type %>% str_detect("WORD|word") ~ "Word",
TRUE ~ type)) %>%
mutate(type = glue("urlDODYearSolicitation{type}") %>% as.character()) %>%
spread(type, url) %>%
.munge_data()
all_data %>%
mutate(urlYearSolicitation = url_year)
}
.parse_year_dod_sbir_solicitation_urls <-
function(urls = "https://www.acq.osd.mil/osbp/sbir/solicitations/sbir20192/index.shtml",
return_message = T) {
.parse_year_dod_sbir_solicitation_url_safe <-
possibly(.parse_year_dod_sbir_solicitation_url, tibble())
urls %>%
map_dfr(function(url) {
.parse_year_dod_sbir_solicitation_url_safe(url = url, return_message =
)
})
}
#' DOD SBIR Solicitation url dictionary
#'
#' @param parse_years if \code{TRUE} parses years solicitations for links to the
#' underlying solicitation files
#'
#' @return
#' @export
#'
#' @examples
dictionary_dod_sbir_solicitations <-
function(parse_years = F,
return_message = T) {
page <-
read_html("https://www.acq.osd.mil/osbp/sbir/solicitations/archives.shtml")
data <-
page %>%
html_table(fill = T) %>%
flatten_df() %>%
setNames(c("yearFiscal", "nameSolicitation", "dateClose")) %>%
.munge_data() %>%
mutate(
yearFiscal = year(dateClose),
typeSolicitation = case_when(nameSolicitation %>% str_detect("SBIR") ~ "SBIR",
TRUE ~ "STTR")
) %>%
filter(!is.na(dateClose)) %>%
select(dateClose, yearFiscal, typeSolicitation, everything())
nodes <- page %>% html_nodes("td a")
nameSolicitation <- nodes %>% html_text()
urlYearSolicitation <-
nodes %>% html_attr("href") %>% str_c("https://www.acq.osd.mil/osbp/sbir/solicitations/", .)
df_urls <-
tibble(nameSolicitation, urlYearSolicitation) %>% .munge_data()
data <-
data %>%
left_join(df_urls, by = "nameSolicitation")
if (parse_years) {
df_urls <-
.parse_year_dod_sbir_solicitation_urls(urls = data$urlYearSolicitation,
return_message = return_message)
}
data
}
# dod ---------------------------------------------------------------------
.dictionary_sbir_award_tables <-
function() {
tibble(
table = c(
"aw_program_type_name",
"org_state_name_abbr",
"agency_name_abbr",
"aw_fiscal_year",
"org_ipo_flag",
"small_business_flag",
"sdb_flag",
"wosb_flag",
"hub_zone_flag",
"sub_vet_owned_flag",
"sdvosb_flag",
"fed_facility_flag",
"corp_entity_flag",
"foreign_national_flag",
"animal_research_flag",
"solicitation_name",
"ta_air_platform_flag",
"ta_chem_bio_defense_flag",
"ta_info_systems_flag",
"ta_ground_sea_flag",
"ta_materials_flag",
"ta_bio_medical_flag",
"ta_sensors_flag",
"ta_electronics_flag",
"ta_battlespace_flag",
"ta_space_platforms_flag",
"ta_human_systems_flag",
"ta_weapons_flag",
"ta_nuclear_flag",
"topic_itar_restricted_flag",
"submission_type",
"topic_renew_energy_flag",
"manufacturing_type_name",
"sub_direct2_phase2_flag"
),
nameActual =
c(
"nameProgram",
"slugState",
"slugAgency",
"yearFiscal",
"hasIPO",
"isSmallBusiness",
"isSmallDisadvantagedBusiness",
"isWomanOwnedSmallBusiness",
"isHubZoneCompany",
"isVetSmallDisadvantagedBusiness",
"isServiceDisabledOwnedSmallBusiness",
"isFedFacility",
"isCorporateEntity",
"isForeignNational",
"hasAnimalResearch",
"nameSolicitation",
"isAirPlatformTopic",
"isBioChemicalDefenseTopic",
"isInformationSystemTopic",
"isGroundSeaTopic",
"isMaterialsTopic",
"isBioMedicalTopic",
"isSensorsTopic",
"isElectronicsTopic",
"isBattleSpaceTopic",
"isSpaceTopic",
"isHumanSystemsTopic",
"isWeaponsTopic",
"isNuclearTopic",
"isITARRestrictedTopic",
"typeSubmission",
"isRenewableEnergy",
"typeManufacturing",
"isDirectToPhase2"
)
)
}
#' Department of Defense SBIR Award Summary
#'
#' Returns summary couunts for
#' DOD agency SBIR award winners by table.
#'
#' @return
#' @export
#'
#' @examples
sbir_award_summary_dod <-
function() {
json_data <-
"https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&start=0" %>%
fromJSON(simplifyDataFrame = T)
fields <- json_data$facet_counts$facet_fields
tibble(table = names(fields))
all_data <-
seq_along(fields) %>%
map_dfr(function(x) {
values <- fields[[x]]
table <- names(fields)[[x]]
tibble(table,
item = values[c(T, F)],
count = values[c(F, T)] %>% as.numeric())
})
all_data <-
all_data %>%
left_join(.dictionary_sbir_award_tables(), by = "table") %>%
select(table, nameActual, item, everything()) %>%
mutate(count = count %>% formattable::comma(digits = 0))
all_data
}
.parse_dod_sbir_award_json <-
function(json_data) {
data <- json_data$response$docs %>% as_tibble()
data <-
data %>%
.munge_sbir_names() %>%
.munge_data(clean_address = F)
data <-
data %>%
filter(!is.na(nameCompany) |
!is.na(idContract) |
!is.na(keywordsSBIRCompany))
data
}
.parse_sbir_dod_url_api <-
function(url = "https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&q=&start=0") {
json_data <- fromJSON(url)
data <-
json_data %>%
.parse_dod_sbir_award_json() %>%
mutate_if(is.character,
list(function(x) {
if_else(x == "", NA_character_, x)
})) %>%
.remove_na()
data
}
.generate_dod_award_sbir_url <-
function(year = 2018,
return_message = T) {
if (return_message) {
glue("Generating DOD SBIR urls for {year}") %>% message()
}
url <-
glue(
"https://sbir.defensebusiness.org/search/SolrQuery?core=award&fq=(aw_fiscal_year:{year})&method=query&start=0"
) %>%
as.character()
json_data <-
url %>%
fromJSON()
total_pages <-
json_data$response$numFound
options(scipen = 99999)
pages <- seq(from = 0, to = total_pages, by = 10)
urls <-
glue(
"https://sbir.defensebusiness.org/search/SolrQuery?core=award&fq=(aw_fiscal_year:{year})&method=query&start={pages}"
) %>%
as.character()
tibble(yearTopic = year, urlAPI = urls)
}
.generate_dod_award_sbir_urls <-
function(years = 1983:2019,
all_years = F,
return_message = T) {
if (all_years) {
start <- 1983
end <- Sys.Date() %>% lubridate::year()
years <- start:end
}
all_data <-
years %>%
map_dfr(function(year) {
.generate_dod_award_sbir_url(year = year, return_message = return_message)
})
all_data
}
.parse_sbir_dod_url_api_safe <-
purrr::possibly(.parse_sbir_dod_url_api, tibble::tibble())
.parse_sbir_dod_urls <-
function(urls, return_message = T) {
all_data <-
urls %>%
map_dfr(function(url) {
if (return_message) {
url %>% message()
}
d <-
.parse_sbir_dod_url_api_safe(url = url) %>%
select(-matches("urlSpending")) %>%
mutate(urlAPI = url)
d
})
all_data
}
.dod_years_sbirs <-
function(years = 1983:2019,
all_years = F,
return_message = T) {
df_urls <-
.generate_dod_award_sbir_urls(years = years,
all_years = all_years,
return_message = return_message)
all_data <-
.parse_sbir_dod_urls(urls = df_urls$urlAPI, return_message = return_message)
all_data
}
.all_dod_sbirs <-
function(return_message = T) {
url <-
"https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&start=0"
json_data <-
url %>%
fromJSON()
total_pages <-
json_data$response$numFound
options(scipen = 99999)
pages <- seq(from = 0, to = total_pages, by = 10)
urls <-
glue(
"https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&q=&start={pages}"
) %>%
as.character()
all_data <-
urls %>%
.parse_sbir_dod_urls(return_message = return_message)
all_data <-
all_data %>%
.remove_na()
all_data
}
#' DOD SBIR Solicitations
#'
#' All active and histric DOD
#' SBIR/STTR solicitations.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dod_sbir_solicitations()
dod_sbir_solicitations <-
function() {
data <- "https://sbir.defensebusiness.org/search/solicitations" %>%
fromJSON() %>%
as_tibble()
data <-
data %>%
.munge_sbir_names() %>%
select(-contains("remove"))
data <-
data %>%
.munge_data()
data
}
.dod_sbir_topics <-
memoise::memoise(function() {
json <-
"https://sbir.defensebusiness.org/topics/gettopics" %>%
fromJSON(simplifyDataFrame = T)
data <-
json$Results %>% as_tibble()
data <-
data %>%
.munge_sbir_names()
data <- data %>%
.distinct_sbir_cols()
data <-
data %>%
.munge_data(clean_address = F)
if (data %>% hasName("yearAward")) {
data <- data %>%
rename(yearTopic = yearAward)
}
data <-
data %>%
select(one_of(c(
"yearTopic",
"typeProgram",
"typeProgram",
"idTopic"
)),
everything()) %>%
mutate(
urlDODSBIRTopicAPI = glue(
"https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId={idTopic}"
) %>% as.character()
)
data
})
.parse_dod_topic_dates <-
function(data) {
date_cols <- data %>% select(matches("datetime")) %>% names()
if (length(date_cols) > 0) {
data <-
data %>%
mutate_at(date_cols,
list(function(x) {
x %>%
parse_number() / 1000
})) %>%
mutate_at(date_cols,
list(function(x) {
x %>%
as.POSIXct(origin = "1970-01-01", tz = "UTC") %>% as.Date()
}))
}
data
}
.parse_dod_sbir_topic_url <-
function(url = "https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId=307349",
return_message = T) {
if (return_message) {
glue("Parsing {url}") %>% message()
}
data <-
fromJSON(url)
d <- data %>%
enframe(name = "table") %>%
filter(table == "topic") %>%
mutate(value = value %>% map(enframe)) %>%
unnest_legacy()
d <- d %>%
mutate(
type = value %>% map(class) %>% flatten_chr(),
nested = type %>% str_detect("data|list")
)
has_nested <- d %>% filter(nested) %>% nrow()
df_bases <-
d %>%
filter(!nested) %>%
filter(type != "NULL") %>%
select(type, value, name)
types <-
df_bases$type %>% unique()
df_bases <-
types %>%
map(function(x) {
df_bases %>%
filter(type == x) %>%
select(name, value) %>%
unnest(cols = c(value)) %>%
spread(name, value) %>%
mutate(urlDODSBIRTopicAPI = url)
})
df_base <-
df_bases %>%
reduce(left_join, by = "urlDODSBIRTopicAPI") %>%
.munge_sbir_names()
df_base <-
df_base %>%
.distinct_sbir_cols() %>%
.parse_dod_topic_dates() %>%
.munge_data(
clean_address = F,
unformat = T,
parse_dates = F
) %>%
.remove_na()
if (df_base %>% hasName("hasTPOCEmail")) {
df_base <- df_base %>%
mutate(hasTPOCEmail = as.logical(hasTPOCEmail))
}
if (df_base %>% hasName("typeProgram")) {
df_base <- df_base %>%
rename(idProgram = typeProgram)
}
if (df_base %>% hasName("typeProgramString")) {
df_base <-
df_base %>%
rename(typeProgram = typeProgramString)
}
if (!has_nested) {
return(df_base)
}
d <- d %>%
filter(nested)
types <-
d$type %>% unique()
types <- types[types %>% str_detect("data")]
d_nested <-
types %>%
map(function(x) {
d_row <-
d %>%
filter(type == x) %>%
select(name, value) %>%
unnest(cols = c(value))
if (x == "data.frame") {
d_row <-
d_row %>%
select(-one_of("ReferenceFiles")) %>%
select(-name) %>%
.munge_sbir_names() %>%
.distinct_sbir_cols() %>%
.parse_dod_topic_dates() %>%
.munge_data(
clean_address = F,
unformat = T,
parse_dates = F
)
d_row <- d_row %>%
group_by(idTopic) %>%
nest() %>%
ungroup() %>%
rename(dataTopicReferences = data) %>%
mutate(
hasTopicReferences = T,
countReferences = dataTopicReferences %>% map_dbl(nrow),
urlDODSBIRTopicAPI = url
) %>%
select(idTopic,
hasTopicReferences,
countReferences,
everything())
return(d_row)
}
if (x == "list") {
d_row <-
d_row %>%
mutate(type = value %>% map(class) %>% flatten_chr()) %>%
unique()
types <-
d_row$type %>% unique()
types <-
types[!types %>% str_detect("logical")]
d_bases <-
types %>%
map(function(y) {
d_row %>%
filter(type == y) %>%
select(name, value) %>%
unnest(cols = c(value)) %>%
filter(value != "") %>%
group_by(name) %>%
mutate(id = 1:n()) %>%
filter(id == max(id)) %>%
ungroup() %>%
select(-id) %>%
filter(value != "--") %>%
spread(name, value) %>%
mutate(urlDODSBIRTopicAPI = url)
})
d_bases <-
d_bases %>% reduce(left_join, by = "urlDODSBIRTopicAPI")
if (nrow(d_bases) == 0) {
return(NULL)
}
d_bases <-
d_bases %>%
.munge_sbir_names() %>%
.distinct_sbir_cols() %>%
.parse_dod_topic_dates() %>%
.munge_data(
clean_address = F,
unformat = T,
parse_dates = F
)
return(d_bases)
}
})
if (length(d_nested) > 0) {
d_nested <-
d_nested %>%
discard(function(x) {
x %>% is_null()
}) %>%
reduce(left_join, by = "urlDODSBIRTopicAPI") %>%
select(-one_of("idTopic"))
data <-
df_base %>%
bind_cols(d_nested %>% select(-urlDODSBIRTopicAPI))
} else {
data <- df_base
}
if (data %>% hasName("nameFirstContact") &
data %>% hasName("nameLastContact")) {
data <-
data %>%
unite(
nameContactPrimary,
nameFirstContact,
nameLastContact,
sep = " ",
remove = F
) %>%
mutate(emailContactPrimary = emailContact)
df_contacts <-
data %>%
select(matches("nameFirstContact|nameLastContact|emailContact")) %>%
select(-matches("AP|emailContactPrimary")) %>%
gather(name, value) %>%
mutate(
numberPerson = name %>% str_extract("[0-9]") %>% as.integer()-1 ,
numberPerson = numberPerson %>% coalesce(0) + 1,
name = name %>% str_remove_all("[0-9]")
) %>%
spread(name, value) %>%
unite(
nameContact,
nameFirstContact,
nameLastContact,
sep = " ",
remove = F
) %>%
mutate(urlDODSBIRTopicAPI = url) %>%
group_by(urlDODSBIRTopicAPI) %>%
nest() %>%
ungroup() %>%
rename(dataContacts = data) %>%
mutate(hasContacts = T)
data <-
data %>%
left_join(df_contacts, by = "urlDODSBIRTopicAPI")
remove_cols <-
data %>% select(matches("nameFirst|nameLast|^email")) %>%
select(-matches("AP$|Primary")) %>%
names()
if (length(remove_cols) > 0) {
data <-
data %>%
select(-one_of(remove_cols))
}
}
if (data %>% hasName("nameFirstAP") &
data %>% hasName("nameLastAP")) {
data <-
data %>%
unite(nameContactAP,
nameFirstAP,
nameLastAP,
sep = " ",
remove = T)
}
data <-
data %>%
select(one_of("idTopic", "titleTopic"),
matches("topic|name"),
everything())
data
}
#' Parse Department of Defense SBIR Open Topic URLs
#'
#' Parses DOD SBIR Open Topics from vector
#' of API urls
#'
#' @param urls of API urls
#' @param return_message if \code{TRUE} returns a message
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' parse_dod_sbir_topic_urls(urls ="https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId=307428")
parse_dod_sbir_topic_urls <-
function(urls = NULL,
return_message = F) {
if (length(urls) == 0) {
stop("Enter DOD topic URLs")
}
all_data <-
urls %>%
future_map_dfr(function(url) {
.parse_dod_sbir_topic_url(url = url, return_message = return_message)
})
all_data <-
all_data %>%
mutate(
hasContacts = dataContacts %>% map_dbl(length) > 0,
hasTopicReferences = dataTopicReferences %>% map_dbl(length) > 0
) %>%
mutate_if(is.character,
list(function(x) {
case_when(x == "N/A" ~ NA_character_,
TRUE ~ x) %>% stri_enc_toascii()
}))
all_data <- all_data %>%
select(
idTopic,
titleTopic,
nameAgency,
nameCommand,
nameContactPrimary,
matches("^topic|Topic$"),
matches("description"),
everything()
)
all_data <-
all_data %>%
mutate(
hasSemi = keywordsTopicSBIR %>% str_detect("\\;"),
keywordsTopicSBIR = case_when(
hasSemi ~ keywordsTopicSBIR,
TRUE ~ keywordsTopicSBIR %>% str_replace_all("\\,", "\\; ")
)
) %>%
select(-hasSemi)
df_keywords <-
all_data %>%
select(idTopic, keywordsTopicSBIR) %>%
separate_rows(keywordsTopicSBIR, sep = "\\; ") %>%
mutate_at("keywordsTopicSBIR",
function(x) {
x %>% str_remove_all("\\.|\\ :") %>% str_squish()
}) %>%
mutate_at("keywordsTopicSBIR",
function(x) {
case_when(
x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
x == "EW" ~ "ELECTRONIC WARFARE",
x == "HPM" ~ "HIGH POWER MICROWAVE",
x == "ML" ~ "MACHINE LEARNING",
x %in% c("RF") ~ "RADIO FREQUENCY",
x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
x %in% c("AR") ~ "AUGMENTED REALITY",
TRUE ~ x
)
}) %>%
rename(keywordTopicSBIR = keywordsTopicSBIR) %>%
filter(keywordTopicSBIR != "") %>%
distinct() %>%
group_by(idTopic) %>%
nest() %>%
ungroup() %>%
rename(dataTopicKeywords = data) %>%
mutate(countKeywords = dataTopicKeywords %>% map_dbl(nrow),
hasKeywords = T)
all_data <-
all_data %>%
left_join(df_keywords, by = "idTopic")
all_data
}
#' Department of Defense SBIR/STTR Open Topics
#'
#' Returns details for all open DOD SBIR/STTR
#' opportunities
#'
#' @param include_details if \code{TRUE} uses \link{parse_dod_sbir_topic_urls} to parse detailed topic data
#' @param return_message if \code{TRUE} a returns message
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' dod_sbir_open_topics()
#' }
dod_sbir_open_topics <-
function(include_details = T,
include_text_block = T,
return_message = F) {
data <-
.dod_sbir_topics() %>%
mutate(dateSearch = Sys.Date()) %>%
select(dateSearch, everything())
if (!include_details) {
return(data)
}
df_details <-
parse_dod_sbir_topic_urls(urls = data$urlDODSBIRTopicAPI,
return_message = return_message)
missing_cols <-
names(
df_details)[!names(df_details) %in% names(data)]
data <-
data %>%
left_join(df_details %>% select(idTopic, one_of(missing_cols)),
by = "idTopic") %>%
select(one_of(names(df_details)), everything())
if (include_text_block) {
text_columns <-
data %>% select_if(is.character) %>%
select(matches("description|topic|Topic")) %>%
select(-matches(
"descriptionSitisAdditionalInfo|^url|keywordsTopicSBIR"
)) %>%
names()
data <-
create_text_block(data = data,
id_column = "idTopic",
text_columns = text_columns)
}
data
}
# nasa --------------------------------------------------------------------
.sbirs_nasa <-
function() {
data <-
"https://sbir.nasa.gov/sbir_search_csv?searchText=&searchTextType=any&sort_solicit_program%5B%5D=&sort_solicit_program_year%5B%5D=&sort_solicit_program_phase_value%5B%5D=&sort_field_award_md_short_name%5B%5D=&sort_field_award_ctr_name%5B%5D=&sort_field_firm_state%5B%5D=&sort_field_award_tech_area%5B%5D=&sortm_field_award_tech_taxonomy%5B%5D=&searchType=award&sol_search_submit=Submit&form_build_id=form-05iVhnODgWrol2quyjY5xRlVNeQZ0KqfTmK4Nv8_3Yc&form_id=sbir_awards_search_form" %>%
fread(quote = "", showProgress = FALSE) %>%
as_tibble()
data <-
data %>% .munge_sbir_names() %>% .munge_data(clean_address = F)
data
}
#' NASA SBIRs
#'
#' NASA SBIR data
#'
#' @return
#' @export
#'
#' @examples
sbirs_nasa <- function() {
.tt <- memoise::memoise(.sbirs_nasa)
data <- .tt()
data
}
# doe ---------------------------------------------------------------------
# https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Awards/AwardSearchExternal.aspx?controlName=ContentTabs
# epa ---------------------------------------------------------------------
.parse_epa_sbirs <-
function() {
"https://cfpub.epa.gov/ncer_abstracts/index.cfm/fuseaction/outlinks.sbir/fullList/Yes/showYear/all"
}
# dhs ---------------------------------------------------------------------
# https://sbir2.st.dhs.gov/portal/public/Awards
#' DOD SBIRs
#'
#' All SBIRs from
#' Department of Defense SBIR
#' API
#'
#' @param group_filters if not \code{NULL} the solicitation groups to isolate
#' @param snake_names if \code{TRUE} returns snake case names
#' #' @param clean_companies if \code{TRUE} returns snake case names
#' @param create_text_description if \code{TRUE} returns a blocked text description field
#' @param clean_keywords if \code{TRUE} cleans Topic and company keywords
#' @param exclude_null_contracts
#' @param clean_companies
#' @param clean_contracts
#' @param unformat
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dod_sbir_historic()
dod_sbir_historic <-
function(group_filters = NULL,
snake_names = F,
exclude_null_contracts = T,
create_text_description = T,
unformat = T,
clean_companies = T,
clean_contracts = T,
clean_keywords = T) {
urls <-
glue(
"https://asbcllc.com/r_packages/govtrackR/data/dod_sbir_cache/dod_sbir/dod_sbir_{1:10}.rda"
) %>% as.character()
read_rda_m <- memoise::memoise(read_rda)
data <-
urls %>%
map_dfr(function(url) {
read_rda_m(file = url)
})
if (exclude_null_contracts) {
data <-
data %>%
filter(!is.na(idContract))
}
if (length(group_filters) > 0) {
data <- data %>% filter(groupSolicitation %in% group_filters)
}
if (clean_contracts) {
contract_cols <-
data %>% select(matches("idContract")) %>% names()
data <- data %>%
mutate_at(contract_cols, list(function(x) {
x %>% str_remove_all("\\-")
})) %>%
mutate(
idOfficeAward = case_when(
is.na(idContract) ~ NA_character_,
TRUE ~ idContract %>% substr(1, 6)
),
slugBudgetAward = case_when(
is.na(idContract) ~ NA_character_,
TRUE ~ idContract %>% substr(7, 8)
),
codeAcquisitionAward = case_when(
is.na(idContract) ~ NA_character_,
TRUE ~ idContract %>% substr(9, 9)
),
codeNumberAward =
case_when(
is.na(idContract) ~ NA_character_,
TRUE ~ idContract %>% substr(10, nchar(idContract))
)
)
}
data <-
data %>%
rename(
nameAward = titleSBIR,
descriptionAward = descriptionAbstract,
nameTopic = titleTopic
)
data <-
data %>%
select(-one_of("yearFiscalAwarwd"))
data <-
data %>%
mutate(
dateAward = as.Date(datetimeAward),
dateAwardEnd = as.Date(datetimeAwardEnd)
) %>%
mutate(
yearAward = case_when(
lubridate::month(dateAward) %>% as.numeric() >= 10 ~ lubridate::year(dateAward) %>% as.numeric()+1,
TRUE ~ lubridate::year(dateAward) %>% as.numeric()
)
)
if (create_text_description) {
data <-
data %>%
mutate_at(
c(
"nameAward",
"descriptionAward",
"descriptionPrincipalBenefit"
),
list(function(x) {
x %>% coalesce("")
})
) %>%
mutate(idSBIRDeptDefense = 1:n()) %>%
create_text_block(
id_column = "idSBIRDeptDefense",
text_columns = c(
"nameAward",
"descriptionAward",
"descriptionPrincipalBenefit"
)
) %>%
select(-idSBIRDeptDefense) %>%
mutate_at(
c(
"nameAward",
"descriptionAward",
"descriptionPrincipalBenefit",
"textSBIRDeptDefense"
),
list(function(x) {
case_when(x == "" ~ NA_character_,
TRUE ~ x)
})
)
}
if (clean_keywords) {
glue("\n\nCleaning DOD company and topic SBIR/STTR keywords\n\n") %>% message()
data <-
data %>%
mutate(row = 1:n()) %>%
select(row, everything())
df_sbir <-
data %>%
select(row, keywordsTopicSBIR) %>%
separate_rows(keywordsTopicSBIR, convert = T, sep = "\\,|\\;|^&|[0-9]. |\\ & ") %>%
separate_rows(keywordsTopicSBIR, convert = T, sep = "\\ [0-9].") %>%
separate_rows(keywordsTopicSBIR, sep = "CONTACT:|POC:|TECHNICAL POINT OF CONTACT|PHONE:|TITLE:|PROGNOSTICS:|KEYWORDS: |TOPIC WRITERS:|ND:", convert = T) %>%
mutate_if(is.character, str_squish) %>%
filter(keywordsTopicSBIR != "") %>%
filter(!keywordsTopicSBIR %>% str_detect("FAX:|EMAIL:|^:|E MAIL")) %>%
rename(keyword = keywordsTopicSBIR) %>%
mutate(
keyword = keyword %>% str_squish() %>% str_remove_all("\\.|\u001a|\\(|\\)") %>%
str_replace_all("\\-", " ") %>% str_replace_all("3 D", "3D")
) %>%
filter(!keyword %in% c("", "AND")) %>%
filter(!keyword %>% str_detect("EMAIL|FAX| PHONE$")) %>%
mutate_if(is.character, str_squish) %>%
mutate_at("keyword",
function(x) {
case_when(
x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
x == "EW" ~ "ELECTRONIC WARFARE",
x == "MEMS" ~ "MICROELECTROMECHANICAL SYSTEMS",
x == "CFD" ~ "Computational fluid dynamics",
x == "HPM" ~ "HIGH POWER MICROWAVE",
x == "ML" ~ "MACHINE LEARNING",
x %in% c("RF") ~ "RADIO FREQUENCY",
x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
x %in% c("AR") ~ "AUGMENTED REALITY",
TRUE ~ x
)
}) %>%
filter(!keyword %in% c("", "\u001a\u001a")) %>%
mutate(
keyword = case_when(
keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% c("AR") ~ "AUGMENTED REALITY",
keyword %in% c("IOT") ~ "INTERNET OF THINGS",
keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
keyword %in% c("SIC") ~ "single silicon carbide",
keyword %in% c("IED") ~ "Improvised explosive device",
keyword %in% c("NDE") ~ "Non Destructive Evaluation",
keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
keyword %in% c("IMU") ~ "inertial measurement unit",
keyword %in% c("ATR") ~ "assisted target recognition",
keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
keyword %in% "SIGINT" ~ "Signals intelligence",
keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
keyword %in% "ROIC" ~ "Readout Integrated Circuit",
keyword %in% "MIMO" ~ "Multiple input multiple output",
keyword %in% "GPU" ~ "Graphical Processing Unit",
keyword %in% "HMD" ~ "head mounted displays",
keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
keyword %in% "DACS" ~ "Divert and Attitude Control System",
keyword %in% "COTS" ~ "Commercial Off The Shelf",
keyword %in% "LED" ~ "light emitting diode",
keyword %in% "SDR" ~ "Software Defined Radio",
keyword %in% "ADC" ~ "Analog to Digital Conversion",
keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
keyword %in% "JSF" ~ "Joint Strike Fighter",
keyword %in% "MAV" ~ "Micro Air Vehicles",
keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% "ASIC" ~ "application specific integrated circuit",
keyword %in% "CMC" ~ "Ceramic Matrix Composites",
keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
keyword %in% "MBE" ~ "the molecular beam epitaxy",
keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
keyword %in% "SAR" ~ "Synthetic Aperture Radar",
keyword %in% "JTRS" ~ "Joint Tactical Radio System",
keyword %in% "ASW" ~ "Antisubmarine Warfare",
keyword %in% "FLP" ~ "focal plane array",
keyword == "JP" ~ "Joint Publication",
keyword == "LWIR" ~ "Long Wavelength Infrared",
keyword %in% c("LADAR", "LIDAR") ~ "LAser Detection And Ranging",
TRUE ~ keyword
),
keyword = keyword %>% str_to_upper() %>% str_remove_all("\\?")
) %>%
filter(!keyword %>% str_detect("ORG: |E MAIL: ")) %>%
filter(nchar(keyword) > 1)
df_sbir <-
df_sbir %>%
mutate(
count_total = nchar(keyword),
count_bad = keyword %>% str_count("[0-9]|\\/|\\ |\\-")
) %>%
filter(count_bad != count_total) %>%
select(-c(count_total, count_bad)) %>%
rename(keywordsTopicSBIR = keyword)
data <-
data %>%
select(-keywordsTopicSBIR) %>%
left_join(
df_sbir %>%
group_by(row) %>%
summarise(
keywordsTopicSBIR = unique(keywordsTopicSBIR) %>% sort() %>% str_c(collapse = " | "),
countKeywordsTopic = n()
),
by = "row"
)
df_sbir <-
data %>%
select(row, keywordsSBIRCompany) %>%
separate_rows(keywordsSBIRCompany, convert = T, sep = "\\,|\\;|^&|[0-9]. |\\ & ") %>%
separate_rows(keywordsSBIRCompany, convert = T, sep = "\\ [0-9].") %>%
separate_rows(keywordsSBIRCompany, sep = "CONTACT:|POC:|TECHNICAL POINT OF CONTACT|PHONE:|TITLE:|PROGNOSTICS:|KEYWORDS: |TOPIC WRITERS:|ND:", convert = T) %>%
mutate_if(is.character, str_squish) %>%
filter(keywordsSBIRCompany != "") %>%
filter(!keywordsSBIRCompany %>% str_detect("FAX:|EMAIL:|^:|E MAIL")) %>%
rename(keyword = keywordsSBIRCompany) %>%
mutate(
keyword = keyword %>% str_squish() %>% str_remove_all("\\.|\u001a|\\(|\\)") %>%
str_replace_all("\\-", " ") %>% str_replace_all("3 D", "3D")
) %>%
filter(!keyword %in% c("", "AND")) %>%
filter(!keyword %>% str_detect("EMAIL|FAX| PHONE$")) %>%
mutate_if(is.character, str_squish) %>%
mutate_at("keyword",
function(x) {
case_when(
x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
x == "EW" ~ "ELECTRONIC WARFARE",
x == "MEMS" ~ "MICROELECTROMECHANICAL SYSTEMS",
x == "CFD" ~ "Computational fluid dynamics",
x == "HPM" ~ "HIGH POWER MICROWAVE",
x == "ML" ~ "MACHINE LEARNING",
x %in% c("RF") ~ "RADIO FREQUENCY",
x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
x %in% c("AR") ~ "AUGMENTED REALITY",
TRUE ~ x
)
}) %>%
filter(!keyword %in% c("", "\u001a\u001a")) %>%
mutate(
keyword = case_when(
keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% c("AR") ~ "AUGMENTED REALITY",
keyword %in% c("IOT") ~ "INTERNET OF THINGS",
keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
keyword %in% c("SIC") ~ "single silicon carbide",
keyword %in% c("IED") ~ "Improvised explosive device",
keyword %in% c("NDE") ~ "Non Destructive Evaluation",
keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
keyword %in% c("IMU") ~ "inertial measurement unit",
keyword %in% c("ATR") ~ "assisted target recognition",
keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
keyword %in% "SIGINT" ~ "Signals intelligence",
keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
keyword %in% "ROIC" ~ "Readout Integrated Circuit",
keyword %in% "MIMO" ~ "Multiple input multiple output",
keyword %in% "GPU" ~ "Graphical Processing Unit",
keyword %in% "HMD" ~ "head mounted displays",
keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
keyword %in% "DACS" ~ "Divert and Attitude Control System",
keyword %in% "COTS" ~ "Commercial Off The Shelf",
keyword %in% "LED" ~ "light emitting diode",
keyword %in% "SDR" ~ "Software Defined Radio",
keyword %in% "ADC" ~ "Analog to Digital Conversion",
keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
keyword %in% "JSF" ~ "Joint Strike Fighter",
keyword %in% "MAV" ~ "Micro Air Vehicles",
keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% "ASIC" ~ "application specific integrated circuit",
keyword %in% "CMC" ~ "Ceramic Matrix Composites",
keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
keyword %in% "MBE" ~ "the molecular beam epitaxy",
keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
keyword %in% "SAR" ~ "Synthetic Aperture Radar",
keyword %in% "JTRS" ~ "Joint Tactical Radio System",
keyword %in% "ASW" ~ "Antisubmarine Warfare",
keyword %in% "FLP" ~ "focal plane array",
keyword == "JP" ~ "Joint Publication",
keyword == "LWIR" ~ "Long Wavelength Infrared",
keyword %in% c("LADAR", "LIDAR") ~ "LAser Detection And Ranging",
TRUE ~ keyword
),
keyword = keyword %>% str_to_upper() %>% str_remove_all("\\?")
) %>%
filter(!keyword %>% str_detect("ORG: |E MAIL: ")) %>%
filter(nchar(keyword) > 1)
df_sbir <-
df_sbir %>%
mutate(
count_total = nchar(keyword),
count_bad = keyword %>% str_count("[0-9]|\\/|\\ |\\-")
) %>%
filter(count_bad != count_total) %>%
select(-c(count_total, count_bad)) %>%
rename(keywordsSBIRCompany = keyword)
data <-
data %>%
select(-keywordsSBIRCompany) %>%
left_join(
df_sbir %>%
group_by(row) %>%
summarise(
keywordsSBIRCompany = unique(keywordsSBIRCompany) %>% sort() %>% str_c(collapse = " | "),
countKeywordsSBIRCompany = n()
),
by = "row"
) %>%
select(-row)
}
if (clean_companies) {
data <-
data %>%
entities::refine_columns(entity_columns = "nameCompany")
}
data <- data %>%
rename(
groupRevenueCompany = amountRevenueCompanyPriorYear,
cageCompany = slugCageCompany,
addressStreet1Company = adressStreet1Company,
slugAcquisitionProgramName = slugAcquisitionrProgramName,
)
data <-
data %>%
mutate_at(c("pctWorkSTTR", "pctRevenueSBIR"),
list(function(x) {
x %>% as.numeric()
})) %>%
mutate_at(c("pctWorkSTTR", "pctRevenueSBIR"),
list(function(x) {
case_when(x == 0 ~ 0,
x <= 100 ~ x / 100,
x >= 3000 ~ x / 10000,
TRUE ~ x / 1000)
}))
data <-
data %>%
rename(locationCompany = addressFullCompany,
locationSTTR = addressFullSTTR)
data <-
data %>%
select(matches("^id[A-Z]|year|nameCompany|DUNS"), everything()) %>%
munge_lite(snake_names = snake_names, unformat = unformat)
data
}
# afwerx -------------------------------------------------------------------------------------------
## https://exporter.nih.gov/ExPORTER_Catalog.aspx
.sbir_afwerx_portfolio <-
function(resolve_for_duns = T,
only_open_topic = F,
include_naics = T,
include_psc = T,
include_business_types = T,
include_sba = T,
snake_names = F) {
options(warn = -1)
data <-
download_excel_file(url = "https://www.afwerx.af.mil/resources/afwerx-portfolio.xlsx", has_col_names = T)
data <-
data %>%
.munge_sbir_names()
if (data %>% hasName("idOffice")) {
data <- data %>% rename(groupTopic = idOffice)
}
if (data %>% hasName("keywordsTopicSBIR")) {
data <- data %>% rename(keywordsSBIRCompany = keywordsTopicSBIR)
}
data <-
data %>%
mutate(idPhase = case_when(idPhase %>% str_detect("1") ~ "PHASE I",
TRUE ~ "PHASE II"))
data <- data %>%
separate(
nameCompany,
sep = "\\(|D/B/A| DBA: | dba | DBA ",
into = c("nameCompany", "nameCompanyDBA"),
extra = "merge",
fill = "right"
) %>%
mutate(nameCompanyDBA = nameCompany %>% str_remove_all("\\)")) %>%
mutate_if(is.character, str_squish)
data <-
data %>%
separate(
descriptionTopic,
into = c('groupSolicitation', "nameSolicitation"),
sep = "\\(",
extra = "merge",
fill = "right"
) %>%
mutate(
nameSolicitation = nameSolicitation %>% str_remove_all("\\)"),
groupSolicitation = groupSolicitation %>% str_remove_all(" Phase 1| Phase 2|Phase I| P1| P2") %>% str_remove_all("\\.") %>% str_replace_all("^191", "AF191") %>%
str_replace_all("^182", "AF182") %>% str_replace_all("^183", "AF183")
) %>%
mutate_if(is.character, str_squish) %>%
separate(
groupSolicitation,
sep = "\\-",
remove = F,
into = c("idOffice", "idCohort")
) %>%
munge_data() %>%
mutate(
typeSolicitation = case_when(
nameSolicitation %>% str_detect("OPEN TOPIC") ~ "OPEN TOPIC",
nameSolicitation %>% str_detect("PITCH DAY") ~ "PITCH DAY",
TRUE ~ "OTHER"
)
) %>%
mutate(id = 1:n()) %>%
select(id, typeSolicitation, idProgram,
idPhase, everything())
data <- data %>%
mutate(nameCompany = nameCompany %>% str_remove_all("/") %>% str_trim())
data <- data %>%
entities::refine_columns(entity_columns = "nameCompany")
data <- data %>%
select(
id,
idProgram,
idPhase,
typeSolicitation,
nameSolicitation,
groupSolicitation,
idOffice,
idCohort,
nameCompanyClean,
everything()
)
data <- data %>%
mutate(nameCompanyClean = nameCompanyClean %>% str_remove_all("/") %>% str_trim())
tbl_keywords <-
data %>%
select(id, keywordsSBIRCompany) %>%
separate_rows(keywordsSBIRCompany, sep = "\\,|u001a|\\: |\u001a ") %>%
mutate_if(is.character, str_trim) %>%
mutate(keywordsSBIRCompany = keywordsSBIRCompany %>% stringi::stri_trans_general("Latin-ASCII")) %>%
mutate_at("keywordsSBIRCompany",
function(x) {
case_when(
x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
x == "EW" ~ "ELECTRONIC WARFARE",
x == "HPM" ~ "HIGH POWER MICROWAVE",
x == "ML" ~ "MACHINE LEARNING",
x %in% c("RF") ~ "RADIO FREQUENCY",
x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
x %in% c("AR") ~ "AUGMENTED REALITY",
TRUE ~ x
)
}) %>%
filter(!keywordsSBIRCompany %in% c("", "\u001a\u001a")) %>%
mutate(keyword = keywordsSBIRCompany) %>%
mutate(
keyword = case_when(
keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% c("AR") ~ "AUGMENTED REALITY",
keyword %in% c("IOT") ~ "INTERNET OF THINGS",
keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
keyword %in% c("SIC") ~ "single silicon carbide",
keyword %in% c("IED") ~ "Improvised explosive device",
keyword %in% c("NDE") ~ "Non Destructive Evaluation",
keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
keyword %in% c("IMU") ~ "inertial measurement unit",
keyword %in% c("ATR") ~ "assisted target recognition",
keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
keyword %in% "SIGINT" ~ "Signals intelligence",
keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
keyword %in% "ROIC" ~ "Readout Integrated Circuit",
keyword %in% "MIMO" ~ "Multiple input multiple output",
keyword %in% "GPU" ~ "Graphical Processing Unit",
keyword %in% "HMD" ~ "head mounted displays",
keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
keyword %in% "DACS" ~ "Divert and Attitude Control System",
keyword %in% "COTS" ~ "Commercial Off The Shelf",
keyword %in% "LED" ~ "light emitting diode",
keyword %in% "SDR" ~ "Software Defined Radio",
keyword %in% "ADC" ~ "Analog to Digital Conversion",
keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
keyword %in% "JSF" ~ "Joint Strike Fighter",
keyword %in% "MAV" ~ "Micro Air Vehicles",
keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
keyword %in% "ASIC" ~ "application specific integrated circuit",
keyword %in% "CMC" ~ "Ceramic Matrix Composites",
keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
keyword %in% "MBE" ~ "the molecular beam epitaxy",
keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
TRUE ~ keyword
),
keyword = str_to_upper(keyword)
)
tbl_keywords <-
tbl_keywords %>%
group_by(id) %>%
summarise(
keywordAFWERX = unique(keywordsSBIRCompany) %>% sort() %>% str_c(collapse = " | "),
keywordCompanyCleanAFWERX = unique(keyword) %>% sort() %>% str_c(collapse = " | ")
) %>%
ungroup()
data <-
data %>%
select(-keywordsSBIRCompany) %>%
left_join(tbl_keywords, by = 'id') %>%
distinct()
data <- data %>%
mutate(isOpenTopicAwardee = typeSolicitation %in% c("OPEN TOPIC") |
groupSolicitation %>% str_detect("X")) %>%
select(idProgram, isOpenTopicAwardee, everything())
data <- data %>%
mutate(
url = urlCompany %>% str_remove_all("http://") %>% str_remove_all("http:/") %>% str_remove_all("https://|https//|www.") %>% str_squish() %>% map_chr(URLencode),
urlCompany = str_c("https://", url, sep = "")
) %>%
select(-url)
data <- data %>%
mutate(urlCompany = case_when(
urlCompany %in% c("https://NA", "https://0") ~ NA_character_,
TRUE ~ urlCompany
)) %>%
mutate(
urlCompany = case_when(
urlCompany == "https://.planeenglish.net" ~ "https://planeenglish.net",
TRUE ~ urlCompany
)
)
if (data %>% hasName("idOffice")) {
data <-
data %>%
mutate(groupTopic = idOffice)
}
if (resolve_for_duns) {
df_resolved <-
afwerx_open_topic_duns %>%
filter(!is.na(nameCompanyClean))
data <-
data %>%
mutate(id = 1:n())
df_matches <-
seq_along(1:nrow(data)) %>%
map_dfr(function(x) {
companies <-
data %>% slice(x) %>% select(matches("nameCompany")) %>%
gather(item, value) %>%
distinct(value) %>%
filter(!is.na(value)) %>%
pull() %>%
str_c(collapse = "|")
glue("Resolving {companies} for DUNS and registration data") %>% message()
d <- df_resolved %>%
filter(nameCompanyClean %>% str_detect(companies))
if (nrow(d) == 0) {
return(tibble(id = x))
}
d %>%
select(idDUNS) %>%
mutate(id = x)
})
df_matches <-
df_matches %>%
group_by(id) %>%
mutate(count = n_distinct(idDUNS)) %>%
ungroup() %>%
filter(count == 1) %>%
filter(!is.na(idDUNS)) %>%
select(id, idDUNS)
data <-
data %>%
select(-matches("idDUNS")) %>%
left_join(df_matches, by = "id") %>%
select(id:idCohort, idDUNS, everything())
df_sam <- bulk_sam_entities()
df_sam <-
df_sam %>%
entities::refine_columns(entity_columns = "nameCompanyLegal")
df_missing <-
data %>%
filter(is.na(idDUNS)) %>%
select(id, nameCompanyClean)
df_new <-
df_missing$id %>%
map_dfr(function(x) {
clean <-
data %>%
filter(id == x) %>% select(nameCompanyClean) %>%
pull()
companies <-
data %>%
filter(id == x) %>%
select(matches("nameCompany")) %>%
gather(item, value) %>%
filter(!is.na(value)) %>%
distinct(value) %>%
pull() %>%
str_c(collapse = "|^")
glue("Resolving {companies} for DUNS and registration data") %>% message()
search <- glue("^{companies}")
d <-
df_sam %>% filter(nameCompanyLegal == companies) %>%
select(matches("idDUNS|nameCompany|date"))
if (nrow(d) > 0) {
d <-
d %>%
filter(dateRegistrationInitial == max(dateRegistrationInitial)) %>%
select(idDUNS, nameCompanyLegalClean) %>%
mutate(id = x,
nameCompanyClean = clean)
return(d)
}
d <-
df_sam %>%
filter(nameCompanyLegal %>% str_detect(search)) %>%
select(idDUNS, nameCompanyLegalClean) %>%
mutate(id = x,
nameCompanyClean = clean)
if (nrow(d) == 0) {
return(tibble())
}
d
})
df_new <-
df_new %>%
group_by(id) %>%
mutate(count = n_distinct(idDUNS)) %>%
ungroup() %>%
filter(count <= 3) %>%
slice(1)
data <-
data %>%
select(-matches("idDUNS")) %>%
left_join(df_matches %>%
bind_rows(df_new %>% select(id, idDUNS)) %>%
distinct(),
by = "id") %>%
select(id:idCohort, idDUNS, everything())
df_details <-
data %>%
filter(!is.na(idDUNS)) %>%
distinct(idDUNS) %>%
left_join(df_sam %>% select(
matches(
"idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
)
) %>%
rename(urlCompanySAM = urlCompany),
by = "idDUNS")
remove_cols <-
names(
data)[names(data) %in% names(df_details)] %>%
discard(function(x){
x == "idDUNS"
})
data <-
data %>%
select(-one_of(remove_cols)) %>%
left_join(df_details %>% rename(nameCompanyDBASAM = nameCompanyDBA)
, by = "idDUNS") %>%
group_by(id) %>%
slice(1) %>%
ungroup()
}
if (data %>% hasName("nameCompanyClean")) {
data <- data %>%
mutate(
idDUNS = case_when(
nameCompanyClean %>% str_detect("CHESAPEAKE") ~ 124240222,
nameCompanyClean %>% str_detect("DVI GROUP") ~ 6317394,
nameCompanyClean %>% str_detect("SHOCKTECH") ~ 8307977,
nameCompanyClean %>% str_detect("THE DIFFERENCE A2C LLC") ~ 13750347,
nameCompanyClean %>% str_detect("RESERVOIR LABS") ~ 22423854,
nameCompanyClean %>% str_detect("ELDER RESEARCH") ~ 28211527,
nameCompanyClean %>% str_detect("PLEX SOLUTIONS") ~ 117504011,
nameCompanyClean %>% str_detect("WARRIOR CENTRIC") ~ 14217903,
nameCompanyClean %>% str_detect("ZEBULON SCIENCES") ~ 117342435,
nameCompanyClean %>% str_detect("YOU TUNE") ~ 117392260,
nameCompanyClean %>% str_detect("XPLORE INC") ~ 81358031,
nameCompanyClean %>% str_detect("AVILUTION") ~ 31980569,
nameCompanyClean %>% str_detect("ESPIN TECHNOLOGIES") ~ 36255094,
nameCompanyClean %>% str_detect("SPARKCOGNITION") ~ 41702227,
nameCompanyClean %>% str_detect("VS MERLOT INC") ~ 51540760,
nameCompanyClean %>% str_detect("FANTASY JOURNALIST") ~ 55329579,
nameCompanyClean %>% str_detect("SURFACE OPTICS") ~ 64390719,
nameCompanyClean %>% str_detect("ANALATOM") ~ 64744436,
nameCompanyClean %>% str_detect("FLITE ADVANTAGE") ~ 78299725,
nameCompanyClean %>% str_detect("VORTEX CONTROL") ~ 78718825,
nameCompanyClean %>% str_detect("RE3D") ~ 78732259,
nameCompanyClean %>% str_detect("NEARSPACE") ~ 78840927,
nameCompanyClean %>% str_detect("ONVECTOR") ~ 78856808,
nameCompanyClean %>% str_detect("BAZZE") ~ 79246108,
nameCompanyClean %>% str_detect("EVERALBUM") ~ 79404516,
nameCompanyClean %>% str_detect("FOCUS VENTURES") ~ 79440302,
nameCompanyClean %>% str_detect("BMNT PARTNERS") ~ 79444802,
nameCompanyClean %>% str_detect("SUDOTOUCH") ~ 79644764,
nameCompanyClean %>% str_detect("FIGURE INC") ~ 79686017,
nameCompanyClean %>% str_detect("CYBER 2020") ~ 80019157,
nameCompanyClean %>% str_detect("THOUGHT OBJECT") ~ 80033961,
nameCompanyClean %>% str_detect("INTENTIONET") ~ 80073865,
nameCompanyClean %>% str_detect("NANOVMS") ~ 80183826,
nameCompanyClean %>% str_detect("SAFE GROUP") ~ 80249783,
nameCompanyClean %>% str_detect("OPERANT NETWORKS") ~ 80269860,
nameCompanyClean %>% str_detect("JUST BOUNCE") ~ 80291548,
nameCompanyClean %>% str_detect("LOCOAL CHARCOAL") ~ 80305090,
nameCompanyClean %>% str_detect("ARIO LLC") ~ 80326845,
nameCompanyClean %>% str_detect("PUEO BUSINESS") ~ 80380333,
nameCompanyClean %>% str_detect("ASCEND INNOVATIONS") ~ 80413117,
nameCompanyClean %>% str_detect("CROWDBOTICS") ~ 80469320,
nameCompanyClean %>% str_detect("PLUGNIX") ~ 80504281,
nameCompanyClean %>% str_detect("ONCLAVE") ~ 80517762,
nameCompanyClean %>% str_detect("RAPID IMAGING") ~ 80665036,
nameCompanyClean %>% str_detect("RESCUE ROVER") ~ 80756456,
nameCompanyClean %>% str_detect("LLVR SYSTEMS") ~ 80788205,
nameCompanyClean %>% str_detect("BATTLE SIGHT") ~ 80809286,
nameCompanyClean %>% str_detect("DCODE") ~ 81117708,
nameCompanyClean %>% str_detect("CONSTELLATION NETWORK") ~ 81162812,
nameCompanyClean %>% str_detect("BRANDON") ~ 81296679,
nameCompanyClean %>% str_detect("UBIQUITILINK") ~ 94459684,
nameCompanyClean %>% str_detect("WWWCRIOTSOLUTIONSCOM") ~ 97211096,
nameCompanyClean %>% str_detect("ATC - NY") ~ 101321479,
nameCompanyClean %>% str_detect("HASHLIT") ~ 105377160,
nameCompanyClean %>% str_detect("QUICK MED") ~ 107955085,
nameCompanyClean %>% str_detect("OMITRON") ~ 108990649,
nameCompanyClean %>% str_detect("FUTURIST INSTITUTE") ~ 117073942,
nameCompanyClean %>% str_detect("BETA TECHNOLOGIES") ~ 117188780,
nameCompanyClean %>% str_detect("SOLUTE") ~ 117938311,
nameCompanyClean %>% str_detect("WANLYNX") ~ 118892723,
nameCompanyClean %>% str_detect("CHESAPEAKE") ~ 124240222,
nameCompanyClean %>% str_detect("CAROUSEL LABS LLC") ~ 117379887,
nameCompanyClean %>% str_detect("451 TECH") ~ 32132320,
nameCompanyClean %>% str_detect("LONGSHORTWAY") ~ 141877311,
nameCompanyClean %>% str_detect("WEBSEC") ~ 144541625,
nameCompanyClean %>% str_detect("ACREE") ~ 149397015,
nameCompanyClean %>% str_detect("ARCHIEMD") ~ 157649471,
nameCompanyClean %>% str_detect("ELPHEL") ~ 163315497,
nameCompanyClean %>% str_detect("EMAGINE") ~ 165636205,
nameCompanyClean %>% str_detect("FAAC INCORPORATED") ~ 175204163,
nameCompanyClean %>% str_detect("HIKINO") ~ 197593788,
nameCompanyClean %>% str_detect("TITUS HUMAN") ~ 557190209,
nameCompanyClean %>% str_detect("AVASCENT") ~ 791497550,
nameCompanyClean %>% str_detect("DIGIBEAM") ~ 798896564,
nameCompanyClean %>% str_detect("DANGELO TECHNOLOGIES") ~ 809577443,
nameCompanyClean %>% str_detect("STRATAGEM") ~ 831508903,
nameCompanyClean %>% str_detect("THE DISTI") ~ 884814930,
nameCompanyClean %>% str_detect("DARK WOLF SOLUTIONS") ~ 933645991,
nameCompanyClean %>% str_detect("ZANSORS") ~ 962122052,
nameCompanyClean %>% str_detect("MAINSTEM") ~ 963478297,
nameCompanyClean %>% str_detect("METIS DESIGN") ~ 963612291,
nameCompanyClean %>% str_detect("GENECAPTURE") ~ 964433838,
nameCompanyClean %>% str_detect("CLEANNG") ~ 967838967,
nameCompanyClean %>% str_detect("GIGAVATION") ~ 968402789,
nameCompanyClean %>% str_detect("KENIFIC") ~ 969270474,
TRUE ~ idDUNS
)
)
data <- data %>%
mutate(
idDUNS = case_when(
nameCompanyClean %>% str_detect("ACCESSWORKS") ~ 116812662,
nameCompanyClean %>% str_detect("ALPHA-1 AEROSPACE") ~ 117000188,
nameCompanyClean %>% str_detect("AMERICAN ECOTECH") ~ 91907811,
nameCompanyClean %>% str_detect("APPLIED DYNAMICS CORPORATION") ~ 177217189,
nameCompanyClean %>% str_detect("ARAGANTEAL") ~ 70346840,
nameCompanyClean %>% str_detect("ARCHITECTURE TECHNOLOGY") ~ 52062833,
nameCompanyClean %>% str_detect("ASSETAS") ~ 117340746,
nameCompanyClean %>% str_detect("ASTRO DIGITAL") ~ 21940207,
nameCompanyClean %>% str_detect("AUGMENTED TRAINING") ~ 117055781,
nameCompanyClean %>% str_detect("AUGMENTIR") ~ 81115414,
nameCompanyClean %>% str_detect("AVISARE") ~ 80138734,
nameCompanyClean %>% str_detect("AVOCADO") ~ 80168687,
nameCompanyClean %>% str_detect("BLIND TIGER") ~ 74580285,
nameCompanyClean %>% str_detect("BUBO LEARNING") ~ 80076103,
nameCompanyClean %>% str_detect("CAMGIAN MICROSYSTEMS") ~ 791134542,
nameCompanyClean %>% str_detect("CENTIL") ~ 80439080,
nameCompanyClean %>% str_detect("CESIUMASTRO") ~ 80537547,
nameCompanyClean %>% str_detect("COBALT SOLUTIONS") ~ 117394919,
nameCompanyClean %>% str_detect("CODER") ~ 80781662,
nameCompanyClean %>% str_detect("DAXOR") ~ 79725288,
nameCompanyClean %>% str_detect("DISCOVERY MACHINE") ~ 128516114,
nameCompanyClean %>% str_detect("DIVERSIFIED 3D") ~ 81469562,
nameCompanyClean %>% str_detect("FLIPPER") ~ 81107705,
nameCompanyClean %>% str_detect("GANTZ-MOUNTAIN") ~ 79381784,
nameCompanyClean %>% str_detect("GIGSTER") ~ 79830815,
nameCompanyClean %>% str_detect("GLOBAL AIR LOGISTICS") ~ 80243169,
nameCompanyClean %>% str_detect("GREEN REVOLUTION") ~ 828680186,
nameCompanyClean %>% str_detect("GUIDED PARTICLE") ~ 80190025,
nameCompanyClean %>% str_detect("HATCHBED") ~ 117150209,
nameCompanyClean %>% str_detect("HODLPAL") ~ 117346223,
nameCompanyClean %>% str_detect("INFEITER") ~ 117414624,
nameCompanyClean %>% str_detect("JOYLAB") ~ 79642134,
nameCompanyClean %>% str_detect("KANEY") ~ 603280723,
nameCompanyClean %>% str_detect("MALLINDA") ~ 79523497,
nameCompanyClean %>% str_detect("MICROSURGEONBOT") ~ 80945043,
nameCompanyClean %>% str_detect("NEXTGEN BALANCING") ~ 80579265,
nameCompanyClean %>% str_detect("OPTTEK") ~ 808711803,
nameCompanyClean %>% str_detect("ORIONS") ~ 78488412,
nameCompanyClean %>% str_detect("OUTERLINK") ~ 782809768,
nameCompanyClean %>% str_detect("OUTSIDE ANALYTICS") ~ 80880218,
nameCompanyClean %>% str_detect("P3 TECHNOLOGIES") ~ 80767240,
nameCompanyClean %>% str_detect("PARAGON ROBOTICS") ~ 806514985,
nameCompanyClean %>% str_detect("PARASANTI") ~ 43612924,
nameCompanyClean %>% str_detect("PHOSPHORUS") ~ 20017457,
nameCompanyClean %>% str_detect("PLANCK AEROSYSTEMS") ~ 79592990,
nameCompanyClean %>% str_detect("PROGRESSIVE TECHNOLOGY") ~ 836998120,
nameCompanyClean %>% str_detect("PTERODYNAMICS") ~ 80974858,
nameCompanyClean %>% str_detect("R-STOR") ~ 80666106,
nameCompanyClean %>% str_detect("RADARLOCK") ~ 54667383,
nameCompanyClean %>% str_detect("RDA PREP") ~ 117112463,
nameCompanyClean %>% str_detect("RE:3D INC") ~ 78732259,
nameCompanyClean %>% str_detect("RHEA SPACE") ~ 81318218,
nameCompanyClean %>% str_detect("SANGJEN") ~ 117305658,
nameCompanyClean %>% str_detect("SC2") ~ 122963112,
nameCompanyClean %>% str_detect("SENTECOR") ~ 78667179,
nameCompanyClean %>% str_detect("SIERRA PACIFIC") ~ 7032076,
nameCompanyClean %>% str_detect("SIGNAL FRAME") ~ 79669372,
nameCompanyClean %>% str_detect("SIMX") ~ 81046320,
nameCompanyClean %>% str_detect("SITCH AI") ~ 81117941,
nameCompanyClean %>% str_detect("SIX STAR SERVICES") ~ 116812953,
nameCompanyClean %>% str_detect("SOLIDDD") ~ 52833516,
nameCompanyClean %>% str_detect("SPARKCOGNTION") ~ 41702227,
nameCompanyClean %>% str_detect("SPECIAL OPERATIONS SUPPLY") ~ 81331701,
nameCompanyClean %>% str_detect("SPOTLIGHT LABS") ~ 557569626,
nameCompanyClean %>% str_detect("SUNSHOWERIO") ~ 81529416,
nameCompanyClean %>% str_detect("SUPERIOR FLOAT TANKS") ~ 79936042,
nameCompanyClean %>% str_detect("SYLABS") ~ 81320655,
nameCompanyClean %>% str_detect("TEMPLE MASSAGER") ~ 828723895,
nameCompanyClean %>% str_detect("TETRATEIO") ~ 97133609,
nameCompanyClean %>% str_detect("TEXAS RESEARCH INSTITUTE") ~ 625120902,
nameCompanyClean %>% str_detect("TGV ROCKETS") ~ 799713594,
nameCompanyClean %>% str_detect("INFORMATICS APPLICATIONS") ~ 65245750,
nameCompanyClean %>% str_detect("WHITE HOUSE PARTNERS") ~ 80131333,
nameCompanyClean %>% str_detect("THIRD INSIGHT") ~ 70916951,
nameCompanyClean %>% str_detect("THORTACTICAL") ~ 117057967,
nameCompanyClean %>% str_detect("TONICAI") ~ 111766248,
nameCompanyClean %>% str_detect("TRIFACTA") ~ 79275791,
nameCompanyClean == "VAN" ~ 614374643,
nameCompanyClean %>% str_detect("VERSTAAN") ~ 117400888,
nameCompanyClean %>% str_detect("VICTOR DAROLFI") ~ 117325916,
nameCompanyClean %>% str_detect("VIGILENT") ~ 78462380,
nameCompanyClean %>% str_detect("VITA INCLINATA") ~ 81135866,
nameCompanyClean %>% str_detect("XENESIS") ~ 81028906,
nameCompanyClean %>% str_detect("HNU PHOTONICS") ~ 788623473,
nameCompanyClean %>% str_detect("ALPHA RECON") ~ 79495048,
nameCompanyClean %>% str_detect("APPLIED DYNAMICS") ~ 76387984,
nameCompanyClean %>% str_detect("BEAR SYSTEMS") ~ 116902757,
nameCompanyClean %>% str_detect("ELEMENO HEALTH") ~ 80438162,
nameCompanyClean %>% str_detect("FOCUSMOTION") ~ 79440302,
nameCompanyClean %>% str_detect("GIGXR") ~ 117258144,
nameCompanyClean %>% str_detect("INTELESENSE TECHNOLOGIES") ~ 80921977,
nameCompanyClean %>% str_detect("MCP GOPV") ~ 22567908,
nameCompanyClean %>% str_detect("MEMCOMPUTNG INC") ~ 81259607,
nameCompanyClean %>% str_detect("NEIL FINEMAN") ~ 78825664,
nameCompanyClean %>% str_detect("PRAMOD") ~ 81263169,
nameCompanyClean %>% str_detect("SIGNALFRAME") ~ 79669372,
nameCompanyClean %>% str_detect("TAEKION") ~ 605505531,
TRUE ~ idDUNS
)
)
}
has_sam <- 'df_sam' %>% exists()
if (has_sam) {
df_missing_names <-
data %>%
filter(is.na(idDUNS)) %>%
select(nameCompanyClean) %>%
count(nameCompanyClean, sort = T) %>%
filter(!is.na(nameCompanyClean))
df_missing_names <-
df_missing_names %>%
count(nameCompanyClean, sort = T) %>%
filter(!is.na(nameCompanyClean))
df_missing_names <-
df_missing_names %>%
left_join(df_sam %>% select(
matches(
"idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
)
) %>%
mutate(nameCompanyClean = nameCompanyLegal),
by = "nameCompanyClean") %>%
select(-n)
df_matches <-
df_missing_names %>% filter(!is.na(dateSAMData)) %>%
group_by(idDUNS) %>%
filter(dateCompanyStart == max(dateCompanyStart)) %>%
ungroup()
if (nrow(df_matches) > 0) {
data <- data %>%
filter(nameCompanyClean %in% df_matches$nameCompanyClean) %>%
select(-idDUNS) %>%
left_join(df_matches %>% select(nameCompanyClean, idDUNS), by = "nameCompanyClean") %>%
bind_rows(data %>% filter(!nameCompanyClean %in% df_matches$nameCompanyClean)) %>%
group_by(id) %>%
slice(1) %>%
ungroup()
}
df_details <-
data %>%
filter(!is.na(idDUNS)) %>%
distinct(idDUNS) %>%
left_join(
df_sam %>% select(
matches(
"idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
)
) %>%
rename(urlCompanySAM = urlCompany,
nameCompanySAM = nameCompanyLegalClean),
by = "idDUNS"
)
remove_cols <-
names(
data)[names(data) %in% names(df_details)] %>%
discard(function(x){
x == "idDUNS"
})
data <-
data %>%
select(-one_of(remove_cols)) %>%
select(-one_of("nameCompanyDBASAM")) %>%
left_join(df_details %>% rename(nameCompanyDBASAM = nameCompanyDBA)
, by = "idDUNS")
data <- data %>%
group_by(id) %>%
slice(1) %>%
ungroup() %>%
select(-matches("urlFAR|urlDF"))
data <- data %>%
separate(
emailPointOfContactElectronicBusiness,
into = c("remove",
"urlPointOfContact"),
sep = "\\@",
remove = F
) %>%
select(-remove)
data <- data %>%
mutate(
urlPointOfContact = str_c("https://", urlPointOfContact, sep =
""),
urlCompany = case_when(
is.na(urlCompany) & !is.na(urlCompanySAM) ~ urlCompanySAM,
TRUE ~ urlCompanySAM
)
)
data <-
data %>%
mutate(
urlCompany = case_when(is.na(urlCompany) ~ urlPointOfContact,
TRUE ~ urlCompany),
urlCompany = case_when(
urlCompany %>% str_detect("gmail") ~ NA_character_,
TRUE ~ urlCompany
)
)
data <- data %>%
mutate(hasDUNS = !is.na(idDUNS))
}
matched_duns <-
data %>% filter(!is.na(idDUNS)) %>% distinct(idDUNS) %>% pull()
if (has_sam & include_naics) {
df_naics <-
df_sam %>%
filter(idDUNS %in% matched_duns) %>%
select(idDUNS, idNAICSPrimary,nameNAICSPrimary, nameIndustryGroupNAICS, countNAICS, countNAICSExceptions, dataNAICS)
if (snake_names) {
df_naics <- df_naics %>%
unnest() %>%
clean_names() %>%
group_by(
id_duns,
id_naics_primary,
name_naics_primary,
name_industry_group_naics,
is_primary_naics,
count_naics,
count_naics_exceptions
) %>%
nest() %>%
rename(data_naics = data)
data <-
data %>%
left_join(df_naics, by = c("idDUNS" = "id_duns")) %>%
mutate(has_naics = T)
} else {
data <- data %>%
left_join(df_naics, by = "idDUNS") %>%
mutate(hasNAICS = T)
}
data <- data %>%
select(-matches("^data"), everything())
}
if (has_sam & include_psc) {
df_psc <-
df_sam %>%
filter(idDUNS %in% matched_duns) %>%
select(idDUNS, countProductServiceCodes, dataProductServiceCodes)
if (snake_names) {
df_psc <-
df_psc %>%
unnest() %>%
clean_names() %>%
group_by(id_duns,
count_product_service_codes) %>%
nest() %>%
rename(data_psc = data)
data <-
data %>%
left_join(df_psc, by = c("idDUNS" = "id_duns")) %>%
mutate(has_product_service_codes = T)
} else {
data <-
data %>%
left_join(df_psc, by = "idDUNS") %>%
mutate(hasProductServiceCodes = T)
}
data <- data %>%
select(-matches("^data"), everything())
}
if (has_sam & include_business_types) {
df_bt <-
df_sam %>%
filter(idDUNS %in% matched_duns) %>%
select(idDUNS, countBusinessTypes, dataBusinessTypes)
if (snake_names) {
df_bt <-
df_bt %>%
unnest() %>%
clean_names() %>%
group_by(id_duns,
count_business_types) %>%
nest() %>%
rename(data_business_types = data)
data <-
data %>%
left_join(df_bt, by = c("idDUNS" = "id_duns")) %>%
mutate(has_business_types = T)
} else {
data <-
data %>%
left_join(df_bt, by = "idDUNS") %>%
mutate(hasBusinessTypes = T)
}
data <- data %>%
select(-matches("^data"), everything())
}
if (has_sam & include_sba) {
df_sba <-
df_sam %>%
filter(idDUNS %in% matched_duns) %>%
select(idDUNS, countSBATypes, dataSBA)
if (snake_names) {
df_sba <-
df_sba %>%
unnest() %>%
clean_names() %>%
group_by(id_duns,
count_sba_types) %>%
nest() %>%
rename(data_sba_types = data)
data <-
data %>%
left_join(df_bt, by = c("idDUNS" = "id_duns")) %>%
mutate(has_sba_types = T)
} else {
data <-
data %>%
left_join(df_sba, by = "idDUNS") %>%
mutate(hasSBATypes = T)
}
data <- data %>%
select(-matches("^data"), everything())
}
if (only_open_topic) {
data <- data %>% filter(isOpenTopicAwardee)
}
if (snake_names) {
data <- clean_names(data)
}
data
}
.sbir_afwerex_cached <-
memoise::memoise(function(snake_names = T, only_open_topic = F) {
data <- read_rda("https://asbcllc.com/r_packages/govtrackR/data/afwerx_portfolio.rda")
data <- data %>%
mutate_if(is.logical,
list(function(x){
case_when(is.na(x) ~ FALSE,
TRUE ~ x)
}))
if (only_open_topic) {
data <- data %>%
filter(isOpenTopicAwardee)
}
if (!snake_names) {
return(data)
}
data <- clean_names(data)
if (data %>% hasName("data_naics")) {
df_naics <- data %>%
select(id_duns, data_naics) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_naics = data)
data <- data %>%
select(-data_naics) %>%
left_join(df_naics, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (data %>% hasName("data_product_service_codes")) {
df_psc <- data %>%
select(id_duns, data_product_service_codes) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_product_service_codes = data)
data <- data %>%
select(-data_product_service_codes) %>%
left_join(df_psc, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (data %>% hasName("data_business_types")) {
df_bt <- data %>%
select(id_duns, data_business_types) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_business_types = data)
data <- data %>%
select(-data_business_types) %>%
left_join(df_bt, by = "id_duns") %>%
select(-matches("data"), everything())
}
if (data %>% hasName("data_sba")) {
df_sba <-
data %>%
select(id_duns, data_sba) %>%
unnest() %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
ungroup() %>%
rename(data_sba = data)
data <- data %>%
select(-data_sba) %>%
left_join(df_sba, by = "id_duns") %>%
select(-matches("data"), everything())
}
data
})
#' SBIR AFWERX Companies
#'
#' Acquires and resolves information about
#'
#' @param resolve_for_duns if \code{TRUE} resolves for DUNS and other missing information
#' @param snake_names
#' @param only_open_topic
#' @param use_cached
#' @param include_naics
#' @param include_psc
#' @param include_sba
#' @param include_business_types
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' sbir_afwerx_portfolio()
sbir_afwerx_portfolio <-
function(use_cached = F,
resolve_for_duns = T,
only_open_topic = F,
include_naics = F,
include_psc = F,
include_sba = T,
include_business_types = T,
snake_names = T) {
if (use_cached) {
data <- .sbir_afwerex_cached(snake_names = snake_names, only_open_topic = only_open_topic)
return(data)
}
.sbir_afwerx_portfolio(resolve_for_duns = resolve_for_duns,
only_open_topic = only_open_topic,
include_naics = include_naics,
include_psc = include_psc,
snake_names = snake_names,
include_business_types = include_business_types,
include_sba = include_sba)
}
#' Join DOD SBIR/STTR data to input tibble
#'
#' @param data input tibble to join dod data to
#' @param duns_column name of the duns column
#' @param dod_select_columns if not `NULL` subset of columns
#' from DOD API to join
#'
#' @return
#' @export
#'
#' @examples
tbl_dod_sbir_duns_data <-
function(data,
duns_column = NULL,
clean_contracts = T,
dod_select_columns = NULL) {
if (length(duns_column) == 0) {
"Enter DUNS column" %>% message()
return(invisible())
}
if (!data %>% hasName(duns_column)) {
glue("{duns_column} missing from data") %>% message()
return(invisible())
}
duns <-
data %>%
filter(!is.na(!!sym(duns_column))) %>%
distinct(!!sym(duns_column)) %>%
pull()
tbl_dod <-
dod_sbir_historic(snake_names = T) %>%
filter(id_duns %in% duns)
if (length(dod_select_columns) > 0) {
col_slugs <-
c("id_duns", dod_select_columns) %>% unique() %>% str_c(collapse = "|")
tbl_dod <- tbl_dod %>% select(matches(col_slugs))
}
tbl_dod <-
tbl_dod %>%
group_by(id_duns) %>%
mutate_if(is.numeric, as.numeric) %>%
nest() %>%
rename(UQ(duns_column) := id_duns, data_dod_sbirs = data) %>%
ungroup() %>%
mutate(has_dod_sbir = T,
count_dod_sbir = data_dod_sbirs %>% map_dbl(nrow)) %>%
select(-data_dod_sbirs, everything())
data <-
data %>%
left_join(tbl_dod, by = duns_column) %>%
mutate(has_dod_sbir = has_dod_sbir %>% coalesce(F))
data <- data %>% select(-matches("data_"), everything())
data
}
.tbl_sbir_awards <-
function(data,
only_open_topic = T,
sbir_columns = c("nameAward",
"descriptionAward"),
dod_text_columns = c(
"name_topic",
"name_award",
"description_award",
"description_narrative_agency",
"description_topic",
"description_topic_rationale",
"description_principal_benefit",
"topic_phase1",
"topic_phase2",
"topic_phase3"
),
join_sam_data = F,
snake_names = T
) {
tbl_afwerx <-
sbir_afwerx_portfolio(resolve_for_duns = T, snake_names = F, only_open_topic = only_open_topic)
matched_duns <- tbl_afwerx %>%
filter(!is.na(idDUNS)) %>%
distinct(idDUNS) %>%
pull()
tbl_afwerx <- tbl_afwerx %>%
clean_names()
df_sbir_historic <- sbirs_historic()
df_sbir_historic <-
df_sbir_historic %>%
filter(idDUNS %in% matched_duns) %>%
select(
idDUNS,
nameCompanySBIR = nameCompanyClean,
idContractResolved,
idSBIR,
dateAward,
amountContract,
nameAgency,
nameBranch,
one_of(sbir_columns)
)
df_sbir_historic <-
df_sbir_historic %>%
create_text_block(id_column = "idSBIR", text_columns = sbir_columns) %>%
select(-one_of(sbir_columns)) %>%
clean_names() %>%
group_by(id_duns) %>%
nest() %>%
rename(data_sbir = data) %>%
mutate(count_sbir_website = data_sbir %>% map_dbl(nrow),
has_sbirs_website = T) %>%
select(-matches("data"), everything()) %>%
ungroup()
df_dod_duns <-
dod_sbir_historic(snake_names = T) %>%
filter(id_duns %in% matched_duns)
df_dod_duns <-
df_dod_duns %>%
select(
group_solicitation,
id_duns,
id_contract_analysis,
date_award,
amount_contract,
keywords_sbir_company,
keywords_topic_sbir,
name_company_dod = name_company_clean,
one_of(dod_text_columns)
) %>%
mutate_if(
is.character,
list(function(x){
x %>% str_remove_all("\u001a") %>% str_squish()
})) %>%
create_text_block(id_column = "id_contract_analysis", text_columns = dod_text_columns) %>%
select(-matches("id_phase")) %>%
select(-one_of(dod_text_columns))
data_all_keywords <-
df_dod_duns %>%
select(id_duns, matches("keyword")) %>%
left_join(tbl_afwerx %>% select(id_duns, name_company_clean), by = "id_duns") %>%
gather(type, keyword, -c(id_duns, name_company_clean)) %>%
separate_rows(keyword, sep = "\\|") %>%
mutate(keyword = str_squish(keyword)) %>%
mutate(type = type %>% str_remove_all("keywords_")) %>%
bind_rows(
tbl_afwerx %>%
select(id_duns, name_company_clean, afwerx = keyword_company_clean_afwerx) %>%
gather(type, keyword, -c(id_duns, name_company_clean)) %>%
separate_rows(keyword, sep = "\\|") %>%
mutate(keyword = str_squish(keyword))
) %>%
distinct() %>%
group_by(id_duns, name_company_clean) %>%
nest() %>%
rename(data_keywords = data) %>%
ungroup() %>%
mutate(has_keywords = T,
count_keywords = data_keywords %>% map_dbl(nrow))
tbl_text_block <-
df_dod_duns %>%
select(id_duns, text = text_contract_analysis) %>%
bind_rows(
df_sbir_historic %>% unnest_legacy() %>% filter(!id_contract_resolved %in% df_dod_duns$id_contract_analysis) %>%
select(id_duns, text = text_sbir)
) %>%
left_join(tbl_afwerx %>% select(id_duns, name_company_clean), by = "id_duns") %>%
distinct() %>%
filter(!is.na(text)) %>%
group_by(id_duns, name_company_clean) %>%
summarise(text = str_c(text, collapse = " ")) %>%
ungroup() %>%
mutate(has_text_block = T)
df_dod_duns <- df_dod_duns %>%
group_by(id_duns) %>%
nest() %>%
rename(data_dod_awards = data) %>%
mutate(has_dod_sbir = T) %>%
ungroup()
tbl_afwerx <-
tbl_afwerx %>% group_by(id_duns, name_company_clean) %>% nest() %>%
rename(data_afwerx = data)
data <-
list(tbl_afwerx,
data_all_keywords,
tbl_text_block,
df_dod_duns,
df_sbir_historic) %>%
reduce(left_join) %>%
mutate_if(is.logical,
list(function(x){
case_when(is.na(x) ~ F,
TRUE ~ x)
})) %>%
ungroup()
data
}
#' Assembled SBIR data from source
#'
#' @param data
#' @param data_name
#' @param remove_columns
#' @param duns_column
#' @param exclude_dod_matches
#' @param sbir_id_column
#' @param sbir_select_columns
#' @param sbir_text_columns
#' @param dod_id_column
#' @param dod_select_columns
#' @param dod_text_columns
#' @param url_column
#' @param include_website_crux
#' @param crawl_websites
#' @param exclude_topic_keywords
#' @param exclude_custom_keywords
#' @param join_sam_data
#' @param stay_on_site
#' @param use_future
#' @param snake_names
#' @param assign_to_environment
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' df <-
#' govtrackR::tbl_sbir_assemble(data = sbir_afwerx_portfolio(use_cached = T, snake_names = T), data_name = "afwerx", crawl_websites = F, include_website_crux = T)
#' }
tbl_sbir_assemble <-
function(data,
data_name = NULL,
data_id = "id",
nest_original_data = T,
remove_columns = "keyword_afwerx",
duns_column = "id_duns",
exclude_dod_matches = T,
sbir_id_column = "idSBIR",
sbir_select_columns = c(
"idSBIR",
"idDUNS",
"dateAward",
"idAgencyTracking",
"idContractResolved",
"idSolicitation",
"nameAgency",
"nameBranch",
"amountContract",
"namePrincipal"
),
sbir_text_columns = c("nameAward",
"descriptionAward"),
dod_id_column = "id_contract_analysis",
dod_select_columns = c(
"id_duns",
"name_office",
"slug_agency",
"name_full_point_of_contact_sttr",
"group_solicitation",
"keywords_sbir_company",
"keywords_topic_sbir",
"date_award_dod",
"amount_contract",
"name_company_dod_clean",
"longitude_company",
"latitude_company",
"slug_acquisition_program_name",
"count_patents_company",
"count_employees_current"
),
dod_text_columns = c(
"name_topic",
"name_award",
"description_award",
"description_narrative_agency",
"description_topic",
"description_principal_benefit"
),
url_column = "url_company",
exclude_urls = c("google", "outlook", "yahoo", "protonmail"),
include_website_crux = T,
crawl_websites = F,
exclude_topic_keywords = F,
exclude_custom_keywords = F,
join_sam_data = F,
stay_on_site = T,
use_future = F,
snake_names = T,
assign_to_environment = T) {
if (length(remove_columns) > 0) {
data <-
data %>%
select(-one_of(remove_columns))
}
if (snake_names) {
data <- data %>% clean_names()
duns_column <- duns_column %>% make_clean_names()
}
if (join_sam_data) {
data <-
data %>% tbl_sam_data(duns_column = duns_column, snake_names = snake_names)
}
duns_to_match <- data %>%
filter(!is.na(!!sym(duns_column))) %>%
select(one_of(duns_column)) %>%
distinct() %>%
pull()
tbl_dod_sbirs <-
dod_sbir_historic(
snake_names = T,
create_text_description = F,
unformat = T
)
tbl_dod_sbirs <-
tbl_dod_sbirs %>%
rename(name_company_dod_clean = name_company_clean) %>%
rename(date_award_dod = date_award) %>%
filter(id_duns %in% duns_to_match) %>%
select(one_of(c(
dod_id_column, dod_select_columns, dod_text_columns
))) %>%
mutate_if(
is.character,
list(function(x){
x %>% str_remove_all("\u001a") %>% str_squish()
})) %>%
create_text_block(id_column = dod_id_column, text_columns = dod_text_columns
)
tbl_keywords <-
tbl_dod_sbirs %>%
select(one_of(duns_column, dod_id_column), matches("keyword")) %>% rename(id := UQ(dod_id_column)) %>%
gather(type_keyword, keyword, -c(duns_column, id)) %>%
mutate(type_keyword = type_keyword %>% str_remove_all("^keywords_|^keyword_"),
id = as.character(id)) %>%
separate_rows(keyword, sep = "\\|") %>%
mutate_if(is.character,
list(function(x){
x %>% str_squish()
}))
tbl_texts <-
tbl_dod_sbirs %>%
select(id_duns := UQ(duns_column),
id := UQ(dod_id_column),
text = text_contract_analysis) %>%
mutate(source = "DOD") %>%
select(source, everything())
if (assign_to_environment) {
assign('tbl_dod_sbirs', value = tbl_dod_sbirs, envir = .GlobalEnv)
}
if (exclude_topic_keywords) {
tbl_keywords <-
tbl_keywords %>%
filter(type_keyword != "topic_sbir")
}
key_col <- data %>% select(matches("keyword")) %>% names()
if (length(key_col) & !exclude_custom_keywords) {
df_new_keywords <-
data %>%
select(one_of(duns_column, data_id), matches("keyword")) %>%
rename(id := UQ(data_id)) %>%
gather(type_keyword, keyword, -c(duns_column, id)) %>%
mutate(type_keyword = type_keyword %>% str_remove_all("^keywords_|^keyword_"),
id = as.character(id)) %>%
separate_rows(keyword, sep = "\\|") %>%
mutate_if(is.character,
list(function(x){
x %>% str_squish()
}))
if (nrow(df_new_keywords) > 0) {
tbl_keywords <-
tbl_keywords %>%
bind_rows(df_new_keywords) %>%
arrange(!!sym(duns_column))
}
}
tbl_keywords <-
tbl_keywords %>% filter(!is.na(keyword)) %>%
filter(!is.na(!!sym(duns_column))) %>%
filter(keyword != "")
if (assign_to_environment & nrow(tbl_keywords) > 0) {
assign(x = "tbl_keywords_sbirs", tbl_keywords, envir = .GlobalEnv)
}
matched_dod_contracts <-
tbl_dod_sbirs %>% distinct(!!sym(dod_id_column)) %>%
pull()
tbl_sbirs <- sbirs_historic()
tbl_sbirs <-
tbl_sbirs %>%
filter(idDUNS %in% duns_to_match) %>%
select(one_of(sbir_id_column, sbir_select_columns, sbir_text_columns)) %>%
create_text_block(id_column = sbir_id_column, text_columns = sbir_text_columns)
if (length(matched_dod_contracts) > 0 &
exclude_dod_matches) {
if (tbl_sbirs %>% hasName("idContractResolved")) {
tbl_sbirs <- tbl_sbirs %>%
filter(!idContractResolved %in% matched_dod_contracts)
}
}
if (snake_names) {
tbl_sbirs <- tbl_sbirs %>% clean_names()
sbir_id_column <- make_clean_names(sbir_id_column)
}
if (nrow(tbl_dod_sbirs) > 0) {
text_col <- tbl_sbirs %>% select(matches("text_")) %>% names()
tbl_texts <- tbl_texts %>%
bind_rows(
tbl_sbirs %>%
select(
id_duns := UQ(duns_column),
id := UQ(sbir_id_column),
text := UQ(text_col)
) %>%
mutate(source = "SBA",
id = as.character(id)) %>%
select(source, everything())
)
}
if (assign_to_environment & nrow(tbl_sbirs) > 0) {
assign("tbl_sba_sbirs", tbl_sbirs, envir = .GlobalEnv)
}
if (data %>% hasName(url_column) & include_website_crux) {
df_urls <-
data %>%
select(one_of(duns_column, url_column)) %>%
filter(!is.na(!!sym(url_column))) %>%
distinct()
if (length(exclude_urls) > 0) {
df_urls <- df_urls %>%
filter(!(!!sym(url_column) %>% str_detect(str_c(
exclude_urls, collapse = "|"
))))
}
df_crux <-
tbl_crux(data = df_urls,
snake_names = snake_names,
url_column = url_column)
df_crux <- df_crux %>% munge_data(snake_names = snake_names)
df_crux <- df_crux %>%
mutate_if(
is.character,
list(function(x){
x %>% str_remove_all("\u001a|\U001A") %>% str_squish()
}))
tbl_texts <-
tbl_texts %>%
bind_rows(
df_crux %>%
filter(!is.na(description_site_text)) %>%
select(id_duns, id = url_company, text = description_site_text) %>%
mutate(source = "CRUX")
)
if (assign_to_environment & nrow(df_crux) > 0)
{
assign("tbl_crux_sbir", df_crux, envir = .GlobalEnv)
}
}
if (data %>% hasName(url_column) & crawl_websites) {
df_urls <-
data %>%
select(one_of(duns_column, url_column)) %>%
filter(!is.na(!!sym(url_column))) %>%
distinct()
if (length(exclude_urls) > 0) {
df_urls <- df_urls %>%
filter(!(!!sym(url_column) %>% str_detect(str_c(
exclude_urls, collapse = "|"
))))
}
df_crawled <-
sheldon::crawl_for_urls(
urls = df_urls %>% pull(url_column),
stay_on_site = stay_on_site,
use_future = use_future
)
df_crawled <- df_crawled %>%
filter(!is.na(url_reference))
df_crawled_text <-
sheldon::scrape_text(urls = df_crawled$url_reference)
df_crawled_text <-
df_crawled_text %>%
select(url, text_site = text) %>%
rename(url_reference = url) %>%
left_join(df_crawled) %>%
rename(UQ(url_column) := url) %>%
left_join(df_urls) %>%
fill(duns_column) %>%
filter(!is.na(!!sym(duns_column)))
df_site <-
df_crawled_text %>%
filter(!is.na(url_reference)) %>%
filter(!is.na(text_site)) %>%
select(id_duns, id = url_reference, text = text_site) %>%
mutate(source = "CRAWL")
tbl_texts <-
tbl_texts %>%
bind_rows(df_site)
if (assign_to_environment & nrow(df_crux) > 0) {
assign("tbl_crawled_sbirs", df_crawled_text, envir = .GlobalEnv)
}
}
tbl_texts <-
tbl_texts %>%
mutate(type = "company") %>%
select(type, everything())
if (assign_to_environment) {
assign("tbl_text_items_sbirs", tbl_texts, envir = .GlobalEnv)
tbl_all <-
tbl_texts %>%
group_by(!!sym(duns_column)) %>%
filter(!is.na(text)) %>%
summarise(text = text %>% str_c(collapse = " ")) %>%
mutate(type = "company") %>%
select(type, everything())
assign("tbl_text_all_sbirs", tbl_all, envir = .GlobalEnv)
}
if (nest_original_data) {
d <-
data %>%
group_by(!!sym(duns_column)) %>%
nest() %>%
ungroup()
if (length(data_name) > 0) {
d <- d %>%
rename(UQ(data_name) := data)
}
} else {
d <-
data %>% group_by(!!sym(duns_column)) %>% slice(1) %>%
ungroup()
}
tbl_sbirs <- tbl_sbirs %>%
group_by(!!sym(duns_column)) %>%
nest() %>%
rename(data_sbir_sba = data) %>%
ungroup() %>%
mutate(has_sbir_sba = T)
tbl_dod_sbirs <-
tbl_dod_sbirs %>%
group_by(!!sym(duns_column)) %>%
nest() %>%
rename(data_sbir_dod = data) %>%
ungroup() %>%
mutate(has_sbir_dod = T)
tbl_keywords <-
tbl_keywords %>%
group_by(!!sym(duns_column)) %>%
nest() %>%
rename(data_keywords = data) %>%
ungroup() %>%
mutate(has_keywords = T)
tbl_texts <-
tbl_texts %>%
group_by(!!sym(duns_column)) %>%
nest() %>%
rename(data_text = data) %>%
ungroup() %>%
mutate(has_text = T)
data <-
list(d, tbl_texts, tbl_keywords, tbl_sbirs,
tbl_dod_sbirs) %>%
reduce(left_join, by = duns_column)
if (include_website_crux) {
data <-
data %>%
left_join(
tbl_crux_sbir %>%
group_by(!!!syms(c(
duns_column, url_column
))) %>%
nest() %>%
rename(data_crux = data) %>%
ungroup()
)
}
if (crawl_websites) {
data <- data %>%
left_join(
df_crawled_text %>%
group_by(!!!syms(c(
duns_column, url_column
))) %>%
nest() %>%
rename(data_crawled_text = data) %>%
ungroup()
)
}
data <- data %>%
mutate_if(is.logical,
list(function(x){
case_when(is.na(x) ~ FALSE,
TRUE ~ x)
})) %>%
select(-matches("data"), everything())
data
}
#' Assign Nested data
#'
#' @param data a `tibble`
#' @param duns_column name of `DUNS` column
#' @param other_select_columns if not `NULL` other columns to unnest
#'
#' @return
#' @export
#'
#' @examples
assign_nested_sbir_data <-
function(data, duns_column = "id_duns", other_select_columns = NULL) {
tbl_classes <-
data %>% map_df(class) %>%
gather(column, class)
nested_cols <-
tbl_classes %>%
filter(class %>% str_detect("list|data")) %>%
pull(column)
nested_cols %>%
walk(function(x){
d <-
data %>%
select(one_of(duns_column, other_select_columns, x)) %>%
unnest() %>%
distinct()
if (x == "data_text") {
assign("tbl_text_items_sbirs", d, envir = .GlobalEnv)
tbl_all <-
d %>%
group_by(!!sym(duns_column)) %>%
filter(!is.na(text)) %>%
summarise(text = text %>% str_c(collapse = " ")) %>%
mutate(type = "company") %>%
select(type, everything())
assign("tbl_text_all_sbirs", tbl_all, envir = .GlobalEnv)
return(invisible())
}
if (x == "data_keywords") {
assign(x = "tbl_keywords_sbirs", value = d, envir = .GlobalEnv)
return(invisible())
}
if (x == "data_sbir_sba") {
assign("tbl_sba_sbirs", value = d, envir = .GlobalEnv)
return(invisible())
}
if (x == "data_sbir_dod") {
assign('tbl_dod_sbirs', value = d, envir = .GlobalEnv)
return(invisible())
}
if (x == "data_crux") {
assign("tbl_crux_sbir", d, envir = .GlobalEnv)
return(invisible())
}
table_name <- x %>% str_replace_all("data", "tbl")
assign(x = table_name, value = d, envir = .GlobalEnv)
})
return(invisible())
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.