R/sbir.R

Defines functions sbir_solicitations_api .munge_sbir_names dictionary_sbir_names .parse_soliciation_descriptions .distinct_sbir_cols .fix_bad_sbir .build_folder

Documented in dictionary_sbir_names sbir_solicitations_api

.build_folder <-
  function(path = "Desktop/abresler.github.io/trelliscopes/jinkie/otr/kaute") {
    oldwd <- getwd()
    setwd("~")

    folder_exists <-
      dir.exists(paths = path)

    if (folder_exists) {
      setwd(oldwd)
      return(invisible())
    }

    parts <- path %>% str_split("/") %>% flatten_chr()

    seq_along(parts) %>%
      map(function(x) {
        if (x == 1) {
          directory <- parts[x]
          if (!dir.exists(directory)) {
            dir.create(directory)
          }
          return(invisible())
        }
        directory <- parts[1:x] %>% str_c(collapse = '/')
        if (!dir.exists(directory)) {
          dir.create(directory)
        }
        return(invisible())
      })

    setwd(oldwd)
    return(invisible())
  }

.fix_bad_sbir <-
  function(data) {
    data <- data %>%
      mutate(
        idContract = case_when(
          idContract == "H9222217P0004" ~ "H9222218P0004",
          idContract == "W911SR17P0022" ~ "W911SR18P0022",
          TRUE ~ idContract
        )
      )

    if (data %>% hasName("idContractResolved")) {
      data <- data %>%
        mutate(
          idContractResolved = case_when(
            idContractResolved == "H9222217P0004" ~ "H9222218P0004",
            idContractResolved == "W911SR17P0022" ~ "W911SR18P0022",
            TRUE ~ idContractResolved
          )
        )
    }

    data
  }
.distinct_sbir_cols <-
  function(data) {
    cols <-
      tibble(column = names(data)) %>%
      mutate(idColumn = 1:n()) %>%
      group_by(column) %>%
      filter(idColumn == max(idColumn)) %>%
      ungroup() %>%
      pull(idColumn)

    data <-
      data[, cols]

    data
  }

.parse_soliciation_descriptions <-
  function(description) {
    tibble(description) %>%
      separate(
        col = "description",
        extra = "merge",
        into = c("item", "value"),
        sep = "\\: "
      ) %>%
      filter(!is.na(value)) %>%
      mutate()
  }

#' SBIR name dictionary
#'
#' @return
#' @export
#'
#' @examples
dictionary_sbir_names <-
  function() {
    tibble(
      nameSBIR = c(
        "Agency",
        "Branch",
        "Contract",
        "Agency Tracking Number",
        "Amount",
        "Phase",
        "Program",
        "Awards Year",
        "Solicitation Year",
        "Solicitation Topic Code",
        "Solicitation Number",
        "DUNS",
        "HUBZone Owned",
        "Woman Owned",
        "Socially and Economically Disadvantaged",
        "title",
        "duns",
        "address1",
        "address2",
        "city",
        "state",
        "zip",
        "hubzone_owned",
        "minority_owned",
        "woman_owned",
        "link",
        "abstract",
        "agency",
        "program",
        "phase",
        "year",
        "firm",
        "reseach institution",
        "description",
        "source_url",
        "close_date",
        "release_date",
        "its_field_open_date",
        "status",
        "Program | Phase | Year",
        "Solicitation",
        "Topic Number",
        "NOTE",
        "FY",
        "AGENCY",
        "FIRM NAME",
        "TOPIC NUMBER",
        "THIRD PARTY INVESTOR",
        "THIRD PARTY INVESTMENT",
        "NEW TOTAL PH2 AWARD AMOUNT",
        "wosb_flag",
        "sdb_flag",
        "topic_year",
        "sttr_ri_city",
        "ta_bio_medical_flag",
        "org_ph2_enhanced_sbir_proposals",
        "submission_id",
        "sttr_ri_name",
        "topic_background",
        "topic_description",
        "ta_chem_bio_defense_flag",
        "aw_program_type_name",
        "pi_benefit",
        "topic_renew_energy_flag",
        "award_end_date",
        "org_award_count_ph2",
        "topic_title",
        "org_award_count_ph1",
        "topic_aq_program_name",
        "sttr_ri_url",
        "sub_direct2_phase2_flag",
        "agency_name_abbr",
        "pi_equivalent_work_flag",
        "topic_phase2",
        "org_ph2_sttr_proposals",
        "topic_phase1",
        "ta_info_systems_flag",
        "topic_itar_restricted_flag",
        "topic_phase3",
        "org_zip_code",
        "aw_topic_number",
        "org_total_investment",
        "org_name",
        "org_state_name_abbr",
        "org_total_sales",
        "ph2_prev_awarded_proposal_number",
        "manufacturing_type_name",
        "id",
        "org_poc_fullname",
        "org_addr2",
        "award_amount",
        "org_addr1",
        "org_emp_count_current",
        "topic_keywords",
        "ta_air_platform_flag",
        "sub_vet_owned_flag",
        "aw_fiscal_year",
        "hub_zone_flag",
        "org_duns_number",
        "org_firstaward_yrph2",
        "sttr_ri_zip",
        "org_firstaward_yrph1",
        "award_flag",
        "topic_objective",
        "org_geolat",
        "org_sbir_proposals",
        "fed_facility_flag",
        "pi_technical_abstract",
        "ta_nuclear_flag",
        "sttr_ri_pct_of_work_firm",
        "sttr_ri_work_criteria_flag",
        "sttr_ri_addr2",
        "sttr_ri_addr1",
        "foreign_national_flag",
        "animal_research_flag",
        "num_of_employees",
        "co_name",
        "org_full_address",
        "pm_name",
        "org_ipo_flag",
        "small_business_flag",
        "aw_proposal_number",
        "org_cage_code",
        "org_revenue_last_year",
        "ta_ground_sea_flag",
        "org_website",
        "org_ph2_sbir_proposals",
        "org_geolong",
        "org_year_founded",
        "org_city",
        "ta_electronics_flag",
        "pi_equivalent_work_details",
        "sttr_ri_poc_fullname",
        "ta_weapons_flag",
        "org_patent_count",
        "ta_sensors_flag",
        "org_ceo_email",
        "org_ph2_enhanced_sttr_proposals",
        "solicitation_name",
        "org_percentage_revenue_from_sbir",
        "org_ceo_full_name",
        "org_sttr_proposals",
        "org_num_ccr",
        "submission_title",
        "org_geo_latlong",
        "ta_human_systems_flag",
        "fpds_vendor_id",
        "contract_number",
        "submission_type",
        "ta_battlespace_flag",
        "award_date",
        "topic_aqp_statement_of_interest",
        "topic_id",
        "corp_entity_flag",
        "org_total_award",
        "aw_select_flag",
        "ta_materials_flag",
        "org_emp_count_firstaward_ph2",
        "ta_space_platforms_flag",
        "pi_keywords",
        "topic_rationale",
        "ph2_prev_awarded_submission_id",
        "org_certify_name",
        "sdvosb_flag",
        "_version_",
        "score",
        "org_narrative",
        "award_comments",
        "sba_id",
        "aw_field_office",

        "sttr_ri_type",
        "sttr_ri_state_name_abbr",
        "sttr_ri_full_address",

        "SolicitationDropdown_String",
        "SolicitationPreReleaseDate_String",
        "SolicitationOpenDate_String",
        "SolicitationCloseDate_String",
        "SelectionsDueDate_String",
        "AwardsDueDate_String",
        "TopicStage1StartDate_String",
        "TopicStage2StartDate_String",
        "TopicStage3aStartDate_String",
        "TopicStage3bStartDate_String",
        "TopicStage3cStartDate_String",
        "TopicStage3dStartDate_String",
        "TopicStage4StartDate_String",
        "TopicStage5StartDate_String",
        "TopicStage6StartDate_String",
        "TopicStage7StartDate_String",
        "TopicStage8StartDate_String",
        "TopicStage1EndDate_String",
        "TopicStage2EndDate_String",
        "TopicStage3aEndDate_String",
        "TopicStage3bEndDate_String",
        "TopicStage3cEndDate_String",
        "TopicStage3dEndDate_String",
        "TopicStage4EndDate_String",
        "TopicStage5EndDate_String",
        "TopicStage6EndDate_String",
        "TopicStage7EndDate_String",
        "TopicStage8EndDate_String",
        "IsTopic_Stage1",
        "IsTopic_Stage2",
        "IsTopic_Stage3a",
        "IsTopic_Stage3b",
        "IsTopic_Stage3c",
        "IsTopic_Stage3d",
        "IsTopic_Stage4",
        "IsTopic_Stage5",
        "IsTopic_Stage6",
        "IsTopic_Stage7",
        "IsTopic_Stage8",
        "IsSolicitation_PreRelease",
        "NumberOfSecondsBeforeSolicitationClose",
        "IsSolicitation_Open",
        "IsSolicitation_Closed",
        "IsSolicitation_SelectionsDue",
        "IsSolicitation_AwardsDue",
        "IsSolicitation_SITUS",
        "IsSolicitation_SITIS_Admin_Open",
        "AreTopicsEditable_SBIRManager",
        "AreTopicsEditable_Component",
        "IsPmCertifiedUploadOpen",
        "IsComponentInstructionUploadOpen",
        "IsComponentTopicSubmissionOpen",
        "IsDirty",
        "SolicitationId",
        "FiscalYear",
        "ProgramType",
        "SolicitationName",
        "OldSolicitationId",
        "CreationDtime",
        "DeleteFlag",
        "HistoricalFlag",
        "PmViewLockFlag",
        "PmViewLockDtime",
        "SubmissionLockFlag",
        "SubmissionLockDtime",
        "SitisFlag",
        "TopicStage1StartDate",
        "TopicStage1EndDate",
        "TopicStage2StartDate",
        "TopicStage2EndDate",
        "TopicStage3aStartDate",
        "TopicStage3aEndDate",
        "TopicStage3bStartDate",
        "TopicStage3bEndDate",
        "TopicStage3cStartDate",
        "TopicStage3cEndDate",
        "TopicStage3dStartDate",
        "TopicStage3dEndDate",
        "TopicStage4StartDate",
        "TopicStage4EndDate",
        "TopicStage5StartDate",
        "TopicStage5EndDate",
        "TopicStage6StartDate",
        "TopicStage6EndDate",
        "TopicStage7StartDate",
        "TopicStage7EndDate",
        "TopicStage8StartDate",
        "TopicStage8EndDate",
        "SolicitationPreReleaseDate",
        "SolicitationOpenDate",
        "SolicitationCloseDate",
        "SelectionsDueDate",
        "AwardsDueDate",
        "OutOfCycle",
        "Solicitation:",
        "Award Year",
        "Award Start Date (Proposal Award Date)",
        "Award End Date (Contract End Date)",
        "Proposal Number",
        "Proposal Title",
        "Business Official",
        "Email",
        "Firm",
        "Award Amount",
        "State",
        "Mission Directorate",
        "Center",
        "Year",
        "TopicId",
        "TopicNumber",
        "TopicTitle",
        "AgencyId",
        "CommandId",
        "PublishedQuestionCount",
        "PublishedAnswerCount",
        "TechAreaAirPlatform",
        "TechAreaBattlespace",
        "TechAreaBioMedical",
        "TechAreaChemBioDefense",
        "TechAreaElectronics",
        "TechAreaGroundSea",
        "TechAreaHumanSystems",
        "TechAreaInfoSystems",
        "TechAreaMaterials",
        "TechAreaNuclear",
        "TechAreaSensors",
        "TechAreaSpacePlatforms",
        "TechAreaWeapons",
        "SitisAdditionalInfo",
        "RT_Area_Microelectronics",
        "RT_Area_Cybersecurity_EW",
        "RT_Area_Quantum_Science",
        "RT_Area_Directed_Energy",
        "RT_Area_MachineLearning_AI",
        "AC_Fully_Networked_C3",
        "AC_Space",
        "AS_AOR_Autonomy",
        "AS_AOR_Hypersonic",
        "AS_AOR_Nuclear_Modernization",
        "RTAreaMicroelectronics",
        "RTAreaCybersecurityEW",
        "RTAreaQuantumScience",
        "RTAreaDirectedEnergy",
        "RTAreaMachineLearningAI",
        "ACFullyNetworkedC3",
        "ACSpace",
        "ASAORAutonomy",
        "ASAORHypersonic",
        "ASAORNuclearModernization",
        "AgencyName",
        "CommandName",
        "ProgramTypeName",
        "ShowTpocEmail",
        "ShowTpocName",
        "ShowTpocPhone",
        "urlDODSBIRTopicAPI",
        "ApPocEmail",
        "ApPocNameFirst",
        "ApPocNameLast",
        "ApProgramName",
        "ApStatementOfInterest",
        "Background",
        "Description",
        "DirectToPhase2",
        "ItarText",
        "Keywords",
        "LastUpdateDtime",
        "ManufacturingTypeString",
        "Objective",
        "Phase1",
        "Phase2",
        "Poc2Email",
        "Poc2NameFirst",
        "Poc2NameLast",
        "Poc2OfficeSymbol",
        "Poc3Email",
        "Poc3NameFirst",
        "Poc3NameLast",
        "Poc3OfficeSymbol",
        "Poc4Email",
        "Poc4NameFirst",
        "Poc4NameLast",
        "Poc4OfficeSymbol",
        "PocEmail",
        "PocNameFirst",
        "PocNameLast",
        "PocOfficeSymbol",
        "PriorTopicNumber",
        "ProgramTypeString",
        "Rationale",
        "SitusAdditionalInfo",
        "AwardCommandId",
        "CreationUserId",
        "DirectToPhaseIiFlag",
        "ImportFlag",
        "ItarRestrictFlag",
        "LastUpdateUserId",
        "ManufacturingType",
        "MeetTopicCriteriaFlag",
        "PrevApproveReviewFlag",
        "PrevSubmitToReviewFlag",
        "PriorSolicitationFlag",
        "PriorTopicId",
        "RenewableEnergyFlag",
        "TopicStatus",
        "ReferenceUploadFlag",
        "TopicReferenceId",
        "ReferenceIndex",
        "ReferenceText",
        "ReferenceCreateDtime",
        "PocPhoneInfo",

        "Topic",
        "SBIR or STTR?",
        "Company Name",
        "Company Website"

      ),
      nameActual = c(
        "nameAgency",
        "nameBranch",
        "idContract",
        "idAgencyTracking",
        "amountContract",
        "idPhase",
        "idProgram",
        "yearAward",
        "yearSolicitation",
        "groupSolicitation",
        "idSolicitation",
        "idDUNS",
        "isHUBZoneOwned",
        "isWomanOwned",
        "isSociallyAndEconomicallyDisadvantaged",
        "title",
        "slugDUNS",
        "addressStreet1",
        "addressStreet2",
        "city",
        "codeState",
        "zipcode",
        "isSBACertifiedHUBZone",
        "isMinorityOwned",
        "isWomanOwned",
        "urlSBIRAward",
        "descriptionAward",
        "nameAgency",
        "nameProgram",
        "idPhase",
        "yearAward",
        "nameCompany",
        "nameResearchInstitution",
        "descriptionSBIR",
        "urlSBIR",
        "dateClosed",
        "dateReleased",
        "dateFieldOpen",
        "statusSBIR",
        "programPhaseYear",
        "slugSolicitation",
        "slugTopic",
        "noteSolication",
        "yearFiscal",
        "slugAgency",
        "nameCompany",
        "slugTopic",
        "namesInvestorsThirdParty",
        "amountInvestmentThirdParty",
        "amountPhase2TotalNew",
        "isWomanOwned",
        "isSmallDisadvantagedBusiness",
        "yearTopic",
        "citySTTR",
        "isBioMedicalTopic",
        "isPhase2Enhanced",
        "idSubmission",
        "nameResearchInstitution",
        "backgroundTopic",
        "descriptionTopic",
        "isBioChemicalDefenseTopic",
        "typeProgram",
        "descriptionPrincipalBenefit",
        "isRenewableEnergy",
        "datetimeAwardEnd",
        "countCompanyPhase2Awards",
        "titleTopic",
        "countCompanyPhase1Awards",
        "slugAcquisitionProgramName",
        "urlSTTR",
        "isDirectToPhase2",
        "slugAgency",
        "hasEquivalentWorkFlag",
        "topicPhase2",
        "idPhase2STTRProposal",
        "topicPhase1",
        "isInformationSystemTopic",
        "isITARRestrictedTopic",
        "topicPhase3",
        "zipcodeCompany",
        "groupSolicitation",
        "amountInvestmentTotal",
        "nameCompany",
        "slugStateCompany",
        "amountRevenueCompany",
        "countPhase2Prior",
        "typeManufacturing",
        "id",
        "nameFullContact",
        "addressStreet2Company",
        "amountContract",
        "adressStreet1Company",
        "countEmployeesCurrent",
        "keywordsTopicSBIR",
        "isAirPlatformTopic",
        "isVetSmallDisadvantagedBusiness",
        "yearFiscalAwarwd",
        "isHubZoneCompany",
        "slugDUNS",
        "yearFirstPhase2Company",
        "zipcodeSTTR",
        "yearFirstPhase1Company",
        "isAwarded",
        "topicObjective",
        "latitudeCompany",
        "countSBIRProposals",
        "isFedFacility",
        "descriptionAward",
        "isNuclearTopic",
        "pctWorkSTTR",
        "hasRIWorkSTTR",
        "addressStreet2STTR",
        "addressStreet1STTR",
        "isForeignNational",
        "hasAnimalResearch",
        "countEmployees",
        "nameTitleBusinessContact",
        "addressFullCompany",
        "nameTitlePrincipalInvestigator",
        "hasIPO",
        "isSmallBusiness",
        "groupProposal",
        "slugCageCompany",
        "amountRevenueCompanyPriorYear",
        "isGroundSeaTopic",
        "urlCompany",
        "countSBIRPhase2ApplicationsCompany",
        "longitudeCompany",
        "yearCompanyFounded",
        "cityCompany",
        "isElectronicsTopic",
        "detailsEquivalentWork",
        "nameFullPointOfContactSTTR",
        "isWeaponsTopic",
        "countPatentsCompany",
        "isSensorsTopic",
        "emailCEO",
        "countPhase2EnhancedSTTRProposals",
        "nameSolicitation",
        "pctRevenueSBIR",
        "nameFullCEO",
        "countSTTRApplications",
        "idCCR",
        "titleSBIR",
        "companyLatLon",
        "isHumanSystemsTopic",
        "idVendorFPDS",
        "idContract",
        "idPhaseSBIR",
        "isBattleSpaceTopic",
        "datetimeAward",
        "topicAQPStatementOfInterest",
        "idTopic",
        "isCorporateEntity",
        "amountAwardsTotalCompany",
        "isAwardSelected",
        "isMaterialsTopic",
        "countEmployeesPhase2Initial",
        "isSpaceTopic",
        "keywordsSBIRCompany",
        "descriptionTopicRationale",
        "idSubmissionPhase2Prior",
        "nameCompanyCertifier",
        "isServiceDisabledOwnedSmallBusiness",
        "keyVersion",
        "scoreMatch",
        "descriptionNarrativeAgency",
        "commentsAward",
        "idSBA",
        "nameOffice",

        "idPhaseSTTR",
        "slugStateSTTR",
        "addressFullSTTR",
        "groupSolicitation",
        "dateSolicitationPreRelease",
        "dateSolicitationOpen",
        "dateSolicitationClose",
        "dateSelectionsDue",
        "dateAwardsDue",
        "dateTopicStage1Start",
        "dateTopicStage2Start",
        "dateTopicStage3aStart",
        "dateTopicStage3bStart",
        "dateTopicStage3cStart",
        "dateTopicStage3dStart",
        "dateTopicStage4Start",
        "dateTopicStage5Start",
        "dateTopicStage6Start",
        "dateTopicStage7Start",
        "dateTopicStage8Start",
        "dateTopicStage1End",
        "dateTopicStage2End",
        "dateTopicStage3aEnd",
        "dateTopicStage3bEnd",
        "dateTopicStage3cEnd",
        "dateTopicStage3dEnd",
        "dateTopicStage4End",
        "dateTopicStage5End",
        "dateTopicStage6End",
        "dateTopicStage7End",
        "dateTopicStage8End",
        "isTopicStage1",
        "isTopicStage2",
        "isTopicStage3a",
        "isTopicStage3b",
        "isTopicStage3c",
        "isTopicStage3d",
        "isTopicStage4",
        "isTopicStage5",
        "isTopicStage6",
        "isTopicStage7",
        "isTopicStage8",
        "isSolicitationPreRelease",
        "countSecondsBeforeSolicitationClose",
        "isSolicitationOpen",
        "isSolicitationClosed",
        "isSolicitationSelectionsDue",
        "isSolicitationAwardsDue",
        "isSolicitationSITUS",
        "isSolicitationSITISAdminOpen",
        "isTopicEditableSBIRManager",
        "isTopicEditableComponent",
        "isPmCertifiedUploadOpen",
        "isComponentInstructionUploadOpen",
        "isComponentTopicSubmissionOpen",
        "isDirty",
        "idSolicitation",
        "yearTopic",
        "typeProgram",
        "nameSolicitation",
        "idSolicitationOld",
        "datetimeCreated",
        "hasDeleteFlag",
        "hasHistoricalFlag",
        "hasPmViewLockFlag",
        "datetimeRemoveViewLock",
        "hasSubmissionLockFlag",
        "datetimeRemoveSubmissionLock",
        "hasSitisFlag",
        "TopicStage1StartDate",
        "TopicStage1EndDate",
        "TopicStage2StartDate",
        "TopicStage2EndDate",
        "TopicStage3aStartDate",
        "TopicStage3aEndDate",
        "TopicStage3bStartDate",
        "TopicStage3bEndDate",
        "TopicStage3cStartDate",
        "TopicStage3cEndDate",
        "TopicStage3dStartDate",
        "TopicStage3dEndDate",
        "TopicStage4StartDate",
        "TopicStage4EndDate",
        "TopicStage5StartDate",
        "TopicStage5EndDate",
        "TopicStage6StartDate",
        "TopicStage6EndDate",
        "TopicStage7StartDate",
        "TopicStage7EndDate",
        "TopicStage8StartDate",
        "TopicStage8EndDate",
        "SolicitationPreReleaseDate",
        "SolicitationOpenDate",
        "SolicitationCloseDate",
        "SelectionsDueDate",
        "AwardsDueDate",
        "isOutOfCycle",
        "groupSolicitation",
        "yearAward",
        "dateAward",
        "dateAwardEnd",
        "idSolicitation",
        "nameAward",
        "namePrincipal",
        "emailPrincipal",
        "nameCompany",
        "amountContract",
        "nameState",
        "desciptionMissionDirectorate",
        "codeCenter",
        "yearAward",
        "idTopic",
        "idSolicitation",
        "nameAward",
        "idAgencySBIR",
        "idCommandSBIR",
        "countPublishedQuestion",
        "countPublishedAnswer",
        "isTechAreaAirPlatform",
        "isTechAreaBattlespace",
        "isTechAreaBioMedical",
        "isTechAreaChemBioDefense",
        "isTechAreaElectronics",
        "isTechAreaGroundSea",
        "isTechAreaHumanSystems",
        "isTechAreaInfoSystems",
        "isTechAreaMaterials",
        "isTechAreaNuclear",
        "isTechAreaSensors",
        "isTechAreaSpacePlatforms",
        "isTechAreaWeapons",
        "descriptionSitisAdditionalInfo",
        "isRestrictedAreaMicroelectronics",
        "isRestrictedAreaCybersecurity",
        "isRestrictedAreaQuantumScience",
        "isRestrictedAreaDirectedEnergy",
        "isRestrictedAreaMachineLearning",
        "isACFullyNetworkedC3",
        "isACSpace",
        "isASAORAutonomy",
        "isASAORHypersonic",
        "isASAORNuclearModernization",
        "isRTAreaMicroelectronics",
        "isRTAreaCybersecurityEW",
        "isRTAreaQuantumScience",
        "isRTAreaDirectedEnergy",
        "isRTAreaMachineLearningAI",
        "isACFullyNetworkedC3",
        "isACSpace",
        "isASAORAutonomy",
        "isASAORHypersonic",
        "isASAORNuclearModernization",
        "nameAgency",
        "nameCommand",
        "typeProgram",
        "hasTPOCEmail",
        "hasTPOCName",
        "hasTPOCNPhone",
        "urlDODSBIRTopicAPI",
        "emailContactAP",
        "nameFirstAP",
        "nameLastAP",
        "nameProgramAP",
        "descriptionStatementOfInterestAP",
        "backgroundTopic",
        "descriptionTopic",
        "descriptionDirectToPhase2",
        "descriptionITAR",
        "keywordsTopicSBIR",
        "datetimeLastUpdated",
        "typeManufacturing",
        "topicObjective",
        "topicPhase1",
        "topicPhase2",
        "emailContact2",
        "nameFirstContact2",
        "nameLastContact2",
        "slugOfficeContact2",
        "emailContact3",
        "nameFirstContact3",
        "nameLastContact3",
        "slugOfficeContact3",
        "emailContact4",
        "nameFirstContact4",
        "nameLastContact4",
        "slugOfficeContact4",
        "emailContact",
        "nameFirstContact",
        "nameLastContact",
        "slugOfficeContact",
        "idSolicitationPrior",
        "typeProgramString",
        "topicRationale",
        "descriptionSitisAdditionalInfo",
        "idCommandAward",
        "idUserCreation",
        "hasDirectToPhase2",
        "hasImportFlag",
        "isITARRestrictedTopic",
        "idUserLastUpdated",
        "idManufacturing",
        "hasMeetTopicCriteriaFlag",
        "hasPrevApproveReviewFlag",
        "hasPrevSubmitToReviewFlag",
        "hasPriorSolicitationFlag",
        "idTopicPrior",
        "hasRenewableEnergyFlag",
        "idTopicStatus",
        "hasReferenceUploadFlag",
        "idTopicReference",
        "numberReference",
        "descriptionReference",
        "datetimeReferenceCreated",
        "telephoneContact",

        "descriptionTopic",
        "idProgram",
        "nameCompany",
        "urlCompany"
      )

    )
  }

.munge_sbir_names <-
  function(data) {
    dict_names <- dictionary_sbir_names()
    fdps_names <-
      names(data)

    actual_names <-
      fdps_names %>%
      map_chr(function(name) {
        df_row <-
          dict_names %>% filter(nameSBIR == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }

# api

#' SBIR Solicitations
#'
#'
#'
#' @param only_open if \code{TRUE} only active solicitations
#'
#' @return
#' @export
#'
#' @examples
#' sbir_solicitations_api()
sbir_solicitations_api <-
  function(only_open = F)  {
    data <-
      "https://www.sbir.gov/api/solicitations.json?keyword=sbir" %>%
      fromJSON(simplifyDataFrame = T) %>%
      as_tibble() %>%
      .munge_sbir_names() %>%
      rename(nameSolicitation = title) %>%
      mutate_if(is.character, list(function(x) {
        x %>% stringi::stri_trans_general("Latin-ASCII")  %>% str_squish() %>%
          str_remove_all(" &amp; ") %>%
          str_to_upper() %>% gsub("\\s+", " ", .)
      }))



    data <-
      data %>%
      mutate(
        idSBIR = urlSBIRAward %>% str_to_lower() %>%
          str_remove_all("https://www.sbir.gov/sbirsearch/detail/") %>% as.integer()
      )

    data <-
      data %>% mutate_at(c("urlSBIRAward", "urlSBIR"), str_to_lower)

    if (data %>% hasName("nameAgency")) {
      df_agency <-
        data %>% select(idSBIR, nameAgency) %>% mutate(hasAgency = nameAgency %>% map_dbl(length) > 0) %>% filter(hasAgency) %>% unnest() %>% group_by(idSBIR) %>% summarise(nameAgencies = str_c(nameAgency, collapse =  " | "))

      data <-
        data %>% select(-nameAgency) %>%
        left_join(df_agency, by = "idSBIR") %>%
        select(nameAgencies, everything())
    }

    date_cols <- data %>% select(matches("date")) %>% names()

    if (length(date_cols) > 0) {
      data <- data %>%
        mutate_at(date_cols,
                  list(function(x) {
                    as.POSIXct(x, origin = "1970-01-01", tz = "UTC") %>% as.Date()
                  }))
    }

    data <-
      data %>%
      .munge_data()

    data <-
      data %>%
      mutate(isOpen = statusSBIR == "OPEN")

    if (only_open) {
      data <-
        data %>%
        filter(isOpen)
    }

    data
  }


#' SBIR Funded Companies
#'
#' @param return_message if \code{TRUE}  returns message
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' sbir_companies()
sbir_companies <-
  function(return_message = T) {
    data <-
      "https://www.sbir.gov/api/firm.json" %>%
      jsonlite::fromJSON(simplifyDataFrame = T) %>% as_tibble()

    data <- data %>%
      set_names(
        c(
          "nameCompany",
          "slugDUNS",
          "addressStreet1",
          "addressStreet2",
          "city",
          "codeState",
          "zipcode",
          "isSBACertifiedHUBZone",
          "isMinorityOwned",
          "isWomanOwned"
        )
      ) %>%
      mutate(zipcode = ifelse(zipcode == "-", "", zipcode))

    data <-
      data %>%
      unite(addressStreet,
            addressStreet1,
            addressStreet2,
            sep = " ",
            remove = F) %>%
      unite(cityState,
            city,
            codeState,
            sep = ", ",
            remove = F) %>%
      unite(cityStateZip,
            cityState,
            zipcode,
            sep = " ",
            remove = F) %>%
      unite(locationCompany,
            addressStreet,
            cityStateZip,
            sep = ", ") %>%
      mutate(locationCompany = str_squish(locationCompany) %>% str_replace_all("\\ , ", "\\, ")) %>%
      mutate_if(is.character, list(function(x) {
        ifelse(x == "", NA, x)
      })) %>%
      mutate_all(list(function(x) {
        x %>% stringi::stri_trans_general("Latin-ASCII")  %>% str_squish() %>%
          str_remove_all(" &amp; ") %>%
          str_to_upper() %>% gsub("\\s+", " ", .)
      })) %>%
      .munge_data(clean_address = F)

    data <-
      data %>%
      group_by(idDUNS, nameCompany) %>%
      dplyr::slice(1) %>%
      ungroup() %>%
      select(idDUNS, nameCompany, everything()) %>%
      mutate(idDUNS = case_when(nchar(idDUNS) <= 3 ~ NA_real_,
                                TRUE ~ idDUNS))

    if (return_message) {
      glue("Acquired {nrow(data) %>% formattable::comma(digits = 0)} SBIR funded companies") %>% message()
    }

    data
  }
.parse_sbir_firm_json <-
  function(url = "https://www.sbir.gov/api/firm.json?state=VA") {
    data <- url %>% fromJSON(simplifyDataFrame = T) %>% as_tibble() %>%
      .munge_sbir_names()

    data <- data %>%
      mutate_if(is.character,
                list(function(x) {
                  x %>% stringi::stri_trans_general("Latin-ASCII")  %>% str_squish() %>%
                    str_remove_all(" &amp; ") %>%
                    str_to_upper() %>% gsub("\\s+", " ", .)
                })) %>%
      unite(
        locationCompany,
        addressStreet1,
        addressStreet2,
        city,
        codeState,
        zipcode,
        sep = " ",
        remove = F
      ) %>%
      .munge_data(clean_address = F)
  }

.parse_sbir_agency_json <-
  function(url = "https://www.sbir.gov/api/awards.json?keyword=mars&agency=DOE") {
    data <-
      url %>% fromJSON(simplifyDataFrame = T) %>%
      as_tibble() %>% .munge_sbir_names()
    if (data %>% hasName("title")) {
      data <-
        data %>%
        rename(nameSBIR = title)
    }

    data <-
      data %>%
      mutate(
        idSBIR = urlSBIRAward %>% str_remove_all("https://www.sbir.gov/sbirsearch/detail/") %>% as.integer()
      )

    if (data %>% hasName("nameAgency")) {
      df_agency <-
        data %>%
        select(idSBIR, nameAgency) %>%
        unnest() %>%
        group_by(idSBIR) %>%
        summarise(nameAgency = nameAgency %>% str_c(collapse = " | ")) %>%
        ungroup()

      data <-
        data %>% select(-nameAgency) %>%
        left_join(df_agency, by = "idSBIR")
    }

    date_cols <- data %>% select(matches("date")) %>% names()

    if (length(date_cols) > 0) {
      data <- data %>%
        mutate_at(date_cols,
                  list(function(x) {
                    as.POSIXct(x, origin = "1970-01-01", tz = "UTC") %>% as.Date()
                  }))
    }

    data <-
      data %>%
      .munge_data() %>%
      mutate(urlSBIRAPI = url)

    data
  }


#' SBIR Award count
#'
#' Returns SBIR award counts for
#' each SBIR company
#'
#' @return
#' @export
#'
#' @examples
#' sbir_company_award_count()
sbir_company_award_count <-
  function() {
    url <-
      "https://www.sbir.gov/sbirsearch/firm/all?fDOTtitle=&duns=&city=&zip=&page=1&print=xls&per_page=5000000"
    data <-
      download_excel_file(url = url, has_col_names = T) %>%
      set_names(
        c(
          "rankCompany",
          "nameCompany",
          "slugDUNS",
          "countAwards",
          "addressStreet1",
          "addressStreet2",
          "city",
          "codeState",
          "zipcode",
          "urlCompany"
        )
      )

    data <- data %>%
      mutate(zipcode = ifelse(zipcode == "-", "", zipcode)) %>%
      mutate_if(is.character, list(function(x) {
        ifelse(is.na(x), "", x)
      }))

    data <-
      data %>%
      unite(addressStreet,
            addressStreet1,
            addressStreet2,
            sep = " ",
            remove = F) %>%
      unite(cityState,
            city,
            codeState,
            sep = ", ",
            remove = F) %>%
      unite(cityStateZip,
            cityState,
            zipcode,
            sep = " ",
            remove = F) %>%
      unite(locationCompany,
            addressStreet,
            cityStateZip,
            sep = ", ") %>%
      mutate(locationCompany = str_squish(locationCompany) %>% str_replace_all("\\ , ", "\\, ")) %>%
      mutate_if(is.character, list(function(x) {
        ifelse(x == "", NA, x)
      })) %>%
      mutate_all(list(function(x) {
        x %>% stringi::stri_trans_general("Latin-ASCII")  %>% str_squish() %>%
          str_remove_all(" &amp; ") %>%
          str_to_upper() %>% gsub("\\s+", " ", .)
      })) %>%
      .munge_data(clean_address = F)

    data <-
      data %>%
      select(nameCompany, idDUNS, everything()) %>%
      arrange(desc(countAwards)) %>%
      select(-one_of("rankCompany")) %>%
      mutate(countAwards = countAwards %>% comma(digits = 0))

    data <-
      data %>%
      separate(
        nameCompany,
        extra = "merge",
        into = c("nameCompany", "nameCompanyDBA"),
        sep = " DBA |\\(DBA| DBA,| DBA$|DBA: \\.DBA"
      ) %>%
      separate(nameCompany,
               into = c("nameCompany", "nameCompanyPrior"),
               sep = "FORMERLY") %>%
      mutate_at(c("nameCompany", "nameCompanyPrior", "nameCompanyDBA"),
                list(function(x) {
                  x %>%
                    gsub("\\(", " ", .) %>%
                    gsub("\\)", " ", .) %>%
                    str_remove_all("\\,$") %>%
                    str_squish()
                })) %>%
      select(idDUNS,
             nameCompany,
             nameCompanyDBA,
             nameCompanyPrior,
             everything())

    data <- data %>%
      mutate(idDUNS = case_when(nchar(idDUNS) <= 3 ~ NA_real_,
                                TRUE ~ idDUNS))

    data <- data %>%
      select(-one_of("cityState"))

    data

  }

#' SBIR Annual awards
#'
#' @return
#' @export
#'
#' @examples
sbir_annual_awards <-
  memoise::memoise(function() {
    data <-
      "https://www.sbir.gov/analytics-dashboard/xls?view_by=Year&xls_table=year&dataid=SbirAnalyticDashboardSqlYear" %>%
      download_excel_file(has_col_names = F)
    data <-
      data %>% dplyr::slice(3:nrow(data)) %>%
      set_names(c(
        "year",
        "countAwards",
        "countFirms",
        "amountAwarded",
        "amountObligated"
      )) %>%
      .munge_data() %>%
      mutate(amountAwarded = ifelse(is.na(amountAwarded), amountObligated, amountAwarded)) %>%
      select(-amountObligated) %>%
      .munge_data()

    data
  })

#' SBIR Agency Awards
#'
#' @return
#' @export
#'
#' @examples
#' sbir_agency_awards()
sbir_agency_awards <-
  memoise::memoise(function() {
    data <-
      "https://www.sbir.gov/analytics-dashboard/xls?view_by=Year&xls_table=agency&dataid=SbirAnalyticDashboardSqlAgency" %>%
      download_excel_file(has_col_names = F)

    data <-
      data %>% dplyr::slice(3:nrow(data)) %>%
      set_names(c(
        "slugAgency",
        "countAwards",
        "countFirms",
        "amountAwarded",
        "amountObligated"
      )) %>%
      .munge_data() %>%
      mutate(amountObligated = ifelse(is.na(amountObligated), 0, amountAwarded)) %>%
      mutate(amountSpent = amountAwarded + amountObligated) %>%
      select(-c(amountAwarded, amountObligated))

    data

  })

#' SBIR obligations
#'
#' Data ends in 2016
#'
#' @return
#' @export
#'
#' @examples
#' sbir_obligations()
sbir_obligations <-
  memoise::memoise(function() {
    data <-
      "https://www.sbir.gov/awards/annual-reports/xls?xls_table=SBIR_obligation&dataid=SbirAnnualReportsSummarySqlYearSbir" %>% download_excel_file()
    data <- data %>% dplyr::slice(3:nrow(data)) %>%
      setNames(
        c(
          "year",
          "amountObligated",
          "amountBudget",
          "amountPhase1Obligation",
          "amountPhase2Obligation"
        )
      ) %>% .munge_data() %>%
      mutate(year = as.numeric(year))
    data
  })



.parse_sbir_award_url <-
  function(url = "https://www.sbir.gov/sbirsearch/detail/1547673",
           return_message = T) {
    if (return_message) {
      glue::glue("Parsing: {url}") %>% message()
    }


    page <- url %>% read_html()

    parts <- url %>% str_split("/") %>%
      flatten_chr()

    idSBIR <-
      parts[length(parts)] %>% as.numeric()

    labels <-
      page %>% html_nodes(".open-label") %>% html_text() %>% str_remove_all("\\:")
    values <-
      page %>% html_nodes(".open-description") %>% html_text()
    business <-
      page %>% html_nodes(".sbc-name-wrapper a") %>% html_text() %>% str_to_upper() %>% str_squish()
    sub_text <-
      page %>% html_nodes(".award-sub-description")
    timeline <-
      page %>% html_node(".timeline-info-wrapper")

    address_business <-
      page %>% html_nodes(".sbc-address-wrapper") %>% html_text() %>% str_squish()

    abstract <-
      page %>% html_nodes(".abstract-wrapper") %>% html_text() %>% str_split("\n") %>%
      flatten_chr() %>%
      str_squish() %>%
      purrr::discard(function(x) {
        x %in% c("", "Abstract")
      }) %>%
      str_c(collapse = "") %>%
      str_squish()

    solicitation <-
      page %>% html_nodes(".page-header") %>% html_text() %>% unique() %>% .[[1]] %>% str_remove_all("&amp;amp;amp;") %>% str_squish()

    if (length(abstract) == 0) {
      abstract <- solicitation
    }

    data <-
      tibble(nameSBIR = labels, value = values) %>%
      left_join(dictionary_sbir_names(), by = "nameSBIR")

    if (data %>% filter(is.na(nameActual)) %>% nrow() > 0) {
      missing <- data %>% filter(is.na(nameActual)) %>% pull(nameSBIR)
      missing %>%
        walk(function(word) {
          glue("Missing {word}") %>% message()
        })

      data <- data %>%
        mutate(nameActual = ifelse(is.na(nameActual), nameSBIR, nameActual))
    }

    data <- data %>%
      select(nameActual, value)

    col_order <- data$nameActual

    data <-
      data %>%
      spread(nameActual, value) %>%
      select(one_of(col_order), everything())

    data <-
      data %>%
      mutate_at(
        c("amountContract", "yearAward", "idDUNS", "yearSolicitation"),
        list(readr::parse_number)
      )


    logical_cols <- names(data)[names(data) %in% c(
      "isHUBZoneOwned",
      "isWomanOwned",
      "isSociallyandEconomicallyDisadvantaged"
    )]

    if (length(logical_cols) > 0) {
      data <-
        data %>%
        mutate_at(
          logical_cols,
          list(function(x) {
            ifelse(x == "Y", TRUE, FALSE)
          })
        )
    }



    df_contact <-
      seq_along(sub_text) %>%
      map(function(x) {
        if (x %in% c(1, 2)) {
          type <-
            case_when(x == 1 ~ "Principal",
                      TRUE ~ "BusinessContact")
          values <-
            sub_text[x] %>% html_text() %>% str_split("Name:|Phone:|Email:|\n|Title:") %>%
            flatten_chr() %>%
            str_squish()  %>%
            discard(function(x) {
              x == ""
            }) %>%
            str_remove_all("^&nbsp")

          values <- values %>% unique()
          if (length(values) == 0) {
            return(tibble())
          }

          name =
            values[[1]]

          if (name %>% str_remove_all("\\)|\\(|\\-|\\ |[0-9]") == "" &
              (length(values) == 1)) {
            return(tibble())
          }

          if (values %>% str_remove_all("\\)|\\(|\\-|\\ ") %>% readr::parse_number() %>% discard(is.na) %>% length() > 0) {
            vals <-
              values %>% str_remove_all("\\)|\\(|\\-|\\ ") %>% readr::parse_number()

            remove_val <- vals[!vals %>% is.na()] %>% max()
            rows <- tibble(vals) %>%
              mutate(idRow = 1:n()) %>%
              mutate(vals = ifelse(is.na(vals), 0 , vals)) %>%
              filter(vals != remove_val) %>%
              pull(idRow)
            values <- values[rows]
          }

          if (name %>% str_detect("\\(")) {
            name <-
              name %>% str_split("\\(") %>% flatten_chr() %>% str_split("\\)") %>% flatten_chr() %>% str_squish() %>% str_c(collapse =  " ")
          }


          if (values %>% str_count("\\@") %>% sum(na.rm = T) > 0) {
            email <-
              values[values %>% str_detect("@")] %>% str_c(collapse = " | ")
          } else {
            email <- NA_character_
          }

          telephone_no <-
            values %>% grep("[0-9][0-9][0-9]-[0-9]", .)

          if (length(telephone_no) == 0) {
            telephone_no <- 0
          }

          if (telephone_no > 0) {
            telephone <-
              values[telephone_no]
          } else {
            telephone <- NA_character_
          }



          if (values[!values %in% c(email, name, telephone)] %>% length() > 0) {
            title <-
              values[!values %in% c(email, name, telephone)] %>%
              str_remove_all("\\(|\\)|\\ -") %>%
              discard(function(x) {
                x == ""
              })
            if (length(title) == 0) {
              title <- NA_character_
            } else {
              title <-
                title %>%
                .[[1]]
            }

          } else {
            title <- NA_character_
          }

          items <- c("name", "email", "telephone", "title")
          values <- c(name, email, telephone, title)
          items <- items %>% str_c(type)
          data <-
            tibble(items, values, idSBIR) %>%
            spread(items, values)
          return(data)
        }

        text <-
          sub_text[x] %>% html_text() %>% str_split("Name:|Phone:|Email:|\n|Type:|Name:|Contact:|Address:") %>%
          flatten_chr() %>%
          str_squish()  %>%
          discard(function(x) {
            x == ""
          }) %>%
          unique()

        if (length(text) == 1) {
          text <-
            text %>%
            .[[1]]

          if (text == "N/A") {
            text <- NA_character_
          }
          df <- tibble(idSBIR, nameResearchInstitution = text)
          return(df)
        }

        items <- c(
          "nameResearchInstitution",
          "nameContactResearchInstitution",
          "addressResearchInstitution",
          "telephoneResearchInstitution",
          "typeResearchInstitution"
        )

        tibble(item = items[1:length(text)], value = text, idSBIR) %>%
          spread(item, value) %>%
          select(idSBIR, one_of(items))

      }) %>%
      discard(function(x) {
        x %>% nrow() == 0
      })

    df_contact <- df_contact %>%
      reduce(left_join, by = "idSBIR")

    if (length(business) == 0) {
      business <- "UNKNOWN"
    }
    data <-
      data %>%
      mutate(
        nameAward = solicitation,
        nameCompany = business,
        addressCompany = address_business,
        idSBIR,
        descriptionAward = abstract
      ) %>%
      select(idSBIR, idSolicitation, nameCompany, nameAward, everything()) %>%
      left_join(df_contact, by = "idSBIR")

    char_names <-
      data %>% select_if(is.character) %>% select(-matches("url")) %>% names()
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .)
                }))


    data <-
      data %>% mutate_at(char_names, list(function(x) {
        x %>% str_squish() %>% str_to_upper()
      })) %>%
      mutate(urlSBIRAward = url)

    data
  }

.parse_sbir_award_urls <-
  function(urls = "https://www.sbir.gov/sbirsearch/detail/1547673",
           sleep_time = NULL,
           return_message = T) {
    df <-
      tibble()

    success <- function(res) {
      url <-
        res$url

      .parse_sbir_award_url_safe <-
        possibly(.parse_sbir_award_url, tibble())

      all_data <-
        .parse_sbir_award_url_safe(url = url, return_message = T)

      if (length(sleep_time) > 0) {
        Sys.sleep(time = sleep_time)
      }


      df <<-
        df %>%
        bind_rows(all_data)
    }
    failure <- function(msg) {
      tibble()
    }
    urls %>%
      map(function(x) {
        curl_fetch_multi(url = x, success, failure)
      })
    multi_run()
    df
  }


#' Parse SBIR URLS
#'
#' @param urls vector of URLs
#' @param sleep_time if not \code{NULL} sleep time between scrapes
#' @param return_message if \code{TRUE} return message
#' @param clean_entities if \code{TRUE} cleans entity data
#' @param snake_names if \code{TRUE} snakes names
#'
#' @return \code{data_frame}
#' @export
#'
#' @examples
#' parse_sbir_award_urls(urls = "https://www.sbir.gov/sbirsearch/detail/1584295")

parse_sbir_award_urls <-
  function(urls = NULL,
           sleep_time = NULL,
           clean_entities = T,
           snake_names = F,
           return_message = F) {
    if (length(urls) == 0) {
      "Enter urls" %>% message()
      return(invisible())
    }

    .parse_sbir_award_urls_safe <-
      possibly(.parse_sbir_award_url, tibble())

    data <-
      urls %>%
      future_map_dfr(function(url) {
        .parse_sbir_award_urls_safe(url = url,                                return_message = return_message)
      })

    data <-
      data %>%
      .fix_bad_sbir() %>%
      mutate(idDUNS = case_when(idDUNS == 0 ~ NA_real_,
                                TRUE ~ idDUNS))

    #
    #
    # data <-
    #   .parse_sbir_award_urls_safe(urls = urls,
    #                               sleep_time = sleep_time,
    #                               return_message = return_message)

    phone_names <-
      data %>%
      select(matches("telephone")) %>% names()


    if (data %>% hasName("dateAward")) {
      data <- data %>%
        mutate(
          isAwardDateActual = !is.na(dateAward),
          dateAward = case_when(
            is.na(dateAward) ~ glue("{yearAward}-05-15") %>% as.character(),
            TRUE ~ dateAward
          ) %>% ymd()
        )

    } else {
      if (data %>% hasName("yearAward")) {
        data <-
          data %>%
          mutate(
            isAwardDateActual = F,
            dateAward =  glue("{yearAward}-05-15") %>% as.character()  %>% ymd()
          )
      }
    }

    data <-
      data %>%
      mutate_at(phone_names,
                list(function(x) {
                  x %>% str_remove_all("\\(|\\)") %>% str_replace_all("\\ ", "\\-")
                })) %>%
      mutate_at(phone_names,
                list(function(x) {
                  ifelse(x == "--", NA_character_, x)
                }))

    id_names <-
      data %>%
      dplyr::select(one_of(c("idContract", "idSolicitation"))) %>%
      names()

    if (length(id_names) > 0) {
      data <- data %>%
        mutate_at(id_names,
                  list(function(x) {
                    x %>% str_remove_all("\\-")
                  }))
    }

    if (clean_entities) {
      data <-
        data %>%
        entities::refine_columns(entity_columns = "nameCompany")

      data <-
        data %>%
        entities::refine_columns(entity_columns = c(
          "nameResearchInstiution",
          numgram = 1,
          weight = c(
            d = 0,
            i = 0,
            s = 0,
            t = 0
          )
        ))
    }

    data <-
      data %>%
      .munge_data(clean_address = F)

    if (data %>% hasName("amountContract")) {
      data <- data %>%
        mutate(amountContract = as.integer(amountContract))
    }

    if (data %>% hasName("idContract")) {
      data <- data %>%
        mutate(idContract = idContract %>% str_remove_all("\\ |\\,|\\-")) %>%
        mutate(
          idContractResolved = case_when(
            idContract %>% nchar() > 13 ~ idContract %>% substr(1, 13),
            TRUE ~ idContract
          )
        ) %>%
        select(idSBIR, matches("idContract"), everything())
    }

    data <- data %>%
      mutate(
        idContract = case_when(
          nchar(idContract) == 12 &
            idContract %>% str_detect("FA875119P") ~ idContract %>% str_replace_all("FA875119P", "FA875119PA"),
          TRUE ~ idContract
        ),
        idContractResolved = case_when(
          nchar(idContractResolved) == 12 &
            idContractResolved %>% str_detect("FA875119P") ~ idContractResolved %>% str_replace_all("FA875119P", "FA875119PA"),
          TRUE ~ idContractResolved
        ),

      ) %>%
      mutate(
        idContract = idContract %>%  str_replace_all("FA865219PHB", "FA865219P01"),
        idContractResolved = idContractResolved  %>%  str_replace_all("FA865219PHB", "FA865219P01")
      )

    data <-
      data %>% mutate(
        dateAward = as.character(dateAward),
        dateAward = dateAward %>% str_replace_all("2109", "2019") %>% lubridate::ymd()
      )

    data <-
      data %>%
      munge_lite(snake_names = snake_names) %>%
      .remove_na()


    data

  }

.parse_sbir_award_page <-
  function(page) {
    solicit_nodes <-
      page %>%
      html_nodes(".title a")

    solicitations <-
      solicit_nodes %>%
      html_text()

    urls <-
      solicit_nodes %>% html_attr('href') %>% str_c("https://www.sbir.gov/", .)
    ids <-
      solicit_nodes %>% html_attr("href") %>% str_remove_all("/sbirsearch/detail/") %>% readr::parse_number()

    data <-
      tibble(idSBIR = ids, nameAward = solicitations)


    df_labels <-
      seq_along(ids) %>%
      map_df(function(x) {
        labels <-
          page %>% html_nodes(glue::glue(".search-result:nth-child({x}) .label-info")) %>%
          html_text()



        if (length(labels) > 0) {
          data <- tibble(idSBIR = ids[x], listLabels = list(labels))
          return(data)
        }

        tibble(idSBIR = ids[x])
      })

    df_company <-
      seq_along(ids) %>%
      map_df(function(x) {
        details <-
          page %>% html_nodes(glue::glue(
            ".search-result:nth-child({x}) .search-result-sub-title span"
          )) %>%
          html_text() %>%
          str_squish() %>%
          discard(list(function(x) {
            x == ""
          }))

        if (length(details) > 0) {
          data <-
            tibble(item = details) %>%
            separate(
              item,
              into = c("nameSBIR", "value"),
              extra = "merge",
              sep = "\\:"
            ) %>%
            mutate_all(str_squish) %>%
            left_join(tibble(
              nameActual = c("nameCompany", "groupSolicitation"),
              nameSBIR = c("SBC", "Topic")
            ), by = "nameSBIR") %>%
            select(nameActual, value) %>%
            mutate(idSBIR = ids[x]) %>%
            spread(nameActual, value)

          return(data)
        }
        tibble(idSBIR = ids[x])
      })

    data <-
      data %>%
      left_join(df_company, by = "idSBIR") %>%
      left_join(df_labels, by = "idSBIR")

    if (data %>% hasName("dateAward")) {
      data <- data %>%
        mutate(
          isDateEstimated = case_when(is.na(dateAward) ~ TRUE,
                                      TRUE ~ FALSE),

          dateAward = case_when(
            is.na(dateAward) ~ glue("{yearAward}-05-15") %>% as.character() %>% ymd(),
            TRUE ~ dateAward
          ),
          dateAwardEnd = case_when(is.na(dateAwardEnd) ~ dateAward + 360,
                                   TRUE ~ dateAwardEnd)
        )

    }
    data

  }

.parse_sbir_list_url <-
  function(url = "https://www.sbir.gov/sbirsearch/award/all?page=0",
           return_message = T) {
    if (return_message) {
      glue::glue("Parsing {url}") %>% message()
    }
    page <- url %>% read_html()
    numberPage <-
      url %>% str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()

    data <-
      .parse_sbir_award_page(page = page) %>%
      mutate(
        urlSBIRPage = url,
        numberPage,
        urlSBIRAward = glue::glue("https://www.sbir.gov/sbirsearch/detail/{idSBIR}") %>% as.character()
      )

    char_names <-
      data %>% select_if(is.character) %>% select(-matches("url|email")) %>% names()

    data <-
      data %>% mutate_at(char_names, list(function(x) {
        x %>% str_squish() %>% str_to_upper()
      }))

    data
  }

#' Parse Vector of Paged SBIR Award URLS
#'
#' @param urls
#' @param return_message
#'
#' @return
#' @export
#'
#' @examples
#' parse_sbir_award_list_urls(urls = "https://www.sbir.gov/sbirsearch/award/all?page=0")
parse_sbir_award_list_urls <-
  function(urls = NULL,
           return_message = T) {
    if (length(urls) == 0) {
      stop("Please enter SBIR award urls")
    }
    .parse_sbir_list_url_safe <-
      possibly(.parse_sbir_list_url, tibble())
    data <-
      urls %>%
      furrr::future_map_dfr(function(url) {
        .parse_sbir_list_url_safe(url = url, return_message = return_message)
      })

    data <- data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x == "N/A" ~ NA_character_,
                            TRUE ~ as.character(x))
                }))
    data
  }


#' SBIR Award Dictionary
#'
#' @return
#' @export
#'
#' @examples
#' sbir_award_urls()
sbir_award_urls <-
  function() {
    page <-
      "https://www.sbir.gov/sbirsearch/award/all?page=0" %>% read_html()

    max_page <-
      page %>% html_nodes(".pager-last a") %>% html_attr("href") %>%  str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()

    pages <- 0:max_page
    urls <-
      glue::glue("https://www.sbir.gov/sbirsearch/award/all?page={pages}") %>% as.character()

    data <- tibble(numberPage = pages + 1, urlSBIRPage = urls)

    data
  }

#' SBIR awards
#'
#' Parses all public sbirs
#'
#' @param return_message if \code{TRUE} returns message
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' \dontrun{
#' sbir_awards()
#' }
sbir_awards <-
  function(return_message = F) {
    d <- sbir_award_urls()

    data <-
      parse_sbir_award_list_urls(urls = d$urlSBIRPage, return_message = return_message) %>%
      mutate(dateData = Sys.Date())

    data
  }



#' Acquires cached SBIR awards
#'
#' @param snake_names if \code{TRUE} returns snake names
#' @param create_text_description if \code{TRUE} creates a joined
#' text description field from the award name and award description fields
#' @param exclude_na_agencies if \code{TRUE} removes any awards with no known agencies
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' \dontrun{
#' sbirs_historic()
#' }
sbirs_historic <-
  memoise::memoise(function(create_text_description = F,
                            exclude_na_agencies = T,
                            snake_names = F) {
    data <-
      "https://asbcllc.com/r_packages/govtrackR/data/all_sbir_sba.rda" %>%
      read_rda()



    if (create_text_description) {
      data <-
        data %>%
        mutate_at(c("nameAward", "descriptionAward"),
                  list(function(x) {
                    x %>% coalesce("")
                  })) %>%
        create_text_block(
          id_column = "idSBIR",
          text_columns = c("nameAward", "descriptionAward")
        ) %>%
        mutate_at(c("nameAward", "descriptionAward", "textSBIR"),
                  list(function(x) {
                    case_when(x == "" ~ NA_character_,
                              TRUE ~ x)
                  })) %>%
        select(
          idSBIR:idContractResolved,
          idContract,
          idSolicitation,
          groupSolicitation,
          idDUNS,
          nameCompanyClean,
          nameCompany:nameAward,
          textSBIR,
          everything()
        )
    }

    data <-
      data %>%
      filter(!is.na(nameAgency))

    data <-
      data %>%
      munge_lite(
        snake_names = snake_names,
        no_extra = T,
        exclude_bloat = T
      )

    data <- data %>%
      mutate(idAgencyTrackingClean = idAgencyTracking %>% str_remove_all("\\-"))

    if (snake_names) {
      data <- data %>% janitor::clean_names()
    }

    data
  })


import_sbir_awards <-
  function(start_override = NULL,
           end_override = NULL,
           sleep_time = NULL,
           return_message = T,
           folder_path = "Desktop/data/sbir/pages") {
    df_urls <- dictionary_sbir_award_urls()
    if (length(start_override) > 0) {
      df_urls <-
        df_urls %>%
        filter(numberPage >= start_override)
    }

    if (length(end_override) > 0) {
      df_urls <-
        df_urls %>%
        filter(numberPage <= end_override)
    }

    urls <- df_urls$urlSBIRPage
    .parse_sbir_list_url_safe <-
      purrr::possibly(.parse_sbir_list_url, tibble())
    oldwd <- getwd()
    setwd("~")
    .build_folder(folder_path)
    setwd(folder_path)

    urls %>%
      walk(function(url) {
        page_no <-
          df_urls %>% filter(urlSBIRPage == url) %>% pull(numberPage)

        data <-
          .parse_sbir_list_url_safe(url = url, return_message = return_message)

        file <- glue::glue("{page_no}.rda")

        if (length(sleep_time) > 0) {
          Sys.sleep(time = sleep_time)
        }

        data %>% save(file = file)
      })

    if (getwd() != oldwd) {
      setwd(oldwd)
    }
    return(invisible())
  }


# sbir_api ----------------------------------------------------------------


.parse_sbir_api_slor <-
  function(url = "https://sbir.defensebusiness.org/search/SolrQuery?core=topic&method=query&q=data&start=0&AspxAutoDetectCookieSupport=1") {
    url %>% fromJSON()
  }



# solicitations -----------------------------------------------------------

sbir_solicitation_urls <-
  function() {
    page <-
      "https://www.sbir.gov/sbirsearch/topic/current/" %>% read_html()

    max_page <-
      page %>% html_nodes(".pager-last a") %>% html_attr("href") %>%  str_split("=") %>% flatten_chr() %>% .[[2]] %>% readr::parse_number()

    pages <- 0:max_page
    urls <-
      glue::glue("https://www.sbir.gov/sbirsearch/topic/current?page={pages}") %>% as.character()

    data <- tibble(numberPage = pages + 1, urlSBIRPage = urls)

    data
  }
.parse_sbir_solicitation <-
  function(page) {
    solicit_nodes <-
      page %>%
      html_nodes(".title a")

    titles <-
      solicit_nodes %>% html_text() %>% str_squish()

    slug <- solicit_nodes %>% html_attr("href")
    urlSolicitation <- str_c("https://www.sbir.gov", slug)
    idSolicitation <-
      parse_number(slug)

    descriptions <-
      page %>% html_nodes(".search-snippet") %>% html_text() %>% str_squish() %>% str_to_upper()

    result_nodes <- page %>% html_nodes(".search-result")
    data <-
      seq_along(result_nodes) %>%
      map_dfr(function(x) {
        tags <-
          result_nodes[[x]] %>% html_nodes(".label-info") %>% html_text() %>% str_c(collapse = "|")
        dates <-
          result_nodes[[x]] %>% html_nodes(".solr-search-span") %>%
          html_text()
        d <-
          tibble(dates) %>%
          separate(
            dates,
            into = c("typeDate", "value"),
            extra = "merge",
            sep = "\\: "
          ) %>%
          mutate(
            item = case_when(
              typeDate %>% str_detect("Release") ~ "dateRelease",
              typeDate %>% str_detect("Open") ~ "dateOpen",
              typeDate %>% str_detect("Due") ~ "dateDue",
              TRUE ~ "dateClose"
            )
          ) %>%
          select(item, value) %>%
          spread(item, value) %>%
          mutate_all(mdy)

        d %>%
          mutate(idRow = x, tags) %>%
          select(idRow, everything())
      })

    data <-
      tibble(
        idSolicitation,
        nameSolicitation = titles,
        descriptionSolicitationPage = descriptions,
        urlSolicitation
      ) %>%
      mutate(idRow = 1:n()) %>%
      left_join(data, by = "idRow") %>%
      select(-idRow)

    data
  }

.parse_sbir_solicitation_page_url <-
  function(url = "https://www.sbir.gov/sbirsearch/topic/current?page=0",
           return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }
    read_html(x = url) %>%
      .parse_sbir_solicitation() %>%
      mutate(urlPage = url)
  }

#' SBIR Solications
#'
#' @param include_details if \code{TRUE} returns detailed solicitation descriptions
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
sbir_solicitations <-
  function(include_details = T,
           return_message = T) {
    data <- sbir_solicitation_urls()
    .parse_sbir_solicitation_page_url_safe <-
      possibly(.parse_sbir_solicitation_page_url, tibble())

    all_data <-
      data$urlSBIRPage %>%
      future_map_dfr(function(url) {
        .parse_sbir_solicitation_page_url_safe(url = url, return_message = return_message)
      })

    all_data <-
      all_data %>%
      .munge_data()

    if (!include_details) {
      return(all_data)
    }

    df_details <-
      parse_sbir_solicitation_detail_urls(urls = all_data$urlSolicitation,
                                          return_message = return_message)

    all_data <-
      all_data %>%
      left_join(df_details, by = c("idSolicitation", "urlSolicitation"))

    all_data <-
      all_data %>%
      .munge_data() %>%
      mutate(
        isActive = dateClose >= Sys.Date(),
        countDaysOpen = dateClose - dateOpen,
        countDaysRemaining = pmax(0, dateClose - Sys.Date()),
        nameBranch = case_when(nameBranch == "N/A" ~ NA_character_,
                               TRUE ~ nameBranch)
      )

    all_data
  }


.parse_sbir_solicitation_detail_url <-
  function(url = "https://www.sbir.gov/sbirsearch/detail/1620819",
           return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }

    page <- read_html(url)
    parts <- url %>% str_split("/") %>% flatten_chr()
    idSolicitation <-
      parts[length(parts)] %>% as.numeric()


    parts <-
      page %>% html_nodes(".col-sm-6") %>%
      html_text() %>% str_split("\n\n") %>%
      flatten_chr() %>%
      str_squish() %>%
      discard(function(x) {
        x == ""
      })

    data <-
      tibble(parts) %>%
      separate(
        parts,
        into = c("nameSBIR", "value"),
        extra = "merge",
        sep = "\\: "
      ) %>%
      mutate_all(str_squish)

    data <-
      data %>%
      left_join(dictionary_sbir_names(), by = "nameSBIR") %>%
      distinct() %>%
      select(nameActual, value)

    cols <-
      data %>% pull(nameActual)

    data <-
      data %>%
      spread(nameActual, value) %>%
      select(one_of(cols))

    if (data %>% hasName("programPhaseYear")) {
      data <-
        data %>%
        separate(
          programPhaseYear,
          into = c("slugProgram", "idPhase", "yearSolicitation"),
          extra = "merge",
          sep = "\\|"
        ) %>%
        mutate_all(str_squish) %>%
        mutate(yearSolicitation = as.numeric(yearSolicitation))
    }



    description <-
      page %>% html_nodes(".solicitation-description p") %>% html_text() %>% str_squish() %>%
      stringi::stri_trans_general("Latin-ASCII")  %>% str_squish() %>%
      str_remove_all(" &amp; ") %>%
      str_to_upper() %>% gsub("\\s+", " ", .) %>%
      str_split("\\•") %>%
      flatten_chr() %>%
      str_squish()

    if (length(description) > 0) {
      data <- data %>%
        mutate(descriptionSolicitation = description %>% str_c(description, collapse = "|"))
    }


    data <- data %>%
      mutate(idSolicitation,
             urlSolicitation = url) %>%
      select(idSolicitation, everything())

    data

  }

#' Parse list of SBIR solication URLs
#'
#' @param urls vector of urls
#' @param return_message if \code{TRUE} return a message
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
parse_sbir_solicitation_detail_urls <-
  function(urls = NULL,
           return_message = T) {
    if (length(urls) == 0) {
      stop("Enter SBIR Solicitation URLs")
    }

    .parse_sbir_solicitation_detail_url_safe <-
      possibly(.parse_sbir_solicitation_detail_url, tibble())

    urls %>%
      future_map_dfr(function(url) {
        .parse_sbir_solicitation_detail_url_safe(url = url, return_message = return_message)
      })



  }



# dod_sbir ----------------------------------------------------------------


#' DOD Phase II SBIR enhancements
#'
#' Third party investments from DOD
#' Phase II companies from 1999 to 2012
#'
#' @return
#' @export
#'
#' @examples
sbir_enhancements_dod <-
  function() {
    page <-
      "https://www.acq.osd.mil/osbp/sbir/awards/enhancement-awards.shtml" %>%
      read_html()

    data <- page %>% html_table(fill = F) %>% .[[1]] %>% as_tibble()

    data <- data %>% .munge_sbir_names()

    data <-
      data %>%
      mutate(
        yearFiscal =
          case_when(
            yearFiscal == 99 ~ 1999L,
            nchar(yearFiscal) == 1 ~ glue("200{yearFiscal}") %>% as.integer(),
            yearFiscal %in% c(10:20) ~ glue("20{yearFiscal}") %>% as.integer()
          ) %>% as.integer()
      ) %>%
      .munge_data(clean_address = F)

    data <-
      data %>%
      separate(
        nameCompany,
        into = c("nameCompany", "nameCompanyDBA"),
        extra = "merge",
        sep = "\\("
      ) %>%
      mutate(nameCompanyDBA = nameCompanyDBA %>% str_remove_all("DBA|FORMERLY")) %>%
      mutate_if(is.character, str_squish) %>%
      select(yearFiscal, slugAgency, nameCompanyClean, everything())



    data
  }

.parse_year_dod_sbir_solicitation_url <-
  function(url = "https://www.acq.osd.mil/osbp/sbir/solicitations/sbir20192/index.shtml",
           return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }
    url_year <- url
    page <- url %>% read_html()

    base <-
      url %>% str_remove_all(".shtml") %>% str_remove_all("index")

    nameTopic <-
      page %>% html_nodes("table+ table td:nth-child(1) strong") %>% html_text()

    if (length(nameTopic) == 0) {
      nameTopic <-
        page %>% html_nodes("td:nth-child(1) strong") %>% html_text() %>% str_squish()
    }

    dateLastModified <-
      page %>% html_nodes("table+ table td:nth-child(2)") %>% html_text() %>% mdy()

    if (length(dateLastModified) == 0) {
      dateLastModified <-
        page %>% html_nodes("td:nth-child(2)") %>% html_text() %>% mdy()
    }

    if (length(dateLastModified) > length(nameTopic)) {
      start <- length(dateLastModified) - length(nameTopic)
      dateLastModified <-
        dateLastModified[(start + 1):length(dateLastModified)]
    }

    all_data <-
      tibble(nameTopic, dateLastModified) %>%
      .munge_data()

    links <- page %>% html_nodes(".docs a")
    topics <- links %>% html_attr("title")

    url <-
      links %>% html_attr("href") %>% str_c(base, .)


    df_urls <-
      tibble(topics, url) %>%
      separate(
        topics,
        sep = "\\[",
        extra = "merge",
        into = c("nameTopic", "type")
      ) %>%
      mutate_all(list(function(x) {
        x %>% str_remove_all("]") %>%
          str_squish()
      })) %>%
      .munge_data()

    df_topics <-
      distinct(df_urls, nameTopic) %>%
      mutate(
        nameTopicActual = all_data$nameTopic,
        dateLastModified = all_data$dateLastModified
      )

    all_data <-
      df_urls %>%
      left_join(df_topics, by = "nameTopic") %>%
      select(-nameTopic) %>%
      rename(nameTopic = nameTopicActual) %>%
      mutate(type = case_when(type %>% str_detect("WORD|word") ~ "Word",
                              TRUE ~ type)) %>%
      mutate(type = glue("urlDODYearSolicitation{type}") %>% as.character()) %>%
      spread(type, url) %>%
      .munge_data()


    all_data %>%
      mutate(urlYearSolicitation = url_year)
  }


.parse_year_dod_sbir_solicitation_urls <-
  function(urls = "https://www.acq.osd.mil/osbp/sbir/solicitations/sbir20192/index.shtml",
           return_message = T) {
    .parse_year_dod_sbir_solicitation_url_safe <-
      possibly(.parse_year_dod_sbir_solicitation_url, tibble())

    urls %>%
      map_dfr(function(url) {
        .parse_year_dod_sbir_solicitation_url_safe(url = url, return_message =
        )
      })
  }

#' DOD SBIR Solicitation url dictionary
#'
#' @param parse_years if \code{TRUE} parses years solicitations for links to the
#' underlying solicitation files
#'
#' @return
#' @export
#'
#' @examples
dictionary_dod_sbir_solicitations <-
  function(parse_years = F,
           return_message = T) {
    page <-
      read_html("https://www.acq.osd.mil/osbp/sbir/solicitations/archives.shtml")

    data <-
      page %>%
      html_table(fill = T) %>%
      flatten_df() %>%
      setNames(c("yearFiscal", "nameSolicitation", "dateClose")) %>%
      .munge_data() %>%
      mutate(
        yearFiscal = year(dateClose),
        typeSolicitation = case_when(nameSolicitation %>% str_detect("SBIR") ~ "SBIR",
                                     TRUE ~ "STTR")
      ) %>%
      filter(!is.na(dateClose)) %>%
      select(dateClose, yearFiscal, typeSolicitation, everything())

    nodes <- page %>% html_nodes("td a")
    nameSolicitation <- nodes %>% html_text()
    urlYearSolicitation <-
      nodes %>% html_attr("href") %>% str_c("https://www.acq.osd.mil/osbp/sbir/solicitations/", .)

    df_urls <-
      tibble(nameSolicitation, urlYearSolicitation) %>% .munge_data()


    data <-
      data %>%
      left_join(df_urls, by = "nameSolicitation")

    if (parse_years) {
      df_urls <-
        .parse_year_dod_sbir_solicitation_urls(urls = data$urlYearSolicitation,
                                               return_message = return_message)
    }

    data


  }


# dod ---------------------------------------------------------------------



.dictionary_sbir_award_tables <-
  function() {
    tibble(
      table = c(
        "aw_program_type_name",
        "org_state_name_abbr",
        "agency_name_abbr",
        "aw_fiscal_year",
        "org_ipo_flag",
        "small_business_flag",
        "sdb_flag",
        "wosb_flag",
        "hub_zone_flag",
        "sub_vet_owned_flag",
        "sdvosb_flag",
        "fed_facility_flag",
        "corp_entity_flag",
        "foreign_national_flag",
        "animal_research_flag",
        "solicitation_name",
        "ta_air_platform_flag",
        "ta_chem_bio_defense_flag",
        "ta_info_systems_flag",
        "ta_ground_sea_flag",
        "ta_materials_flag",
        "ta_bio_medical_flag",
        "ta_sensors_flag",
        "ta_electronics_flag",
        "ta_battlespace_flag",
        "ta_space_platforms_flag",
        "ta_human_systems_flag",
        "ta_weapons_flag",
        "ta_nuclear_flag",
        "topic_itar_restricted_flag",
        "submission_type",
        "topic_renew_energy_flag",
        "manufacturing_type_name",
        "sub_direct2_phase2_flag"
      ),
      nameActual =
        c(
          "nameProgram",
          "slugState",
          "slugAgency",
          "yearFiscal",
          "hasIPO",
          "isSmallBusiness",
          "isSmallDisadvantagedBusiness",
          "isWomanOwnedSmallBusiness",
          "isHubZoneCompany",
          "isVetSmallDisadvantagedBusiness",
          "isServiceDisabledOwnedSmallBusiness",
          "isFedFacility",
          "isCorporateEntity",
          "isForeignNational",
          "hasAnimalResearch",
          "nameSolicitation",
          "isAirPlatformTopic",
          "isBioChemicalDefenseTopic",
          "isInformationSystemTopic",
          "isGroundSeaTopic",
          "isMaterialsTopic",
          "isBioMedicalTopic",
          "isSensorsTopic",
          "isElectronicsTopic",
          "isBattleSpaceTopic",
          "isSpaceTopic",
          "isHumanSystemsTopic",
          "isWeaponsTopic",
          "isNuclearTopic",
          "isITARRestrictedTopic",
          "typeSubmission",
          "isRenewableEnergy",
          "typeManufacturing",
          "isDirectToPhase2"
        )

    )
  }

#' Department of Defense SBIR Award Summary
#'
#' Returns summary couunts for
#' DOD agency SBIR award winners by table.
#'
#' @return
#' @export
#'
#' @examples
sbir_award_summary_dod <-
  function() {
    json_data <-
      "https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&start=0" %>%
      fromJSON(simplifyDataFrame = T)

    fields <- json_data$facet_counts$facet_fields

    tibble(table = names(fields))

    all_data <-
      seq_along(fields) %>%
      map_dfr(function(x) {
        values <- fields[[x]]
        table <- names(fields)[[x]]
        tibble(table,
               item = values[c(T, F)],
               count = values[c(F, T)] %>% as.numeric())
      })

    all_data <-
      all_data %>%
      left_join(.dictionary_sbir_award_tables(), by = "table") %>%
      select(table, nameActual, item, everything()) %>%
      mutate(count = count %>% formattable::comma(digits = 0))

    all_data

  }

.parse_dod_sbir_award_json <-
  function(json_data) {
    data <- json_data$response$docs %>% as_tibble()

    data <-
      data %>%
      .munge_sbir_names() %>%
      .munge_data(clean_address = F)

    data <-
      data %>%
      filter(!is.na(nameCompany) |
               !is.na(idContract) |
               !is.na(keywordsSBIRCompany))

    data
  }

.parse_sbir_dod_url_api <-
  function(url = "https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&q=&start=0") {
    json_data <- fromJSON(url)

    data <-
      json_data %>%
      .parse_dod_sbir_award_json() %>%
      mutate_if(is.character,
                list(function(x) {
                  if_else(x == "", NA_character_, x)
                })) %>%
      .remove_na()

    data

  }

.generate_dod_award_sbir_url <-
  function(year = 2018,
           return_message = T) {
    if (return_message) {
      glue("Generating DOD SBIR urls for {year}") %>% message()
    }

    url <-
      glue(
        "https://sbir.defensebusiness.org/search/SolrQuery?core=award&fq=(aw_fiscal_year:{year})&method=query&start=0"
      ) %>%
      as.character()

    json_data <-
      url %>%
      fromJSON()

    total_pages <-
      json_data$response$numFound
    options(scipen = 99999)
    pages <- seq(from = 0, to = total_pages, by = 10)

    urls <-
      glue(
        "https://sbir.defensebusiness.org/search/SolrQuery?core=award&fq=(aw_fiscal_year:{year})&method=query&start={pages}"
      ) %>%
      as.character()

    tibble(yearTopic = year, urlAPI = urls)
  }

.generate_dod_award_sbir_urls <-
  function(years = 1983:2019,
           all_years = F,
           return_message = T) {
    if (all_years) {
      start <- 1983
      end <- Sys.Date() %>% lubridate::year()
      years <- start:end
    }

    all_data <-
      years %>%
      map_dfr(function(year) {
        .generate_dod_award_sbir_url(year = year, return_message = return_message)
      })

    all_data
  }

.parse_sbir_dod_url_api_safe <-
  purrr::possibly(.parse_sbir_dod_url_api, tibble::tibble())

.parse_sbir_dod_urls <-
  function(urls, return_message = T) {
    all_data <-
      urls %>%
      map_dfr(function(url) {
        if (return_message) {
          url %>% message()
        }
        d <-
          .parse_sbir_dod_url_api_safe(url = url) %>%
          select(-matches("urlSpending")) %>%
          mutate(urlAPI = url)

        d
      })

    all_data
  }

.dod_years_sbirs <-
  function(years = 1983:2019,
           all_years = F,
           return_message = T) {
    df_urls <-
      .generate_dod_award_sbir_urls(years = years,
                                    all_years = all_years,
                                    return_message = return_message)

    all_data <-
      .parse_sbir_dod_urls(urls = df_urls$urlAPI, return_message = return_message)


    all_data

  }


.all_dod_sbirs <-
  function(return_message = T) {
    url <-
      "https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&start=0"

    json_data <-
      url %>%
      fromJSON()

    total_pages <-
      json_data$response$numFound
    options(scipen = 99999)
    pages <- seq(from = 0, to = total_pages, by = 10)

    urls <-
      glue(
        "https://sbir.defensebusiness.org/search/SolrQuery?core=award&method=query&q=&start={pages}"
      ) %>%
      as.character()


    all_data <-
      urls %>%
      .parse_sbir_dod_urls(return_message = return_message)


    all_data <-
      all_data %>%
      .remove_na()


    all_data
  }

#' DOD SBIR Solicitations
#'
#' All active and histric DOD
#' SBIR/STTR solicitations.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dod_sbir_solicitations()
dod_sbir_solicitations <-
  function() {
    data <- "https://sbir.defensebusiness.org/search/solicitations" %>%
      fromJSON() %>%
      as_tibble()

    data <-
      data %>%
      .munge_sbir_names() %>%
      select(-contains("remove"))

    data <-
      data %>%
      .munge_data()

    data
  }

.dod_sbir_topics <-
  memoise::memoise(function() {
    json <-
      "https://sbir.defensebusiness.org/topics/gettopics" %>%
      fromJSON(simplifyDataFrame = T)

    data <-
      json$Results %>% as_tibble()

    data <-
      data %>%
      .munge_sbir_names()

    data <- data %>%
      .distinct_sbir_cols()

    data <-
      data %>%
      .munge_data(clean_address = F)

    if (data %>% hasName("yearAward")) {
      data <- data %>%
        rename(yearTopic = yearAward)
    }

    data <-
      data %>%
      select(one_of(c(
        "yearTopic",
        "typeProgram",
        "typeProgram",
        "idTopic"
      )),
      everything()) %>%
      mutate(
        urlDODSBIRTopicAPI = glue(
          "https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId={idTopic}"
        ) %>% as.character()
      )
    data
  })

.parse_dod_topic_dates <-
  function(data) {
    date_cols <- data %>% select(matches("datetime")) %>% names()
    if (length(date_cols) > 0) {
      data <-
        data %>%
        mutate_at(date_cols,
                  list(function(x) {
                    x %>%
                      parse_number() / 1000
                  })) %>%
        mutate_at(date_cols,
                  list(function(x) {
                    x %>%
                      as.POSIXct(origin = "1970-01-01", tz = "UTC") %>% as.Date()
                  }))
    }

    data
  }

.parse_dod_sbir_topic_url <-
  function(url = "https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId=307349",
           return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }
    data <-
      fromJSON(url)
    d <- data %>%
      enframe(name = "table") %>%
      filter(table == "topic") %>%
      mutate(value = value %>% map(enframe)) %>%
      unnest_legacy()

    d <- d %>%
      mutate(
        type = value %>% map(class) %>% flatten_chr(),
        nested = type %>% str_detect("data|list")
      )

    has_nested <- d %>% filter(nested) %>% nrow()

    df_bases <-
      d %>%
      filter(!nested) %>%
      filter(type != "NULL") %>%
      select(type, value, name)

    types <-
      df_bases$type %>% unique()

    df_bases <-
      types %>%
      map(function(x) {
        df_bases %>%
          filter(type == x) %>%
          select(name, value) %>%
          unnest(cols = c(value)) %>%
          spread(name, value) %>%
          mutate(urlDODSBIRTopicAPI = url)
      })

    df_base <-
      df_bases %>%
      reduce(left_join, by = "urlDODSBIRTopicAPI") %>%
      .munge_sbir_names()

    df_base <-
      df_base %>%
      .distinct_sbir_cols() %>%
      .parse_dod_topic_dates() %>%
      .munge_data(
        clean_address = F,
        unformat = T,
        parse_dates = F
      ) %>%
      .remove_na()

    if (df_base %>% hasName("hasTPOCEmail")) {
      df_base <- df_base %>%
        mutate(hasTPOCEmail = as.logical(hasTPOCEmail))
    }
    if (df_base %>% hasName("typeProgram")) {
      df_base <- df_base %>%
        rename(idProgram = typeProgram)
    }

    if (df_base %>% hasName("typeProgramString")) {
      df_base <-
        df_base %>%
        rename(typeProgram = typeProgramString)
    }

    if (!has_nested) {
      return(df_base)
    }

    d <- d %>%
      filter(nested)

    types <-
      d$type %>% unique()

    types <- types[types %>% str_detect("data")]

    d_nested <-
      types %>%
      map(function(x) {
        d_row <-
          d %>%
          filter(type == x) %>%
          select(name, value) %>%
          unnest(cols = c(value))

        if (x == "data.frame") {
          d_row <-
            d_row %>%
            select(-one_of("ReferenceFiles")) %>%
            select(-name) %>%
            .munge_sbir_names() %>%
            .distinct_sbir_cols() %>%
            .parse_dod_topic_dates() %>%
            .munge_data(
              clean_address = F,
              unformat = T,
              parse_dates = F
            )

          d_row <- d_row %>%
            group_by(idTopic) %>%
            nest() %>%
            ungroup() %>%
            rename(dataTopicReferences = data) %>%
            mutate(
              hasTopicReferences = T,
              countReferences = dataTopicReferences %>% map_dbl(nrow),
              urlDODSBIRTopicAPI = url
            ) %>%
            select(idTopic,
                   hasTopicReferences,
                   countReferences,
                   everything())

          return(d_row)
        }

        if (x == "list") {
          d_row <-
            d_row %>%
            mutate(type =  value %>% map(class) %>% flatten_chr()) %>%
            unique()

          types <-
            d_row$type %>% unique()

          types <-
            types[!types %>% str_detect("logical")]

          d_bases <-
            types %>%
            map(function(y) {
              d_row %>%
                filter(type == y) %>%
                select(name, value) %>%
                unnest(cols = c(value)) %>%
                filter(value != "") %>%
                group_by(name) %>%
                mutate(id = 1:n()) %>%
                filter(id == max(id)) %>%
                ungroup() %>%
                select(-id) %>%
                filter(value != "--") %>%
                spread(name, value) %>%
                mutate(urlDODSBIRTopicAPI = url)
            })

          d_bases <-
            d_bases %>% reduce(left_join, by = "urlDODSBIRTopicAPI")

          if (nrow(d_bases) == 0) {
            return(NULL)
          }

          d_bases <-
            d_bases %>%
            .munge_sbir_names() %>%
            .distinct_sbir_cols() %>%
            .parse_dod_topic_dates() %>%
            .munge_data(
              clean_address = F,
              unformat = T,
              parse_dates = F
            )
          return(d_bases)
        }


      })


    if (length(d_nested) > 0) {
      d_nested <-
        d_nested %>%
        discard(function(x) {
          x %>% is_null()
        }) %>%
        reduce(left_join, by = "urlDODSBIRTopicAPI") %>%
        select(-one_of("idTopic"))

      data <-
        df_base %>%
        bind_cols(d_nested %>% select(-urlDODSBIRTopicAPI))

    } else {
      data <- df_base
    }

    if (data %>% hasName("nameFirstContact") &
        data %>% hasName("nameLastContact")) {
      data <-
        data %>%
        unite(
          nameContactPrimary,
          nameFirstContact,
          nameLastContact,
          sep = " ",
          remove = F
        ) %>%
        mutate(emailContactPrimary = emailContact)

      df_contacts <-
        data %>%
        select(matches("nameFirstContact|nameLastContact|emailContact")) %>%
        select(-matches("AP|emailContactPrimary")) %>%
        gather(name, value) %>%
        mutate(
          numberPerson = name %>% str_extract("[0-9]") %>% as.integer()-1 ,
          numberPerson = numberPerson %>% coalesce(0) + 1,
          name = name %>% str_remove_all("[0-9]")
        ) %>%
        spread(name, value) %>%
        unite(
          nameContact,
          nameFirstContact,
          nameLastContact,
          sep = " ",
          remove = F
        ) %>%
        mutate(urlDODSBIRTopicAPI = url) %>%
        group_by(urlDODSBIRTopicAPI) %>%
        nest() %>%
        ungroup() %>%
        rename(dataContacts = data) %>%
        mutate(hasContacts = T)

      data <-
        data %>%
        left_join(df_contacts, by = "urlDODSBIRTopicAPI")

      remove_cols <-
        data %>% select(matches("nameFirst|nameLast|^email")) %>%
        select(-matches("AP$|Primary")) %>%
        names()

      if (length(remove_cols) > 0) {
        data <-
          data %>%
          select(-one_of(remove_cols))
      }
    }

    if (data %>% hasName("nameFirstAP") &
        data %>% hasName("nameLastAP")) {
      data <-
        data %>%
        unite(nameContactAP,
              nameFirstAP,
              nameLastAP,
              sep = " ",
              remove = T)
    }

    data <-
      data %>%
      select(one_of("idTopic", "titleTopic"),
             matches("topic|name"),
             everything())

    data
  }

#' Parse Department of Defense SBIR Open Topic URLs
#'
#' Parses DOD SBIR Open Topics from vector
#' of API urls
#'
#' @param urls of API urls
#' @param return_message if \code{TRUE} returns a message
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' parse_dod_sbir_topic_urls(urls ="https://sbir.defensebusiness.org/topics/GetTopicDetails?topicId=307428")
parse_dod_sbir_topic_urls <-
  function(urls = NULL,
           return_message = F) {
    if (length(urls) == 0) {
      stop("Enter DOD topic URLs")
    }
    all_data <-
      urls %>%
      future_map_dfr(function(url) {
        .parse_dod_sbir_topic_url(url = url, return_message = return_message)
      })

    all_data <-
      all_data %>%
      mutate(
        hasContacts = dataContacts %>% map_dbl(length) > 0,
        hasTopicReferences = dataTopicReferences %>% map_dbl(length) > 0
      ) %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x == "N/A" ~ NA_character_,
                            TRUE  ~ x) %>% stri_enc_toascii()
                }))

    all_data <- all_data %>%
      select(
        idTopic,
        titleTopic,
        nameAgency,
        nameCommand,
        nameContactPrimary,
        matches("^topic|Topic$"),
        matches("description"),
        everything()
      )

    all_data <-
      all_data %>%
      mutate(
        hasSemi = keywordsTopicSBIR %>% str_detect("\\;"),
        keywordsTopicSBIR = case_when(
          hasSemi ~ keywordsTopicSBIR,
          TRUE ~ keywordsTopicSBIR %>% str_replace_all("\\,", "\\; ")
        )
      ) %>%
      select(-hasSemi)

    df_keywords <-
      all_data %>%
      select(idTopic, keywordsTopicSBIR) %>%
      separate_rows(keywordsTopicSBIR, sep = "\\; ") %>%
      mutate_at("keywordsTopicSBIR",
                function(x) {
                  x %>% str_remove_all("\\.|\\	:") %>% str_squish()
                }) %>%
      mutate_at("keywordsTopicSBIR",
                function(x) {
                  case_when(
                    x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
                    x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
                    x == "EW" ~ "ELECTRONIC WARFARE",
                    x == "HPM" ~ "HIGH POWER MICROWAVE",
                    x == "ML" ~ "MACHINE LEARNING",
                    x %in% c("RF") ~ "RADIO FREQUENCY",
                    x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
                    x %in% c("AR") ~ "AUGMENTED REALITY",
                    TRUE ~ x
                  )
                }) %>%
      rename(keywordTopicSBIR = keywordsTopicSBIR) %>%
      filter(keywordTopicSBIR != "") %>%
      distinct() %>%
      group_by(idTopic) %>%
      nest() %>%
      ungroup() %>%
      rename(dataTopicKeywords = data) %>%
      mutate(countKeywords = dataTopicKeywords %>% map_dbl(nrow),
             hasKeywords = T)

    all_data <-
      all_data %>%
      left_join(df_keywords, by = "idTopic")


    all_data
  }

#' Department of Defense SBIR/STTR Open Topics
#'
#' Returns details for all open DOD SBIR/STTR
#' opportunities
#'
#' @param include_details if \code{TRUE} uses \link{parse_dod_sbir_topic_urls} to parse detailed topic data
#' @param return_message if \code{TRUE} a returns message
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' dod_sbir_open_topics()
#' }
dod_sbir_open_topics <-
  function(include_details = T,
           include_text_block = T,
           return_message = F) {
    data <-
      .dod_sbir_topics() %>%
      mutate(dateSearch = Sys.Date()) %>%
      select(dateSearch, everything())

    if (!include_details) {
      return(data)
    }

    df_details <-
      parse_dod_sbir_topic_urls(urls = data$urlDODSBIRTopicAPI,
                                return_message = return_message)

    missing_cols <-
      names(
        df_details)[!names(df_details) %in% names(data)]
    data <-
      data %>%
      left_join(df_details %>% select(idTopic, one_of(missing_cols)),
                by = "idTopic") %>%
      select(one_of(names(df_details)), everything())


    if (include_text_block) {
      text_columns <-
        data %>% select_if(is.character) %>%
        select(matches("description|topic|Topic")) %>%
        select(-matches(
          "descriptionSitisAdditionalInfo|^url|keywordsTopicSBIR"
        )) %>%
        names()

      data <-
        create_text_block(data = data,
                          id_column = "idTopic",
                          text_columns = text_columns)

    }

    data
  }

# nasa --------------------------------------------------------------------

.sbirs_nasa <-
  function() {
    data <-
      "https://sbir.nasa.gov/sbir_search_csv?searchText=&searchTextType=any&sort_solicit_program%5B%5D=&sort_solicit_program_year%5B%5D=&sort_solicit_program_phase_value%5B%5D=&sort_field_award_md_short_name%5B%5D=&sort_field_award_ctr_name%5B%5D=&sort_field_firm_state%5B%5D=&sort_field_award_tech_area%5B%5D=&sortm_field_award_tech_taxonomy%5B%5D=&searchType=award&sol_search_submit=Submit&form_build_id=form-05iVhnODgWrol2quyjY5xRlVNeQZ0KqfTmK4Nv8_3Yc&form_id=sbir_awards_search_form" %>%
      fread(quote = "", showProgress = FALSE) %>%
      as_tibble()

    data <-
      data %>% .munge_sbir_names() %>% .munge_data(clean_address = F)

    data
  }


#' NASA SBIRs
#'
#' NASA SBIR data
#'
#' @return
#' @export
#'
#' @examples
sbirs_nasa <- function() {
  .tt <- memoise::memoise(.sbirs_nasa)
  data <- .tt()

  data
}




# doe ---------------------------------------------------------------------


# https://pamspublic.science.energy.gov/WebPAMSExternal/Interface/Awards/AwardSearchExternal.aspx?controlName=ContentTabs





# epa ---------------------------------------------------------------------

.parse_epa_sbirs <-
  function() {
    "https://cfpub.epa.gov/ncer_abstracts/index.cfm/fuseaction/outlinks.sbir/fullList/Yes/showYear/all"
  }


# dhs ---------------------------------------------------------------------

# https://sbir2.st.dhs.gov/portal/public/Awards

#' DOD SBIRs
#'
#' All SBIRs from
#' Department of Defense SBIR
#' API
#'
#' @param group_filters if not \code{NULL} the solicitation groups to isolate
#' @param snake_names if \code{TRUE} returns snake case names
#' #' @param clean_companies if \code{TRUE} returns snake case names
#' @param create_text_description  if \code{TRUE} returns a blocked text description field
#' @param clean_keywords if \code{TRUE} cleans Topic and company keywords
#' @param exclude_null_contracts
#' @param clean_companies
#' @param clean_contracts
#' @param unformat
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dod_sbir_historic()
dod_sbir_historic <-
  function(group_filters = NULL,
           snake_names = F,
           exclude_null_contracts = T,
           create_text_description = T,
           unformat = T,
           clean_companies  = T,
           clean_contracts = T,
           clean_keywords = T) {
    urls <-
      glue(
        "https://asbcllc.com/r_packages/govtrackR/data/dod_sbir_cache/dod_sbir/dod_sbir_{1:10}.rda"
      ) %>% as.character()
    read_rda_m <- memoise::memoise(read_rda)
    data <-
      urls %>%
      map_dfr(function(url) {
        read_rda_m(file = url)
      })

    if (exclude_null_contracts) {
      data <-
        data %>%
        filter(!is.na(idContract))
    }

    if (length(group_filters) > 0) {
      data <- data %>% filter(groupSolicitation %in% group_filters)
    }

    if (clean_contracts) {
      contract_cols <-
        data %>% select(matches("idContract")) %>% names()

      data <- data %>%
        mutate_at(contract_cols, list(function(x) {
          x %>% str_remove_all("\\-")
        })) %>%
        mutate(
          idOfficeAward = case_when(
            is.na(idContract) ~ NA_character_,
            TRUE ~ idContract %>% substr(1, 6)
          ),
          slugBudgetAward = case_when(
            is.na(idContract) ~ NA_character_,
            TRUE ~ idContract %>% substr(7, 8)
          ),
          codeAcquisitionAward = case_when(
            is.na(idContract) ~ NA_character_,
            TRUE ~ idContract %>% substr(9, 9)
          ),
          codeNumberAward =
            case_when(
              is.na(idContract) ~ NA_character_,
              TRUE ~ idContract %>% substr(10, nchar(idContract))
            )
        )
    }

    data <-
      data %>%
      rename(
        nameAward = titleSBIR,
        descriptionAward = descriptionAbstract,
        nameTopic = titleTopic
      )

    data <-
      data %>%
      select(-one_of("yearFiscalAwarwd"))

    data <-
      data %>%
      mutate(
        dateAward = as.Date(datetimeAward),
        dateAwardEnd = as.Date(datetimeAwardEnd)
      ) %>%
      mutate(
        yearAward  = case_when(
          lubridate::month(dateAward) %>% as.numeric() >= 10 ~ lubridate::year(dateAward) %>% as.numeric()+1,
          TRUE ~ lubridate::year(dateAward) %>% as.numeric()
        )
      )

    if (create_text_description) {
      data <-
        data %>%
        mutate_at(
          c(
            "nameAward",
            "descriptionAward",
            "descriptionPrincipalBenefit"
          ),
          list(function(x) {
            x %>% coalesce("")
          })
        ) %>%
        mutate(idSBIRDeptDefense = 1:n()) %>%
        create_text_block(
          id_column = "idSBIRDeptDefense",
          text_columns = c(
            "nameAward",
            "descriptionAward",
            "descriptionPrincipalBenefit"
          )
        ) %>%
        select(-idSBIRDeptDefense) %>%
        mutate_at(
          c(
            "nameAward",
            "descriptionAward",
            "descriptionPrincipalBenefit",
            "textSBIRDeptDefense"
          ),
          list(function(x) {
            case_when(x == "" ~ NA_character_,
                      TRUE ~ x)
          })
        )
    }

    if (clean_keywords) {
      glue("\n\nCleaning DOD company and topic SBIR/STTR keywords\n\n") %>% message()

      data <-
        data %>%
        mutate(row = 1:n()) %>%
        select(row, everything())

      df_sbir <-
        data %>%
        select(row, keywordsTopicSBIR) %>%
        separate_rows(keywordsTopicSBIR, convert = T, sep = "\\,|\\;|^&|[0-9]. |\\ & ") %>%
        separate_rows(keywordsTopicSBIR, convert = T, sep = "\\ [0-9].") %>%
        separate_rows(keywordsTopicSBIR, sep = "CONTACT:|POC:|TECHNICAL POINT OF CONTACT|PHONE:|TITLE:|PROGNOSTICS:|KEYWORDS: |TOPIC WRITERS:|ND:", convert = T) %>%
        mutate_if(is.character, str_squish) %>%
        filter(keywordsTopicSBIR != "") %>%
        filter(!keywordsTopicSBIR %>% str_detect("FAX:|EMAIL:|^:|E MAIL")) %>%
        rename(keyword = keywordsTopicSBIR) %>%
        mutate(
          keyword = keyword %>% str_squish() %>% str_remove_all("\\.|\u001a|\\(|\\)") %>%
            str_replace_all("\\-", " ") %>% str_replace_all("3 D", "3D")
        ) %>%
        filter(!keyword %in% c("", "AND")) %>%
        filter(!keyword %>% str_detect("EMAIL|FAX| PHONE$")) %>%
        mutate_if(is.character, str_squish) %>%
        mutate_at("keyword",
                  function(x) {
                    case_when(
                      x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
                      x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
                      x == "EW" ~ "ELECTRONIC WARFARE",
                      x == "MEMS" ~ "MICROELECTROMECHANICAL SYSTEMS",
                      x == "CFD" ~ "Computational fluid dynamics",
                      x == "HPM" ~ "HIGH POWER MICROWAVE",
                      x == "ML" ~ "MACHINE LEARNING",
                      x %in% c("RF") ~ "RADIO FREQUENCY",
                      x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
                      x %in% c("AR") ~ "AUGMENTED REALITY",
                      TRUE ~ x
                    )
                  }) %>%
        filter(!keyword %in% c("", "\u001a\u001a")) %>%
        mutate(
          keyword = case_when(
            keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
            keyword %in% c("AR") ~ "AUGMENTED REALITY",
            keyword %in% c("IOT") ~ "INTERNET OF THINGS",
            keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
            keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
            keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
            keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
            keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
            keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
            keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
            keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
            keyword %in% c("SIC") ~ "single silicon carbide",
            keyword %in% c("IED") ~ "Improvised explosive device",
            keyword %in% c("NDE") ~ "Non Destructive Evaluation",
            keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
            keyword %in% c("IMU") ~ "inertial measurement unit",
            keyword %in% c("ATR") ~ "assisted target recognition",
            keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
            keyword %in% "SIGINT" ~ "Signals intelligence",
            keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
            keyword %in% "ROIC" ~ "Readout Integrated Circuit",
            keyword %in% "MIMO" ~ "Multiple input multiple output",
            keyword %in% "GPU" ~ "Graphical Processing Unit",
            keyword %in% "HMD" ~ "head mounted displays",
            keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
            keyword %in% "DACS" ~ "Divert and Attitude Control System",
            keyword %in% "COTS" ~ "Commercial Off The Shelf",
            keyword %in% "LED" ~ "light emitting diode",
            keyword %in% "SDR" ~ "Software Defined Radio",
            keyword %in% "ADC" ~ "Analog to Digital Conversion",
            keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
            keyword %in% "JSF" ~ "Joint Strike Fighter",
            keyword %in% "MAV" ~ "Micro Air Vehicles",
            keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
            keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
            keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
            keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
            keyword %in% "ASIC" ~ "application specific integrated circuit",
            keyword %in% "CMC" ~ "Ceramic Matrix Composites",
            keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
            keyword %in% "MBE" ~ "the molecular beam epitaxy",
            keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
            keyword %in% "SAR" ~ "Synthetic Aperture Radar",
            keyword %in% "JTRS" ~ "Joint Tactical Radio System",
            keyword %in% "ASW" ~ "Antisubmarine Warfare",
            keyword %in% "FLP" ~ "focal plane array",
            keyword == "JP" ~ "Joint Publication",
            keyword == "LWIR" ~ "Long Wavelength Infrared",
            keyword %in% c("LADAR", "LIDAR") ~ "LAser Detection And Ranging",
            TRUE ~ keyword
          ),
          keyword = keyword %>% str_to_upper() %>% str_remove_all("\\?")
        ) %>%
        filter(!keyword %>% str_detect("ORG: |E MAIL: ")) %>%
        filter(nchar(keyword) > 1)

      df_sbir <-
        df_sbir %>%
        mutate(
          count_total = nchar(keyword),
          count_bad = keyword %>% str_count("[0-9]|\\/|\\ |\\-")
        ) %>%
        filter(count_bad != count_total) %>%
        select(-c(count_total, count_bad)) %>%
        rename(keywordsTopicSBIR = keyword)


      data <-
        data %>%
        select(-keywordsTopicSBIR) %>%
        left_join(
          df_sbir %>%
            group_by(row) %>%
            summarise(
              keywordsTopicSBIR = unique(keywordsTopicSBIR) %>% sort() %>% str_c(collapse = " | "),
              countKeywordsTopic = n()
            ),
          by = "row"
        )


      df_sbir <-
        data %>%
        select(row, keywordsSBIRCompany) %>%
        separate_rows(keywordsSBIRCompany, convert = T, sep = "\\,|\\;|^&|[0-9]. |\\ & ") %>%
        separate_rows(keywordsSBIRCompany, convert = T, sep = "\\ [0-9].") %>%
        separate_rows(keywordsSBIRCompany, sep = "CONTACT:|POC:|TECHNICAL POINT OF CONTACT|PHONE:|TITLE:|PROGNOSTICS:|KEYWORDS: |TOPIC WRITERS:|ND:", convert = T) %>%
        mutate_if(is.character, str_squish) %>%
        filter(keywordsSBIRCompany != "") %>%
        filter(!keywordsSBIRCompany %>% str_detect("FAX:|EMAIL:|^:|E MAIL")) %>%
        rename(keyword = keywordsSBIRCompany) %>%
        mutate(
          keyword = keyword %>% str_squish() %>% str_remove_all("\\.|\u001a|\\(|\\)") %>%
            str_replace_all("\\-", " ") %>% str_replace_all("3 D", "3D")
        ) %>%
        filter(!keyword %in% c("", "AND")) %>%
        filter(!keyword %>% str_detect("EMAIL|FAX| PHONE$")) %>%
        mutate_if(is.character, str_squish) %>%
        mutate_at("keyword",
                  function(x) {
                    case_when(
                      x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
                      x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
                      x == "EW" ~ "ELECTRONIC WARFARE",
                      x == "MEMS" ~ "MICROELECTROMECHANICAL SYSTEMS",
                      x == "CFD" ~ "Computational fluid dynamics",
                      x == "HPM" ~ "HIGH POWER MICROWAVE",
                      x == "ML" ~ "MACHINE LEARNING",
                      x %in% c("RF") ~ "RADIO FREQUENCY",
                      x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
                      x %in% c("AR") ~ "AUGMENTED REALITY",
                      TRUE ~ x
                    )
                  }) %>%
        filter(!keyword %in% c("", "\u001a\u001a")) %>%
        mutate(
          keyword = case_when(
            keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
            keyword %in% c("AR") ~ "AUGMENTED REALITY",
            keyword %in% c("IOT") ~ "INTERNET OF THINGS",
            keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
            keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
            keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
            keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
            keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
            keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
            keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
            keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
            keyword %in% c("SIC") ~ "single silicon carbide",
            keyword %in% c("IED") ~ "Improvised explosive device",
            keyword %in% c("NDE") ~ "Non Destructive Evaluation",
            keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
            keyword %in% c("IMU") ~ "inertial measurement unit",
            keyword %in% c("ATR") ~ "assisted target recognition",
            keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
            keyword %in% "SIGINT" ~ "Signals intelligence",
            keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
            keyword %in% "ROIC" ~ "Readout Integrated Circuit",
            keyword %in% "MIMO" ~ "Multiple input multiple output",
            keyword %in% "GPU" ~ "Graphical Processing Unit",
            keyword %in% "HMD" ~ "head mounted displays",
            keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
            keyword %in% "DACS" ~ "Divert and Attitude Control System",
            keyword %in% "COTS" ~ "Commercial Off The Shelf",
            keyword %in% "LED" ~ "light emitting diode",
            keyword %in% "SDR" ~ "Software Defined Radio",
            keyword %in% "ADC" ~ "Analog to Digital Conversion",
            keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
            keyword %in% "JSF" ~ "Joint Strike Fighter",
            keyword %in% "MAV" ~ "Micro Air Vehicles",
            keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
            keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
            keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
            keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
            keyword %in% "ASIC" ~ "application specific integrated circuit",
            keyword %in% "CMC" ~ "Ceramic Matrix Composites",
            keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
            keyword %in% "MBE" ~ "the molecular beam epitaxy",
            keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
            keyword %in% "SAR" ~ "Synthetic Aperture Radar",
            keyword %in% "JTRS" ~ "Joint Tactical Radio System",
            keyword %in% "ASW" ~ "Antisubmarine Warfare",
            keyword %in% "FLP" ~ "focal plane array",
            keyword == "JP" ~ "Joint Publication",
            keyword == "LWIR" ~ "Long Wavelength Infrared",
            keyword %in% c("LADAR", "LIDAR") ~ "LAser Detection And Ranging",
            TRUE ~ keyword
          ),
          keyword = keyword %>% str_to_upper() %>% str_remove_all("\\?")
        ) %>%
        filter(!keyword %>% str_detect("ORG: |E MAIL: ")) %>%
        filter(nchar(keyword) > 1)

      df_sbir <-
        df_sbir %>%
        mutate(
          count_total = nchar(keyword),
          count_bad = keyword %>% str_count("[0-9]|\\/|\\ |\\-")
        ) %>%
        filter(count_bad != count_total) %>%
        select(-c(count_total, count_bad)) %>%
        rename(keywordsSBIRCompany = keyword)

      data <-
        data %>%
        select(-keywordsSBIRCompany) %>%
        left_join(
          df_sbir %>%
            group_by(row) %>%
            summarise(
              keywordsSBIRCompany = unique(keywordsSBIRCompany) %>% sort() %>% str_c(collapse = " | "),
              countKeywordsSBIRCompany = n()
            ),
          by = "row"
        ) %>%
        select(-row)

    }

    if (clean_companies) {
      data <-
        data %>%
        entities::refine_columns(entity_columns = "nameCompany")
    }

    data <- data %>%
      rename(
        groupRevenueCompany = amountRevenueCompanyPriorYear,
        cageCompany = slugCageCompany,
        addressStreet1Company = adressStreet1Company,
        slugAcquisitionProgramName = slugAcquisitionrProgramName,
      )


    data <-
      data %>%
      mutate_at(c("pctWorkSTTR", "pctRevenueSBIR"),
                list(function(x) {
                  x %>% as.numeric()
                })) %>%
      mutate_at(c("pctWorkSTTR", "pctRevenueSBIR"),
                list(function(x) {
                  case_when(x == 0 ~ 0,
                            x <= 100 ~ x / 100,
                            x >= 3000 ~ x / 10000,
                            TRUE ~ x / 1000)
                }))

    data <-
      data %>%
      rename(locationCompany = addressFullCompany,
             locationSTTR = addressFullSTTR)
    data <-
      data %>%
      select(matches("^id[A-Z]|year|nameCompany|DUNS"), everything()) %>%
      munge_lite(snake_names = snake_names, unformat = unformat)
    data
  }


# afwerx -------------------------------------------------------------------------------------------

## https://exporter.nih.gov/ExPORTER_Catalog.aspx


.sbir_afwerx_portfolio <-
  function(resolve_for_duns = T,
           only_open_topic = F,
           include_naics = T,
           include_psc = T,
           include_business_types = T,
           include_sba = T,
           snake_names = F) {
    options(warn = -1)
    data <-
      download_excel_file(url = "https://www.afwerx.af.mil/resources/afwerx-portfolio.xlsx", has_col_names = T)

    data <-
      data %>%
      .munge_sbir_names()

    if (data %>% hasName("idOffice")) {
      data <- data %>% rename(groupTopic = idOffice)
    }


    if (data %>% hasName("keywordsTopicSBIR")) {
      data <- data %>% rename(keywordsSBIRCompany = keywordsTopicSBIR)
    }

    data <-
      data %>%
      mutate(idPhase = case_when(idPhase %>% str_detect("1") ~ "PHASE I",
                                 TRUE ~ "PHASE II"))

    data <- data %>%
      separate(
        nameCompany,
        sep = "\\(|D/B/A| DBA: | dba | DBA ",
        into = c("nameCompany", "nameCompanyDBA"),
        extra = "merge",
        fill = "right"
      ) %>%
      mutate(nameCompanyDBA = nameCompany %>% str_remove_all("\\)")) %>%
      mutate_if(is.character, str_squish)

    data <-
      data %>%
      separate(
        descriptionTopic,
        into = c('groupSolicitation', "nameSolicitation"),
        sep = "\\(",
        extra = "merge",
        fill = "right"
      ) %>%
      mutate(
        nameSolicitation = nameSolicitation %>% str_remove_all("\\)"),
        groupSolicitation = groupSolicitation %>% str_remove_all(" Phase 1| Phase 2|Phase I| P1| P2") %>% str_remove_all("\\.") %>% str_replace_all("^191", "AF191") %>%
          str_replace_all("^182", "AF182") %>% str_replace_all("^183", "AF183")
      ) %>%
      mutate_if(is.character, str_squish) %>%
      separate(
        groupSolicitation,
        sep = "\\-",
        remove = F,
        into = c("idOffice", "idCohort")
      ) %>%
      munge_data() %>%
      mutate(
        typeSolicitation = case_when(
          nameSolicitation %>% str_detect("OPEN TOPIC") ~ "OPEN TOPIC",
          nameSolicitation %>% str_detect("PITCH DAY") ~ "PITCH DAY",
          TRUE ~ "OTHER"
        )
      ) %>%
      mutate(id = 1:n()) %>%
      select(id, typeSolicitation, idProgram,
             idPhase, everything())

    data <- data %>%
      mutate(nameCompany = nameCompany %>% str_remove_all("/") %>% str_trim())



    data <- data %>%
      entities::refine_columns(entity_columns = "nameCompany")
    data <- data %>%
      select(
        id,
        idProgram,
        idPhase,
        typeSolicitation,
        nameSolicitation,
        groupSolicitation,
        idOffice,
        idCohort,
        nameCompanyClean,
        everything()
      )

    data <- data %>%
      mutate(nameCompanyClean = nameCompanyClean %>% str_remove_all("/") %>% str_trim())



    tbl_keywords <-
      data %>%
      select(id, keywordsSBIRCompany) %>%
      separate_rows(keywordsSBIRCompany, sep  = "\\,|u001a|\\: |\u001a ") %>%
      mutate_if(is.character, str_trim) %>%
      mutate(keywordsSBIRCompany = keywordsSBIRCompany  %>% stringi::stri_trans_general("Latin-ASCII")) %>%
      mutate_at("keywordsSBIRCompany",
                function(x) {
                  case_when(
                    x %in% c("AI", "ARTIFICIAL INTELLIGENCE, DEEP LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
                    x %in% c("AJ", "ANTI-JAM") ~ "ANTIJAM",
                    x == "EW" ~ "ELECTRONIC WARFARE",
                    x == "HPM" ~ "HIGH POWER MICROWAVE",
                    x == "ML" ~ "MACHINE LEARNING",
                    x %in% c("RF") ~ "RADIO FREQUENCY",
                    x %in% c("QCL", "QUANTUM COMPUTING") ~ "QUANTUM COMPUTATION",
                    x %in% c("AR") ~ "AUGMENTED REALITY",
                    TRUE ~ x
                  )
                }) %>%
      filter(!keywordsSBIRCompany %in% c("", "\u001a\u001a")) %>%
      mutate(keyword = keywordsSBIRCompany) %>%
      mutate(
        keyword = case_when(
          keyword %in% c("AI", "AUTOMATED AI") ~ "ARTIFICIAL INTELLIGENCE",
          keyword %in% c("AR") ~ "AUGMENTED REALITY",
          keyword %in% c("IOT") ~ "INTERNET OF THINGS",
          keyword %in% c("UAV", "UAS", "DRONES", "DRONE", "UNMANNED") ~ "Unmanned aerial vehicle" %>% str_to_upper(),
          keyword %in% c("FPGA") ~ "Field Programmable Gate Array" %>% str_to_upper(),
          keyword %in% c("RF") ~ "Radio Frequency" %>% str_to_upper(),
          keyword %in% c("ISR") ~ "Intelligence Surveillance and reconnaissance" %>% str_to_upper(),
          keyword %in% c("GAN") ~ "Generative adversarial network" %>% str_to_upper(),
          keyword %in% c("MWIR") ~ "Midwave Infrared" %>% str_to_upper(),
          keyword %in% c("IR") ~ "Infrared" %>% str_to_upper(),
          keyword %in% c("UUV") ~ "Unmanned underwater vehicle" %>% str_to_upper(),
          keyword %in% c("SIC") ~ "single silicon carbide",
          keyword %in% c("IED") ~ "Improvised explosive device",
          keyword %in% c("NDE") ~ "Non Destructive Evaluation",
          keyword %in% c("MMIC") ~ "Monolithic microwave integrated circuit",
          keyword %in% c("IMU") ~ "inertial measurement unit",
          keyword %in% c("ATR") ~ "assisted target recognition",
          keyword %in% c("GMTI") ~ "Ground Moving Target Indicator",
          keyword %in% "SIGINT" ~ "Signals intelligence",
          keyword %in% "MANET" ~ "Mobile Ad Hoc Networks",
          keyword %in% "ROIC" ~ "Readout Integrated Circuit",
          keyword %in% "MIMO" ~ "Multiple input multiple output",
          keyword %in% "GPU" ~ "Graphical Processing Unit",
          keyword %in% "HMD" ~ "head mounted displays",
          keyword %in% "SPINEL" ~ "strength transparent magnesium aluminate",
          keyword %in% "DACS" ~ "Divert and Attitude Control System",
          keyword %in% "COTS" ~ "Commercial Off The Shelf",
          keyword %in% "LED" ~ "light emitting diode",
          keyword %in% "SDR" ~ "Software Defined Radio",
          keyword %in% "ADC" ~ "Analog to Digital Conversion",
          keyword %in% "UGV" ~ "Unmanned Ground Vehicle",
          keyword %in% "JSF" ~ "Joint Strike Fighter",
          keyword %in% "MAV" ~ "Micro Air Vehicles",
          keyword %>% str_detect("ADDITIVE MANUFACTURE") ~ "ADDITIVE MANUFACTURE",
          keyword %>% str_detect("ADVANCED MATERIAL") ~ "ADVANCED MATERIALS",
          keyword %>% str_detect("ADVANCED MANUFACTURE|ADVANCED MANUFACTURING") ~ "ADVANCED MANUFACTURING",
          keyword %>% str_detect("^AI|AI/ML|SYSML|MACHINE LEARNING") ~ "ARTIFICIAL INTELLIGENCE",
          keyword %in% "ASIC" ~ "application specific integrated circuit",
          keyword %in% "CMC" ~ "Ceramic Matrix Composites",
          keyword %in% "MOCVD" ~ "METAL ORGANIC CHEMICAL VAPOR DEPOSITION",
          keyword %in% "MBE" ~ "the molecular beam epitaxy",
          keyword %in% "SOFC" ~ "Solid Oxide Fuel Cell",
          TRUE ~ keyword
        ),
        keyword = str_to_upper(keyword)
      )

    tbl_keywords <-
      tbl_keywords %>%
      group_by(id) %>%
      summarise(
        keywordAFWERX = unique(keywordsSBIRCompany) %>% sort() %>% str_c(collapse = " | "),
        keywordCompanyCleanAFWERX = unique(keyword) %>% sort() %>% str_c(collapse = " | ")
      ) %>%
      ungroup()

    data <-
      data %>%
      select(-keywordsSBIRCompany) %>%
      left_join(tbl_keywords, by = 'id') %>%
      distinct()

    data <- data %>%
      mutate(isOpenTopicAwardee = typeSolicitation %in% c("OPEN TOPIC") |
               groupSolicitation %>% str_detect("X")) %>%
      select(idProgram, isOpenTopicAwardee, everything())


    data <- data %>%
      mutate(
        url = urlCompany %>% str_remove_all("http://") %>% str_remove_all("http:/") %>% str_remove_all("https://|https//|www.") %>% str_squish() %>% map_chr(URLencode),
        urlCompany = str_c("https://", url, sep = "")
      ) %>%
      select(-url)

    data <- data %>%
      mutate(urlCompany = case_when(
        urlCompany %in% c("https://NA", "https://0") ~  NA_character_,
        TRUE ~ urlCompany
      )) %>%
      mutate(
        urlCompany = case_when(
          urlCompany == "https://.planeenglish.net" ~ "https://planeenglish.net",
          TRUE ~ urlCompany
        )
      )

    if (data %>% hasName("idOffice")) {
      data <-
        data %>%
        mutate(groupTopic = idOffice)
    }

    if (resolve_for_duns) {
      df_resolved <-
        afwerx_open_topic_duns %>%
        filter(!is.na(nameCompanyClean))

      data <-
        data %>%
        mutate(id = 1:n())

      df_matches <-
        seq_along(1:nrow(data)) %>%
        map_dfr(function(x) {
          companies <-
            data %>% slice(x) %>% select(matches("nameCompany")) %>%
            gather(item, value) %>%
            distinct(value) %>%
            filter(!is.na(value)) %>%
            pull() %>%
            str_c(collapse = "|")

          glue("Resolving {companies} for DUNS and registration data") %>% message()

          d <- df_resolved %>%
            filter(nameCompanyClean %>% str_detect(companies))
          if (nrow(d) == 0) {
            return(tibble(id = x))
          }

          d %>%
            select(idDUNS) %>%
            mutate(id = x)
        })

      df_matches <-
        df_matches %>%
        group_by(id) %>%
        mutate(count = n_distinct(idDUNS)) %>%
        ungroup() %>%
        filter(count == 1) %>%
        filter(!is.na(idDUNS)) %>%
        select(id, idDUNS)

      data <-
        data %>%
        select(-matches("idDUNS")) %>%
        left_join(df_matches, by = "id") %>%
        select(id:idCohort, idDUNS, everything())



      df_sam <- bulk_sam_entities()
      df_sam <-
        df_sam %>%
        entities::refine_columns(entity_columns = "nameCompanyLegal")


      df_missing <-
        data %>%
        filter(is.na(idDUNS)) %>%
        select(id, nameCompanyClean)


      df_new <-
        df_missing$id %>%
        map_dfr(function(x) {
          clean <-
            data %>%
            filter(id == x) %>% select(nameCompanyClean) %>%
            pull()

          companies <-
            data %>%
            filter(id == x) %>%
            select(matches("nameCompany")) %>%
            gather(item, value) %>%
            filter(!is.na(value)) %>%
            distinct(value) %>%
            pull() %>%
            str_c(collapse = "|^")

          glue("Resolving {companies} for DUNS and registration data") %>% message()

          search <- glue("^{companies}")

          d <-
            df_sam %>% filter(nameCompanyLegal == companies) %>%
            select(matches("idDUNS|nameCompany|date"))

          if (nrow(d) > 0) {
            d <-
              d %>%
              filter(dateRegistrationInitial == max(dateRegistrationInitial)) %>%
              select(idDUNS, nameCompanyLegalClean) %>%
              mutate(id = x,
                     nameCompanyClean = clean)
            return(d)
          }

          d <-
            df_sam %>%
            filter(nameCompanyLegal %>% str_detect(search)) %>%
            select(idDUNS, nameCompanyLegalClean) %>%
            mutate(id = x,
                   nameCompanyClean = clean)

          if (nrow(d) == 0) {
            return(tibble())
          }

          d
        })

      df_new <-
        df_new %>%
        group_by(id) %>%
        mutate(count = n_distinct(idDUNS)) %>%
        ungroup() %>%
        filter(count <= 3) %>%
        slice(1)

      data <-
        data %>%
        select(-matches("idDUNS")) %>%
        left_join(df_matches %>%
                    bind_rows(df_new %>% select(id, idDUNS)) %>%
                    distinct(),
                  by = "id") %>%
        select(id:idCohort, idDUNS, everything())

      df_details <-
        data %>%
        filter(!is.na(idDUNS)) %>%
        distinct(idDUNS) %>%
        left_join(df_sam %>% select(
          matches(
            "idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
          )
        ) %>%
          rename(urlCompanySAM = urlCompany),
        by = "idDUNS")

      remove_cols <-
        names(
          data)[names(data) %in% names(df_details)] %>%
        discard(function(x){
          x == "idDUNS"
        })

      data <-
        data %>%
        select(-one_of(remove_cols)) %>%
        left_join(df_details %>% rename(nameCompanyDBASAM = nameCompanyDBA)
                  , by = "idDUNS") %>%
        group_by(id) %>%
        slice(1) %>%
        ungroup()
    }

    if (data %>% hasName("nameCompanyClean")) {
      data <- data %>%
        mutate(
          idDUNS = case_when(
            nameCompanyClean %>% str_detect("CHESAPEAKE")  ~ 124240222,
            nameCompanyClean %>% str_detect("DVI GROUP") ~ 6317394,
            nameCompanyClean %>% str_detect("SHOCKTECH") ~ 8307977,
            nameCompanyClean %>% str_detect("THE DIFFERENCE A2C LLC") ~ 13750347,
            nameCompanyClean %>% str_detect("RESERVOIR LABS") ~ 22423854,
            nameCompanyClean %>% str_detect("ELDER RESEARCH") ~ 28211527,
            nameCompanyClean %>% str_detect("PLEX SOLUTIONS") ~ 117504011,
            nameCompanyClean %>% str_detect("WARRIOR CENTRIC") ~ 14217903,
            nameCompanyClean %>% str_detect("ZEBULON SCIENCES") ~ 117342435,
            nameCompanyClean %>% str_detect("YOU TUNE") ~ 117392260,
            nameCompanyClean %>% str_detect("XPLORE INC") ~ 81358031,
            nameCompanyClean %>% str_detect("AVILUTION") ~ 31980569,
            nameCompanyClean %>% str_detect("ESPIN TECHNOLOGIES") ~ 36255094,
            nameCompanyClean %>% str_detect("SPARKCOGNITION") ~ 41702227,
            nameCompanyClean %>% str_detect("VS MERLOT INC") ~ 51540760,
            nameCompanyClean %>% str_detect("FANTASY JOURNALIST") ~ 55329579,
            nameCompanyClean %>% str_detect("SURFACE OPTICS") ~ 64390719,
            nameCompanyClean %>% str_detect("ANALATOM") ~ 64744436,
            nameCompanyClean %>% str_detect("FLITE ADVANTAGE") ~ 78299725,
            nameCompanyClean %>% str_detect("VORTEX CONTROL") ~ 78718825,
            nameCompanyClean %>% str_detect("RE3D") ~ 78732259,
            nameCompanyClean %>% str_detect("NEARSPACE") ~ 78840927,
            nameCompanyClean %>% str_detect("ONVECTOR") ~ 78856808,
            nameCompanyClean %>% str_detect("BAZZE") ~ 79246108,
            nameCompanyClean %>% str_detect("EVERALBUM") ~ 79404516,
            nameCompanyClean %>% str_detect("FOCUS VENTURES") ~ 79440302,
            nameCompanyClean %>% str_detect("BMNT PARTNERS") ~ 79444802,
            nameCompanyClean %>% str_detect("SUDOTOUCH") ~ 79644764,
            nameCompanyClean %>% str_detect("FIGURE INC") ~ 79686017,
            nameCompanyClean %>% str_detect("CYBER 2020") ~ 80019157,
            nameCompanyClean %>% str_detect("THOUGHT OBJECT") ~ 80033961,
            nameCompanyClean %>% str_detect("INTENTIONET") ~ 80073865,
            nameCompanyClean %>% str_detect("NANOVMS") ~ 80183826,
            nameCompanyClean %>% str_detect("SAFE GROUP") ~ 80249783,
            nameCompanyClean %>% str_detect("OPERANT NETWORKS") ~ 80269860,
            nameCompanyClean %>% str_detect("JUST BOUNCE") ~ 80291548,
            nameCompanyClean %>% str_detect("LOCOAL CHARCOAL") ~ 80305090,
            nameCompanyClean %>% str_detect("ARIO LLC") ~ 80326845,
            nameCompanyClean %>% str_detect("PUEO BUSINESS") ~ 80380333,
            nameCompanyClean %>% str_detect("ASCEND INNOVATIONS") ~ 80413117,
            nameCompanyClean %>% str_detect("CROWDBOTICS") ~ 80469320,
            nameCompanyClean %>% str_detect("PLUGNIX") ~ 80504281,
            nameCompanyClean %>% str_detect("ONCLAVE") ~ 80517762,
            nameCompanyClean %>% str_detect("RAPID IMAGING") ~ 80665036,
            nameCompanyClean %>% str_detect("RESCUE ROVER") ~ 80756456,
            nameCompanyClean %>% str_detect("LLVR SYSTEMS") ~ 80788205,
            nameCompanyClean %>% str_detect("BATTLE SIGHT") ~ 80809286,
            nameCompanyClean %>% str_detect("DCODE") ~ 81117708,
            nameCompanyClean %>% str_detect("CONSTELLATION NETWORK") ~ 81162812,
            nameCompanyClean %>% str_detect("BRANDON") ~ 81296679,
            nameCompanyClean %>% str_detect("UBIQUITILINK") ~ 94459684,
            nameCompanyClean %>% str_detect("WWWCRIOTSOLUTIONSCOM") ~ 97211096,
            nameCompanyClean %>% str_detect("ATC - NY") ~ 101321479,
            nameCompanyClean %>% str_detect("HASHLIT") ~ 105377160,
            nameCompanyClean %>% str_detect("QUICK MED") ~ 107955085,
            nameCompanyClean %>% str_detect("OMITRON") ~ 108990649,
            nameCompanyClean %>% str_detect("FUTURIST INSTITUTE") ~ 117073942,
            nameCompanyClean %>% str_detect("BETA TECHNOLOGIES") ~ 117188780,
            nameCompanyClean %>% str_detect("SOLUTE") ~ 117938311,
            nameCompanyClean %>% str_detect("WANLYNX") ~ 118892723,
            nameCompanyClean %>% str_detect("CHESAPEAKE") ~ 124240222,
            nameCompanyClean %>% str_detect("CAROUSEL LABS LLC") ~ 117379887,
            nameCompanyClean %>% str_detect("451 TECH") ~ 32132320,
            nameCompanyClean %>% str_detect("LONGSHORTWAY") ~ 141877311,
            nameCompanyClean %>% str_detect("WEBSEC") ~ 144541625,
            nameCompanyClean %>% str_detect("ACREE") ~ 149397015,
            nameCompanyClean %>% str_detect("ARCHIEMD") ~ 157649471,
            nameCompanyClean %>% str_detect("ELPHEL") ~ 163315497,
            nameCompanyClean %>% str_detect("EMAGINE") ~ 165636205,
            nameCompanyClean %>% str_detect("FAAC INCORPORATED") ~ 175204163,
            nameCompanyClean %>% str_detect("HIKINO") ~ 197593788,
            nameCompanyClean %>% str_detect("TITUS HUMAN") ~ 557190209,
            nameCompanyClean %>% str_detect("AVASCENT") ~ 791497550,
            nameCompanyClean %>% str_detect("DIGIBEAM") ~ 798896564,
            nameCompanyClean %>% str_detect("DANGELO TECHNOLOGIES") ~ 809577443,
            nameCompanyClean %>% str_detect("STRATAGEM") ~ 831508903,
            nameCompanyClean %>% str_detect("THE DISTI") ~ 884814930,
            nameCompanyClean %>% str_detect("DARK WOLF SOLUTIONS") ~ 933645991,
            nameCompanyClean %>% str_detect("ZANSORS") ~ 962122052,
            nameCompanyClean %>% str_detect("MAINSTEM") ~ 963478297,
            nameCompanyClean %>% str_detect("METIS DESIGN") ~ 963612291,
            nameCompanyClean %>% str_detect("GENECAPTURE") ~ 964433838,
            nameCompanyClean %>% str_detect("CLEANNG") ~ 967838967,
            nameCompanyClean %>% str_detect("GIGAVATION") ~ 968402789,
            nameCompanyClean %>% str_detect("KENIFIC") ~ 969270474,
            TRUE ~ idDUNS
          )
        )

      data <- data %>%
        mutate(
          idDUNS = case_when(
            nameCompanyClean %>% str_detect("ACCESSWORKS") ~ 116812662,
            nameCompanyClean %>% str_detect("ALPHA-1 AEROSPACE") ~ 117000188,
            nameCompanyClean %>% str_detect("AMERICAN ECOTECH") ~ 91907811,
            nameCompanyClean %>% str_detect("APPLIED DYNAMICS CORPORATION") ~ 177217189,
            nameCompanyClean %>% str_detect("ARAGANTEAL") ~ 70346840,
            nameCompanyClean %>% str_detect("ARCHITECTURE TECHNOLOGY") ~ 52062833,
            nameCompanyClean %>% str_detect("ASSETAS") ~ 117340746,
            nameCompanyClean %>% str_detect("ASTRO DIGITAL") ~ 21940207,
            nameCompanyClean %>% str_detect("AUGMENTED TRAINING") ~ 117055781,
            nameCompanyClean %>% str_detect("AUGMENTIR") ~ 81115414,
            nameCompanyClean %>% str_detect("AVISARE") ~ 80138734,
            nameCompanyClean %>% str_detect("AVOCADO") ~ 80168687,
            nameCompanyClean %>% str_detect("BLIND TIGER") ~ 74580285,
            nameCompanyClean %>% str_detect("BUBO LEARNING") ~ 80076103,
            nameCompanyClean %>% str_detect("CAMGIAN MICROSYSTEMS") ~ 791134542,
            nameCompanyClean %>% str_detect("CENTIL") ~ 80439080,
            nameCompanyClean %>% str_detect("CESIUMASTRO") ~ 80537547,
            nameCompanyClean %>% str_detect("COBALT SOLUTIONS") ~ 117394919,
            nameCompanyClean %>% str_detect("CODER") ~ 80781662,
            nameCompanyClean %>% str_detect("DAXOR") ~ 79725288,
            nameCompanyClean %>% str_detect("DISCOVERY MACHINE") ~ 128516114,
            nameCompanyClean %>% str_detect("DIVERSIFIED 3D") ~ 81469562,
            nameCompanyClean %>% str_detect("FLIPPER") ~ 81107705,
            nameCompanyClean %>% str_detect("GANTZ-MOUNTAIN") ~ 79381784,
            nameCompanyClean %>% str_detect("GIGSTER") ~ 79830815,
            nameCompanyClean %>% str_detect("GLOBAL AIR LOGISTICS") ~ 80243169,
            nameCompanyClean %>% str_detect("GREEN REVOLUTION") ~ 828680186,
            nameCompanyClean %>% str_detect("GUIDED PARTICLE") ~ 80190025,
            nameCompanyClean %>% str_detect("HATCHBED") ~ 117150209,
            nameCompanyClean %>% str_detect("HODLPAL") ~ 117346223,
            nameCompanyClean %>% str_detect("INFEITER") ~ 117414624,
            nameCompanyClean %>% str_detect("JOYLAB") ~ 79642134,
            nameCompanyClean %>% str_detect("KANEY") ~ 603280723,
            nameCompanyClean %>% str_detect("MALLINDA") ~ 79523497,
            nameCompanyClean %>% str_detect("MICROSURGEONBOT") ~ 80945043,
            nameCompanyClean %>% str_detect("NEXTGEN BALANCING") ~ 80579265,
            nameCompanyClean %>% str_detect("OPTTEK") ~ 808711803,
            nameCompanyClean %>% str_detect("ORIONS") ~ 78488412,
            nameCompanyClean %>% str_detect("OUTERLINK") ~ 782809768,
            nameCompanyClean %>% str_detect("OUTSIDE ANALYTICS") ~ 80880218,
            nameCompanyClean %>% str_detect("P3 TECHNOLOGIES") ~ 80767240,
            nameCompanyClean %>% str_detect("PARAGON ROBOTICS") ~ 806514985,
            nameCompanyClean %>% str_detect("PARASANTI") ~ 43612924,
            nameCompanyClean %>% str_detect("PHOSPHORUS") ~ 20017457,
            nameCompanyClean %>% str_detect("PLANCK AEROSYSTEMS") ~ 79592990,
            nameCompanyClean %>% str_detect("PROGRESSIVE TECHNOLOGY") ~ 836998120,
            nameCompanyClean %>% str_detect("PTERODYNAMICS") ~ 80974858,
            nameCompanyClean %>% str_detect("R-STOR") ~ 80666106,
            nameCompanyClean %>% str_detect("RADARLOCK") ~ 54667383,
            nameCompanyClean %>% str_detect("RDA PREP") ~ 117112463,
            nameCompanyClean %>% str_detect("RE:3D INC") ~ 78732259,
            nameCompanyClean %>% str_detect("RHEA SPACE") ~ 81318218,
            nameCompanyClean %>% str_detect("SANGJEN") ~ 117305658,
            nameCompanyClean %>% str_detect("SC2") ~ 122963112,
            nameCompanyClean %>% str_detect("SENTECOR") ~ 78667179,
            nameCompanyClean %>% str_detect("SIERRA PACIFIC") ~ 7032076,
            nameCompanyClean %>% str_detect("SIGNAL FRAME") ~ 79669372,
            nameCompanyClean %>% str_detect("SIMX") ~ 81046320,
            nameCompanyClean %>% str_detect("SITCH AI") ~ 81117941,
            nameCompanyClean %>% str_detect("SIX STAR SERVICES") ~ 116812953,
            nameCompanyClean %>% str_detect("SOLIDDD") ~ 52833516,
            nameCompanyClean %>% str_detect("SPARKCOGNTION") ~ 41702227,
            nameCompanyClean %>% str_detect("SPECIAL OPERATIONS SUPPLY") ~ 81331701,
            nameCompanyClean %>% str_detect("SPOTLIGHT LABS") ~ 557569626,
            nameCompanyClean %>% str_detect("SUNSHOWERIO") ~ 81529416,
            nameCompanyClean %>% str_detect("SUPERIOR FLOAT TANKS") ~ 79936042,
            nameCompanyClean %>% str_detect("SYLABS") ~ 81320655,
            nameCompanyClean %>% str_detect("TEMPLE MASSAGER") ~ 828723895,
            nameCompanyClean %>% str_detect("TETRATEIO") ~ 97133609,
            nameCompanyClean %>% str_detect("TEXAS RESEARCH INSTITUTE") ~ 625120902,
            nameCompanyClean %>% str_detect("TGV ROCKETS") ~ 799713594,
            nameCompanyClean %>% str_detect("INFORMATICS APPLICATIONS") ~ 65245750,
            nameCompanyClean %>% str_detect("WHITE HOUSE PARTNERS") ~ 80131333,
            nameCompanyClean %>% str_detect("THIRD INSIGHT") ~ 70916951,
            nameCompanyClean %>% str_detect("THORTACTICAL") ~ 117057967,
            nameCompanyClean %>% str_detect("TONICAI") ~ 111766248,
            nameCompanyClean %>% str_detect("TRIFACTA") ~ 79275791,
            nameCompanyClean == "VAN" ~ 614374643,
            nameCompanyClean %>% str_detect("VERSTAAN") ~ 117400888,
            nameCompanyClean %>% str_detect("VICTOR DAROLFI") ~ 117325916,
            nameCompanyClean %>% str_detect("VIGILENT") ~ 78462380,
            nameCompanyClean %>% str_detect("VITA INCLINATA") ~ 81135866,
            nameCompanyClean %>% str_detect("XENESIS") ~ 81028906,
            nameCompanyClean %>% str_detect("HNU PHOTONICS") ~ 788623473,

            nameCompanyClean %>% str_detect("ALPHA RECON") ~ 79495048,
            nameCompanyClean %>% str_detect("APPLIED DYNAMICS") ~ 76387984,
            nameCompanyClean %>% str_detect("BEAR SYSTEMS") ~ 116902757,
            nameCompanyClean %>% str_detect("ELEMENO HEALTH") ~ 80438162,
            nameCompanyClean %>% str_detect("FOCUSMOTION") ~ 79440302,
            nameCompanyClean %>% str_detect("GIGXR") ~ 117258144,
            nameCompanyClean %>% str_detect("INTELESENSE TECHNOLOGIES") ~ 80921977,
            nameCompanyClean %>% str_detect("MCP GOPV") ~ 22567908,
            nameCompanyClean %>% str_detect("MEMCOMPUTNG INC") ~ 81259607,
            nameCompanyClean %>% str_detect("NEIL FINEMAN") ~ 78825664,
            nameCompanyClean %>% str_detect("PRAMOD") ~ 81263169,
            nameCompanyClean %>% str_detect("SIGNALFRAME") ~ 79669372,
            nameCompanyClean %>% str_detect("TAEKION") ~ 605505531,
            TRUE ~ idDUNS
          )
        )
    }
    has_sam <- 'df_sam' %>% exists()

    if (has_sam) {
      df_missing_names <-
        data %>%
        filter(is.na(idDUNS)) %>%
        select(nameCompanyClean) %>%
        count(nameCompanyClean, sort = T) %>%
        filter(!is.na(nameCompanyClean))

      df_missing_names <-
        df_missing_names %>%
        count(nameCompanyClean, sort = T) %>%
        filter(!is.na(nameCompanyClean))

      df_missing_names <-
        df_missing_names %>%
        left_join(df_sam %>% select(
          matches(
            "idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
          )
        ) %>%
          mutate(nameCompanyClean = nameCompanyLegal),
        by = "nameCompanyClean") %>%
        select(-n)

      df_matches <-
        df_missing_names %>% filter(!is.na(dateSAMData)) %>%
        group_by(idDUNS) %>%
        filter(dateCompanyStart == max(dateCompanyStart)) %>%
        ungroup()

      if (nrow(df_matches) > 0) {
        data <- data %>%
          filter(nameCompanyClean %in% df_matches$nameCompanyClean) %>%
          select(-idDUNS) %>%
          left_join(df_matches %>% select(nameCompanyClean, idDUNS), by = "nameCompanyClean") %>%
          bind_rows(data %>% filter(!nameCompanyClean %in% df_matches$nameCompanyClean)) %>%
          group_by(id) %>%
          slice(1) %>%
          ungroup()

      }

      df_details <-
        data %>%
        filter(!is.na(idDUNS)) %>%
        distinct(idDUNS) %>%
        left_join(
          df_sam %>% select(
            matches(
              "idDUNS|slugCAGE|nameCompany|urlCompany|locationCompany|emailPointOfContactElectronicBusiness|date"
            )
          ) %>%
            rename(urlCompanySAM = urlCompany,
                   nameCompanySAM = nameCompanyLegalClean),
          by = "idDUNS"
        )

      remove_cols <-
        names(
          data)[names(data) %in% names(df_details)] %>%
        discard(function(x){
          x == "idDUNS"
        })

      data <-
        data %>%
        select(-one_of(remove_cols)) %>%
        select(-one_of("nameCompanyDBASAM")) %>%
        left_join(df_details %>% rename(nameCompanyDBASAM = nameCompanyDBA)
                  , by = "idDUNS")

      data <- data %>%
        group_by(id) %>%
        slice(1) %>%
        ungroup() %>%
        select(-matches("urlFAR|urlDF"))

      data <- data %>%
        separate(
          emailPointOfContactElectronicBusiness,
          into = c("remove",
                   "urlPointOfContact"),
          sep = "\\@",
          remove = F
        ) %>%
        select(-remove)

      data <- data %>%
        mutate(
          urlPointOfContact = str_c("https://", urlPointOfContact, sep =
                                      ""),
          urlCompany = case_when(
            is.na(urlCompany) & !is.na(urlCompanySAM) ~ urlCompanySAM,
            TRUE ~ urlCompanySAM
          )
        )

      data <-
        data %>%
        mutate(
          urlCompany = case_when(is.na(urlCompany) ~ urlPointOfContact,
                                 TRUE ~ urlCompany),
          urlCompany = case_when(
            urlCompany %>% str_detect("gmail") ~ NA_character_,
            TRUE ~ urlCompany
          )
        )

      data <- data %>%
        mutate(hasDUNS = !is.na(idDUNS))

    }

    matched_duns <-
      data %>% filter(!is.na(idDUNS)) %>% distinct(idDUNS) %>% pull()

    if (has_sam & include_naics) {
      df_naics <-
        df_sam %>%
        filter(idDUNS %in% matched_duns) %>%
        select(idDUNS, idNAICSPrimary,nameNAICSPrimary, nameIndustryGroupNAICS, countNAICS, countNAICSExceptions, dataNAICS)

      if (snake_names) {
        df_naics <- df_naics %>%
          unnest() %>%
          clean_names() %>%
          group_by(
            id_duns,
            id_naics_primary,
            name_naics_primary,
            name_industry_group_naics,
            is_primary_naics,
            count_naics,
            count_naics_exceptions
          ) %>%
          nest() %>%
          rename(data_naics = data)

        data <-
          data %>%
          left_join(df_naics, by = c("idDUNS" = "id_duns")) %>%
          mutate(has_naics = T)
      } else {
        data <- data %>%
          left_join(df_naics, by = "idDUNS") %>%
          mutate(hasNAICS = T)
      }
      data <- data %>%
        select(-matches("^data"), everything())
    }

    if (has_sam & include_psc) {
      df_psc <-
        df_sam %>%
        filter(idDUNS %in% matched_duns) %>%
        select(idDUNS, countProductServiceCodes, dataProductServiceCodes)

      if (snake_names) {
        df_psc <-
          df_psc %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns,
                   count_product_service_codes) %>%
          nest() %>%
          rename(data_psc = data)

        data <-
          data %>%
          left_join(df_psc, by = c("idDUNS" = "id_duns")) %>%
          mutate(has_product_service_codes = T)
      } else {
        data <-
          data %>%
          left_join(df_psc, by = "idDUNS") %>%
          mutate(hasProductServiceCodes = T)
      }
      data <- data %>%
        select(-matches("^data"), everything())
    }

    if (has_sam & include_business_types) {
      df_bt <-
        df_sam %>%
        filter(idDUNS %in% matched_duns) %>%
        select(idDUNS, countBusinessTypes, dataBusinessTypes)

      if (snake_names) {
        df_bt <-
          df_bt %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns,
                   count_business_types) %>%
          nest() %>%
          rename(data_business_types = data)

        data <-
          data %>%
          left_join(df_bt, by = c("idDUNS" = "id_duns")) %>%
          mutate(has_business_types = T)
      } else {
        data <-
          data %>%
          left_join(df_bt, by = "idDUNS") %>%
          mutate(hasBusinessTypes = T)
      }
      data <- data %>%
        select(-matches("^data"), everything())
    }

    if (has_sam & include_sba) {
      df_sba <-
        df_sam %>%
        filter(idDUNS %in% matched_duns) %>%
        select(idDUNS, countSBATypes, dataSBA)

      if (snake_names) {
        df_sba <-
          df_sba %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns,
                   count_sba_types) %>%
          nest() %>%
          rename(data_sba_types = data)

        data <-
          data %>%
          left_join(df_bt, by = c("idDUNS" = "id_duns")) %>%
          mutate(has_sba_types = T)
      } else {
        data <-
          data %>%
          left_join(df_sba, by = "idDUNS") %>%
          mutate(hasSBATypes = T)
      }
      data <- data %>%
        select(-matches("^data"), everything())
    }


      if (only_open_topic) {
        data <- data %>% filter(isOpenTopicAwardee)
      }


    if (snake_names) {
      data <- clean_names(data)
    }

    data
  }

.sbir_afwerex_cached <-
  memoise::memoise(function(snake_names = T, only_open_topic = F) {
    data <- read_rda("https://asbcllc.com/r_packages/govtrackR/data/afwerx_portfolio.rda")
    data <- data %>%
      mutate_if(is.logical,
                list(function(x){
                  case_when(is.na(x) ~ FALSE,
                            TRUE ~ x)
                }))

    if (only_open_topic) {
      data <- data %>%
        filter(isOpenTopicAwardee)
    }
    if (!snake_names) {
      return(data)
    }
    data <- clean_names(data)

    if (data %>% hasName("data_naics")) {
      df_naics <- data %>%
        select(id_duns, data_naics) %>%
        unnest() %>%
        clean_names() %>%
        group_by(id_duns) %>%
        nest() %>%
        ungroup() %>%
        rename(data_naics = data)
      data <- data %>%
        select(-data_naics) %>%
        left_join(df_naics, by = "id_duns") %>%
        select(-matches("data"), everything())
    }

    if (data %>% hasName("data_product_service_codes")) {
      df_psc <- data %>%
        select(id_duns, data_product_service_codes) %>%
        unnest() %>%
        clean_names() %>%
        group_by(id_duns) %>%
        nest() %>%
        ungroup() %>%
        rename(data_product_service_codes = data)
      data <- data %>%
        select(-data_product_service_codes) %>%
        left_join(df_psc, by = "id_duns") %>%
        select(-matches("data"), everything())
    }

    if (data %>% hasName("data_business_types")) {
      df_bt <- data %>%
        select(id_duns, data_business_types) %>%
        unnest() %>%
        clean_names() %>%
        group_by(id_duns) %>%
        nest() %>%
        ungroup() %>%
        rename(data_business_types = data)
      data <- data %>%
        select(-data_business_types) %>%
        left_join(df_bt, by = "id_duns") %>%
        select(-matches("data"), everything())
    }

    if (data %>% hasName("data_sba")) {
      df_sba <-
        data %>%
        select(id_duns, data_sba) %>%
        unnest() %>%
        clean_names() %>%
        group_by(id_duns) %>%
        nest() %>%
        ungroup() %>%
        rename(data_sba = data)
      data <- data %>%
        select(-data_sba) %>%
        left_join(df_sba, by = "id_duns") %>%
        select(-matches("data"), everything())
    }

    data

  })

#' SBIR AFWERX Companies
#'
#' Acquires and resolves information about
#'
#' @param resolve_for_duns if \code{TRUE} resolves for DUNS and other missing information
#' @param snake_names
#' @param only_open_topic
#' @param use_cached
#' @param include_naics
#' @param include_psc
#' @param include_sba
#' @param include_business_types
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' sbir_afwerx_portfolio()
sbir_afwerx_portfolio <-
  function(use_cached = F,
           resolve_for_duns = T,
           only_open_topic = F,
           include_naics = F,
           include_psc = F,
           include_sba = T,
           include_business_types = T,
           snake_names = T)  {
    if (use_cached) {
      data <- .sbir_afwerex_cached(snake_names = snake_names, only_open_topic = only_open_topic)
      return(data)
    }

    .sbir_afwerx_portfolio(resolve_for_duns = resolve_for_duns,
                           only_open_topic = only_open_topic,
                           include_naics = include_naics,
                           include_psc = include_psc,
                           snake_names = snake_names,
                           include_business_types = include_business_types,
                           include_sba = include_sba)
  }


#' Join DOD SBIR/STTR data to input tibble
#'
#' @param data input tibble to join dod data to
#' @param duns_column name of the duns column
#' @param dod_select_columns if not `NULL` subset of columns
#' from DOD API to join
#'
#' @return
#' @export
#'
#' @examples
tbl_dod_sbir_duns_data <-
  function(data,
           duns_column = NULL,
           clean_contracts = T,
           dod_select_columns = NULL) {
    if (length(duns_column) == 0) {
      "Enter DUNS column" %>% message()
      return(invisible())
    }

    if (!data %>% hasName(duns_column)) {
      glue("{duns_column} missing from data") %>% message()
      return(invisible())
    }

    duns <-
      data %>%
      filter(!is.na(!!sym(duns_column))) %>%
      distinct(!!sym(duns_column)) %>%
      pull()

    tbl_dod <-
      dod_sbir_historic(snake_names = T) %>%
      filter(id_duns %in% duns)



    if (length(dod_select_columns) > 0) {
      col_slugs <-
        c("id_duns", dod_select_columns) %>% unique() %>% str_c(collapse = "|")

      tbl_dod <- tbl_dod %>% select(matches(col_slugs))

    }

    tbl_dod <-
      tbl_dod %>%
      group_by(id_duns) %>%
      mutate_if(is.numeric, as.numeric) %>%
      nest() %>%
      rename(UQ(duns_column) := id_duns, data_dod_sbirs = data) %>%
      ungroup() %>%
      mutate(has_dod_sbir = T,
             count_dod_sbir = data_dod_sbirs %>% map_dbl(nrow)) %>%
      select(-data_dod_sbirs, everything())

    data <-
      data %>%
      left_join(tbl_dod, by = duns_column) %>%
      mutate(has_dod_sbir = has_dod_sbir %>% coalesce(F))

    data <- data %>% select(-matches("data_"), everything())

    data

  }

.tbl_sbir_awards <-
  function(data,
           only_open_topic = T,
           sbir_columns = c("nameAward",
                            "descriptionAward"),
           dod_text_columns = c(
             "name_topic",
             "name_award",
             "description_award",
             "description_narrative_agency",
             "description_topic",
             "description_topic_rationale",
             "description_principal_benefit",
             "topic_phase1",
             "topic_phase2",
             "topic_phase3"
           ),
           join_sam_data = F,
           snake_names = T
           ) {
  tbl_afwerx <-
    sbir_afwerx_portfolio(resolve_for_duns = T, snake_names = F, only_open_topic = only_open_topic)

  matched_duns <- tbl_afwerx %>%
    filter(!is.na(idDUNS)) %>%
    distinct(idDUNS) %>%
    pull()

  tbl_afwerx <- tbl_afwerx %>%
    clean_names()

  df_sbir_historic <- sbirs_historic()
  df_sbir_historic <-
    df_sbir_historic %>%
    filter(idDUNS %in% matched_duns) %>%
    select(
      idDUNS,
      nameCompanySBIR = nameCompanyClean,
      idContractResolved,
      idSBIR,
      dateAward,
      amountContract,
      nameAgency,
      nameBranch,
      one_of(sbir_columns)
    )

  df_sbir_historic <-
    df_sbir_historic %>%
    create_text_block(id_column = "idSBIR", text_columns = sbir_columns) %>%
    select(-one_of(sbir_columns)) %>%
    clean_names() %>%
    group_by(id_duns) %>%
    nest() %>%
    rename(data_sbir = data) %>%
    mutate(count_sbir_website = data_sbir %>% map_dbl(nrow),
           has_sbirs_website = T) %>%
    select(-matches("data"), everything()) %>%
    ungroup()

  df_dod_duns <-
    dod_sbir_historic(snake_names = T) %>%
    filter(id_duns %in% matched_duns)

  df_dod_duns <-
    df_dod_duns %>%
    select(
      group_solicitation,
      id_duns,
      id_contract_analysis,
      date_award,
      amount_contract,
      keywords_sbir_company,
      keywords_topic_sbir,
      name_company_dod = name_company_clean,
      one_of(dod_text_columns)
    ) %>%
    mutate_if(
      is.character,
      list(function(x){
        x %>% str_remove_all("\u001a") %>% str_squish()
      })) %>%
    create_text_block(id_column = "id_contract_analysis", text_columns = dod_text_columns) %>%
    select(-matches("id_phase")) %>%
    select(-one_of(dod_text_columns))

  data_all_keywords <-
    df_dod_duns %>%
    select(id_duns, matches("keyword")) %>%
    left_join(tbl_afwerx %>% select(id_duns, name_company_clean), by = "id_duns") %>%
    gather(type, keyword, -c(id_duns, name_company_clean)) %>%
    separate_rows(keyword, sep = "\\|") %>%
    mutate(keyword = str_squish(keyword)) %>%
    mutate(type = type %>% str_remove_all("keywords_")) %>%
    bind_rows(
      tbl_afwerx %>%
        select(id_duns, name_company_clean, afwerx = keyword_company_clean_afwerx) %>%
        gather(type, keyword, -c(id_duns, name_company_clean)) %>%
        separate_rows(keyword, sep = "\\|") %>%
        mutate(keyword = str_squish(keyword))

    ) %>%
    distinct() %>%
    group_by(id_duns, name_company_clean) %>%
    nest() %>%
    rename(data_keywords = data) %>%
    ungroup() %>%
    mutate(has_keywords = T,
           count_keywords = data_keywords %>% map_dbl(nrow))


  tbl_text_block <-
    df_dod_duns %>%
    select(id_duns, text = text_contract_analysis) %>%
    bind_rows(
      df_sbir_historic %>% unnest_legacy() %>% filter(!id_contract_resolved %in% df_dod_duns$id_contract_analysis) %>%
        select(id_duns, text = text_sbir)
    ) %>%
    left_join(tbl_afwerx %>% select(id_duns, name_company_clean), by = "id_duns") %>%
    distinct() %>%
    filter(!is.na(text)) %>%
    group_by(id_duns, name_company_clean) %>%
    summarise(text = str_c(text, collapse = " ")) %>%
    ungroup() %>%
    mutate(has_text_block = T)

  df_dod_duns <- df_dod_duns %>%
    group_by(id_duns) %>%
    nest() %>%
    rename(data_dod_awards = data) %>%
    mutate(has_dod_sbir = T) %>%
    ungroup()


  tbl_afwerx <-
    tbl_afwerx %>% group_by(id_duns, name_company_clean) %>% nest() %>%
    rename(data_afwerx = data)

  data <-
    list(tbl_afwerx,
         data_all_keywords,
         tbl_text_block,
         df_dod_duns,
         df_sbir_historic) %>%
    reduce(left_join) %>%
    mutate_if(is.logical,
              list(function(x){
                case_when(is.na(x) ~ F,
                          TRUE ~ x)
              })) %>%
    ungroup()

  data

  }

#' Assembled SBIR data from source
#'
#' @param data
#' @param data_name
#' @param remove_columns
#' @param duns_column
#' @param exclude_dod_matches
#' @param sbir_id_column
#' @param sbir_select_columns
#' @param sbir_text_columns
#' @param dod_id_column
#' @param dod_select_columns
#' @param dod_text_columns
#' @param url_column
#' @param include_website_crux
#' @param crawl_websites
#' @param exclude_topic_keywords
#' @param exclude_custom_keywords
#' @param join_sam_data
#' @param stay_on_site
#' @param use_future
#' @param snake_names
#' @param assign_to_environment
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' df <-
#' govtrackR::tbl_sbir_assemble(data = sbir_afwerx_portfolio(use_cached = T, snake_names = T), data_name = "afwerx", crawl_websites = F, include_website_crux = T)
#' }
tbl_sbir_assemble <-
  function(data,
           data_name = NULL,
           data_id = "id",
           nest_original_data = T,
           remove_columns = "keyword_afwerx",
           duns_column = "id_duns",
           exclude_dod_matches = T,
           sbir_id_column = "idSBIR",
           sbir_select_columns = c(
             "idSBIR",
             "idDUNS",
             "dateAward",
             "idAgencyTracking",
             "idContractResolved",
             "idSolicitation",
             "nameAgency",
             "nameBranch",
             "amountContract",
             "namePrincipal"
           ),
           sbir_text_columns = c("nameAward",
                                 "descriptionAward"),
           dod_id_column = "id_contract_analysis",
           dod_select_columns = c(
             "id_duns",
             "name_office",
             "slug_agency",
             "name_full_point_of_contact_sttr",
             "group_solicitation",
             "keywords_sbir_company",
             "keywords_topic_sbir",
             "date_award_dod",
             "amount_contract",
             "name_company_dod_clean",
             "longitude_company",
             "latitude_company",
             "slug_acquisition_program_name",
             "count_patents_company",
             "count_employees_current"
           ),
           dod_text_columns = c(
             "name_topic",
             "name_award",
             "description_award",
             "description_narrative_agency",
             "description_topic",
             "description_principal_benefit"
           ),
           url_column = "url_company",
           exclude_urls = c("google", "outlook", "yahoo", "protonmail"),
           include_website_crux = T,
           crawl_websites = F,
           exclude_topic_keywords = F,
           exclude_custom_keywords = F,
           join_sam_data = F,
           stay_on_site = T,
           use_future  = F,
           snake_names = T,
           assign_to_environment = T) {

    if (length(remove_columns) > 0) {
      data <-
        data %>%
        select(-one_of(remove_columns))
    }
    if (snake_names) {
      data <- data %>% clean_names()
      duns_column <- duns_column %>% make_clean_names()
    }

    if (join_sam_data) {
      data <-
        data %>% tbl_sam_data(duns_column = duns_column, snake_names = snake_names)
    }

    duns_to_match <- data %>%
      filter(!is.na(!!sym(duns_column))) %>%
      select(one_of(duns_column)) %>%
      distinct() %>%
      pull()

    tbl_dod_sbirs <-
      dod_sbir_historic(
        snake_names = T,
        create_text_description = F,
        unformat = T
      )

    tbl_dod_sbirs <-
      tbl_dod_sbirs %>%
      rename(name_company_dod_clean = name_company_clean) %>%
      rename(date_award_dod = date_award) %>%
      filter(id_duns %in% duns_to_match) %>%
      select(one_of(c(
        dod_id_column, dod_select_columns, dod_text_columns
      ))) %>%
      mutate_if(
        is.character,
        list(function(x){
          x %>% str_remove_all("\u001a") %>% str_squish()
        })) %>%
      create_text_block(id_column = dod_id_column, text_columns = dod_text_columns
      )

    tbl_keywords <-
      tbl_dod_sbirs %>%
      select(one_of(duns_column, dod_id_column), matches("keyword")) %>% rename(id := UQ(dod_id_column)) %>%
      gather(type_keyword, keyword, -c(duns_column, id)) %>%
      mutate(type_keyword = type_keyword %>% str_remove_all("^keywords_|^keyword_"),
             id = as.character(id)) %>%
      separate_rows(keyword, sep = "\\|") %>%
      mutate_if(is.character,
                list(function(x){
                  x %>% str_squish()
                }))

    tbl_texts <-
      tbl_dod_sbirs %>%
      select(id_duns := UQ(duns_column),
             id := UQ(dod_id_column),
             text = text_contract_analysis) %>%
      mutate(source = "DOD") %>%
      select(source, everything())

    if (assign_to_environment) {
      assign('tbl_dod_sbirs', value = tbl_dod_sbirs, envir = .GlobalEnv)
    }



    if (exclude_topic_keywords) {
      tbl_keywords <-
        tbl_keywords %>%
        filter(type_keyword != "topic_sbir")
    }

    key_col <- data %>% select(matches("keyword")) %>% names()

    if (length(key_col) & !exclude_custom_keywords) {
      df_new_keywords <-
        data %>%
        select(one_of(duns_column, data_id), matches("keyword")) %>%
        rename(id := UQ(data_id)) %>%
        gather(type_keyword, keyword, -c(duns_column, id)) %>%
        mutate(type_keyword = type_keyword %>% str_remove_all("^keywords_|^keyword_"),
               id = as.character(id)) %>%
        separate_rows(keyword, sep = "\\|") %>%
        mutate_if(is.character,
                  list(function(x){
                    x %>% str_squish()
                  }))

      if (nrow(df_new_keywords) > 0) {
        tbl_keywords <-
          tbl_keywords %>%
          bind_rows(df_new_keywords) %>%
          arrange(!!sym(duns_column))
      }
    }
    tbl_keywords <-
      tbl_keywords %>% filter(!is.na(keyword)) %>%
      filter(!is.na(!!sym(duns_column))) %>%
      filter(keyword != "")
    if (assign_to_environment & nrow(tbl_keywords) > 0) {
      assign(x = "tbl_keywords_sbirs", tbl_keywords, envir = .GlobalEnv)
    }

    matched_dod_contracts <-
      tbl_dod_sbirs %>% distinct(!!sym(dod_id_column)) %>%
      pull()

    tbl_sbirs <- sbirs_historic()

    tbl_sbirs <-
      tbl_sbirs %>%
      filter(idDUNS %in% duns_to_match) %>%
      select(one_of(sbir_id_column, sbir_select_columns, sbir_text_columns)) %>%
      create_text_block(id_column = sbir_id_column, text_columns = sbir_text_columns)


    if (length(matched_dod_contracts) > 0 &
        exclude_dod_matches) {
      if (tbl_sbirs %>% hasName("idContractResolved")) {
        tbl_sbirs <- tbl_sbirs %>%
          filter(!idContractResolved %in% matched_dod_contracts)
      }
    }


    if (snake_names) {
      tbl_sbirs <- tbl_sbirs %>% clean_names()
      sbir_id_column <- make_clean_names(sbir_id_column)
    }

    if (nrow(tbl_dod_sbirs) > 0) {
      text_col <- tbl_sbirs %>% select(matches("text_")) %>% names()


      tbl_texts <- tbl_texts %>%
        bind_rows(
          tbl_sbirs %>%
            select(
              id_duns := UQ(duns_column),
              id := UQ(sbir_id_column),
              text := UQ(text_col)
            ) %>%
            mutate(source = "SBA",
                   id = as.character(id)) %>%
            select(source, everything())
        )
    }

    if (assign_to_environment & nrow(tbl_sbirs) > 0) {
      assign("tbl_sba_sbirs", tbl_sbirs, envir = .GlobalEnv)
    }


    if (data %>% hasName(url_column) & include_website_crux) {
      df_urls <-
        data %>%
        select(one_of(duns_column, url_column)) %>%
        filter(!is.na(!!sym(url_column))) %>%
        distinct()

      if (length(exclude_urls) > 0) {
        df_urls <- df_urls %>%
          filter(!(!!sym(url_column) %>% str_detect(str_c(
            exclude_urls, collapse = "|"
          ))))
      }

      df_crux <-
        tbl_crux(data = df_urls,
                 snake_names = snake_names,
                 url_column = url_column)

      df_crux <- df_crux %>% munge_data(snake_names = snake_names)

      df_crux <- df_crux %>%
        mutate_if(
          is.character,
          list(function(x){
            x %>% str_remove_all("\u001a|\U001A") %>% str_squish()
          }))

      tbl_texts <-
        tbl_texts %>%
        bind_rows(
          df_crux %>%
            filter(!is.na(description_site_text)) %>%
            select(id_duns, id = url_company, text = description_site_text) %>%
            mutate(source = "CRUX")
        )

      if (assign_to_environment & nrow(df_crux) > 0)
      {
        assign("tbl_crux_sbir", df_crux, envir = .GlobalEnv)
      }

    }

    if (data %>% hasName(url_column) & crawl_websites) {
      df_urls <-
        data %>%
        select(one_of(duns_column, url_column)) %>%
        filter(!is.na(!!sym(url_column))) %>%
        distinct()

      if (length(exclude_urls) > 0) {
        df_urls <- df_urls %>%
          filter(!(!!sym(url_column) %>% str_detect(str_c(
            exclude_urls, collapse = "|"
          ))))
      }

      df_crawled <-
        sheldon::crawl_for_urls(
          urls = df_urls %>% pull(url_column),
          stay_on_site = stay_on_site,
          use_future = use_future
        )

      df_crawled <- df_crawled %>%
        filter(!is.na(url_reference))

      df_crawled_text <-
        sheldon::scrape_text(urls = df_crawled$url_reference)

      df_crawled_text <-
        df_crawled_text %>%
        select(url, text_site = text) %>%
        rename(url_reference = url) %>%
        left_join(df_crawled) %>%
        rename(UQ(url_column) := url) %>%
        left_join(df_urls) %>%
        fill(duns_column) %>%
        filter(!is.na(!!sym(duns_column)))

      df_site <-
        df_crawled_text %>%
        filter(!is.na(url_reference)) %>%
        filter(!is.na(text_site)) %>%
        select(id_duns, id = url_reference, text = text_site) %>%
        mutate(source = "CRAWL")

      tbl_texts <-
        tbl_texts %>%
        bind_rows(df_site)

      if (assign_to_environment & nrow(df_crux) > 0) {
        assign("tbl_crawled_sbirs", df_crawled_text, envir = .GlobalEnv)
      }

    }

    tbl_texts <-
      tbl_texts %>%
      mutate(type = "company") %>%
      select(type, everything())

    if (assign_to_environment) {
      assign("tbl_text_items_sbirs", tbl_texts, envir = .GlobalEnv)
      tbl_all <-
        tbl_texts %>%
        group_by(!!sym(duns_column)) %>%
        filter(!is.na(text)) %>%
        summarise(text = text %>% str_c(collapse = " ")) %>%
        mutate(type = "company") %>%
        select(type, everything())

      assign("tbl_text_all_sbirs", tbl_all, envir = .GlobalEnv)
    }

    if (nest_original_data) {
      d <-
        data %>%
        group_by(!!sym(duns_column)) %>%
        nest() %>%
        ungroup()

      if (length(data_name) > 0) {
        d <- d %>%
          rename(UQ(data_name) := data)
      }
    } else {
      d <-
        data %>% group_by(!!sym(duns_column)) %>% slice(1) %>%
        ungroup()
    }

    tbl_sbirs <- tbl_sbirs %>%
      group_by(!!sym(duns_column)) %>%
      nest() %>%
      rename(data_sbir_sba = data) %>%
      ungroup() %>%
      mutate(has_sbir_sba = T)

    tbl_dod_sbirs <-
      tbl_dod_sbirs %>%
      group_by(!!sym(duns_column)) %>%
      nest() %>%
      rename(data_sbir_dod = data) %>%
      ungroup() %>%
      mutate(has_sbir_dod = T)

    tbl_keywords <-
      tbl_keywords %>%
      group_by(!!sym(duns_column)) %>%
      nest() %>%
      rename(data_keywords = data) %>%
      ungroup() %>%
      mutate(has_keywords = T)

    tbl_texts <-
      tbl_texts %>%
      group_by(!!sym(duns_column)) %>%
      nest() %>%
      rename(data_text = data) %>%
      ungroup() %>%
      mutate(has_text = T)


    data <-
      list(d, tbl_texts, tbl_keywords, tbl_sbirs,
           tbl_dod_sbirs) %>%
      reduce(left_join, by = duns_column)

    if (include_website_crux) {
      data <-
        data %>%
        left_join(
          tbl_crux_sbir %>%
            group_by(!!!syms(c(
              duns_column, url_column
            ))) %>%
            nest() %>%
            rename(data_crux = data) %>%
            ungroup()
        )
    }

    if (crawl_websites) {
      data <- data %>%
        left_join(
          df_crawled_text %>%
            group_by(!!!syms(c(
              duns_column, url_column
            ))) %>%
            nest() %>%
            rename(data_crawled_text = data) %>%
            ungroup()
        )
    }

    data <- data %>%
      mutate_if(is.logical,
                list(function(x){
                  case_when(is.na(x) ~ FALSE,
                            TRUE ~ x)
                })) %>%
      select(-matches("data"), everything())
    data

  }

#' Assign Nested data
#'
#' @param data a `tibble`
#' @param duns_column name of `DUNS` column
#' @param other_select_columns if not `NULL` other columns to unnest
#'
#' @return
#' @export
#'
#' @examples
assign_nested_sbir_data <-
  function(data, duns_column = "id_duns", other_select_columns = NULL) {
    tbl_classes <-
      data %>% map_df(class) %>%
      gather(column, class)
    nested_cols <-
      tbl_classes %>%
      filter(class %>% str_detect("list|data")) %>%
      pull(column)

    nested_cols %>%
      walk(function(x){
        d <-
          data %>%
          select(one_of(duns_column, other_select_columns, x)) %>%
          unnest() %>%
          distinct()

        if (x == "data_text") {
          assign("tbl_text_items_sbirs", d, envir = .GlobalEnv)
          tbl_all <-
            d %>%
            group_by(!!sym(duns_column)) %>%
            filter(!is.na(text)) %>%
            summarise(text = text %>% str_c(collapse = " ")) %>%
            mutate(type = "company") %>%
            select(type, everything())

          assign("tbl_text_all_sbirs", tbl_all, envir = .GlobalEnv)
          return(invisible())
        }

        if (x == "data_keywords") {
          assign(x = "tbl_keywords_sbirs", value = d, envir = .GlobalEnv)
          return(invisible())
        }

        if (x == "data_sbir_sba") {
          assign("tbl_sba_sbirs", value = d, envir = .GlobalEnv)
          return(invisible())
        }

        if (x == "data_sbir_dod") {
          assign('tbl_dod_sbirs', value = d, envir = .GlobalEnv)
          return(invisible())
        }

        if (x == "data_crux") {
          assign("tbl_crux_sbir", d, envir = .GlobalEnv)
          return(invisible())
        }

        table_name <- x %>% str_replace_all("data", "tbl")


        assign(x = table_name, value = d, envir = .GlobalEnv)
      })

    return(invisible())
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.