R/sam_data_api.R

Defines functions .clean_usg_organizations .filter_duns .parse_html_description

## https://github.com/GSA/IAE-Architecture/blob/master/as-is/tech-docs/SAM/SAMWebServicesExtractsMappingsv1.0/SAM%20Master%20Extract%20Mapping%20v5.5%20Sensitive%20File%20V2%20Layout.xlsx

# utils -------------------------------------------------------------------
.parse_html_description <-
  function(description) {
     description %>%
      stri_enc_toascii() %>%
      read_html() %>%
      html_text() %>%
      stringi::stri_trans_general("Latin-ASCII") %>%
      stri_enc_toascii() %>% str_replace_all(" \032 ", " ") %>% str_to_upper() %>%
      str_squish()
  }
.filter_duns <-
  function(data, duns) {
    data %>%
      filter(idDUNS %in% duns)
  }

.clean_usg_organizations <-
  function(data,
           column = "nameDepartment",
           include_slug = T) {
    if (!data %>% hasName(column)) {
      return(data)
    }

    df_dict <-
      data %>%
      select(column) %>%
      distinct() %>%
      filter(!is.na(!!sym(column))) %>%
      arrange(!!sym(column))

    new_col <- glue("{column}Clean") %>% as.character()

    df_dict <-
      df_dict %>%
      separate(
        !!sym(column),
        sep = "\\,",
        c("part1", "part2"),
        fill = "right",
        extra = "merge",
        remove = F
      ) %>%
      mutate_all(str_squish)


    df_dict <-
      df_dict %>%
      mutate(UQ(new_col) := case_when(
        is.na(part2) ~ part1,
        !is.na(part2) ~ str_c(part2, part1, sep = " ")
      )) %>%
      select(one_of(column, new_col)) %>%
      mutate_at(new_col, list(function(x) {
        x %>% str_remove_all("ATTN:") %>%  stri_enc_toascii() %>%
          gsub("\\s+", " ", .) %>%
          str_squish()
      }))

    if (include_slug) {
      slug_col <-
        new_col %>% str_replace_all("name", "slug") %>% str_remove_all("Clean")

      df_dict <-
        df_dict %>%
        separate(
          UQ(new_col),
          into = c(new_col, slug_col),
          sep = "\\(",
          fill = "right",
          extra =  "merge"
        )

      df_dict <-
        df_dict %>%
        mutate_at(slug_col,
                  list(function(x) {
                    x %>% str_remove_all("\\)")
                  })) %>%
        .remove_na()

    }


    df_dict <- df_dict %>%
      mutate(
        UQ(new_col) := case_when(
          !!sym(new_col) == "DEPT OF DEFENSE" ~ "DEPARTMENT OF DEFENSE",
          TRUE ~ !!sym(new_col)
        )
      )

    data <-
      data %>%
      left_join(df_dict, by = column) %>%
      select(-column) %>%
      rename(UQ(column) := UQ(new_col)) %>%
      select(one_of(names(data)), everything()) %>%
      mutate_if(is.character, str_squish)


    data

  }



# dictionaries ------------------------------------------------------------

#' SAM Bulk Entity File Dictionary
#'
#' Returns list of the most recent
#' SAM Entity Data Dictionary
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_entity_files()
dictionary_sam_entity_files <-
  function() {
    data <-
      "https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1575841130052&domain=Entity%20Information" %>%
      fromJSON() %>%
      .[[1]]

    data <- data[[1]] %>%
      as_tibble() %>%
      setNames(c(
        "nameFile",
        "dateModified",
        "nameBucketS3",
        "keyS3",
        "urlFile"
      )) %>%
      mutate(dateModified = mdy(dateModified))

    urls <- data$urlFile$self %>% pull()
    date_slugs <- urls %>%
      (
        "https://s3.amazonaws.com/falextracts/Entity Information/|\\SAM_PUBLIC_MONTHLY_|entityinformation|csv|ZIP"
      ) %>% ("\\.")

    data <-
      data %>%
      mutate(
        urlFile = urls,
        dateSlug = date_slugs,
        typeAPI = case_when(
          nameFile %>% str_detect("SAM_PUBLIC") ~ "SAM V2",
          TRUE ~ "SAM OLD"
        ),
        dateData = case_when(typeAPI == "SAM V2" ~ ymd(dateSlug),
                             TRUE ~ mdy(dateSlug)),
        isMostRecent = dateData == dateData %>% max()
      ) %>%
      select(dateData, nameFile, everything()) %>%
      select(-dateSlug)

    data

  }

#' SAM Parent Entity Dictionary
#'
#' Returns information about SAM parent entity types.
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_parent_entity_types()
dictionary_sam_parent_entity_types <-
  function() {
    "2J  - Sole Proprietorship
    2K  - Partnership or Limited Liability Partnership
    2L  - Non Tax Exempt Corporate Entity
    8H  - Tax Exempt Corporate Entity
    2A  - United States Government Entity
    CY  - Foreign Country or Government
    X6  - International Organization
    ZZ - Other" %>%
      str_split("\n") %>%
      flatten_chr() %>%
      str_squish() %>%
      tibble(type = .) %>%
      separate(
        type,
        into = c("slugEntityStructure", "typeEntityStructure"),
        sep = "\\ - "
      ) %>%
      mutate_all(str_to_upper)
  }

.dictionary_sam_extract_names <- function() {
  tibble(
    idColumn = 1:149,
    nameColumn = c(
      "idDUNS",
      "slugDUNS",
      "slugCAGE",
      "slugDeptDefenseAddressCode",
      "slugSAMExtract",
      "slugRegistrationPurpose",
      "dateRegistrationInitial",
      "dateExpiration",
      "dateLastUpdate",
      "dateActivation",
      "nameCompanyLegal",
      "nameCompanyDBA",
      "nameCompanyDivision",
      "codeCompanyDivision",
      "addressStreet1Company",
      "addressStreet2Company",
      "cityCompany",
      "stateCompany",
      "zipcodeCompany",
      "zipcode4Company",
      "codeCountryCompany",
      "idCongressionalDistrictCompany",
      "dateCompanyStart",
      "monthdayFiscalYearEnd",
      "urlCompany",
      "slugEntityStructure",
      "stateIncorporation",
      "codeCountryIncorporation",
      "countBusinessTypes",
      "dataBusinessTypes",
      "idNAICSPrimary",
      "countNAICS",
      "dataNAICS",
      "countProductServiceCodes",
      "dataProductServiceCodes",
      "hasCreditCardUsage",
      "slugCorrespondenceFlag",
      "addressStreet1MailingCompany",
      "addressStreet2MailingCompany",
      "cityMailingCompany",
      "zipcodeMailingCompany",
      "zipcode4MailingCompany",
      "codeCountryMailingCompany",
      "stateMailingCompany",
      "nameFirstPointOfContactGovt",
      "nameMiddlePointOfContactGovt",
      "nameLastPointOfContactGovt",
      "titlePointOfContactGovt",
      "addressStreet1PointOfContactGovt",
      "addressStreet2PointOfContactGovt",
      "cityPointOfContactGovt",
      "zipcodePointOfContactGovt",
      "zip4PointOfContactGovt",
      "codeCountryPointOfContactGovt",
      "statePointOfContactGovt",
      "telephonePointOfContactGovt",
      "telephoneExtensionPointOfContactGovt",
      "telephoneNonUSPointOfContactGovt",
      "faxPointOfContactGovt",
      "emailPointOfContactGovt",

      "nameFirstPointOfContactGovtAlt",
      "nameMiddlePointOfContactGovtAlt",
      "nameLastPointOfContactGovtAlt",
      "titlePointOfContactGovtAlt",
      "addressStreet1PointOfContactGovtAlt",
      "addressStreet2PointOfContactGovtAlt",
      "cityPointOfContactGovtAlt",
      "zipcodePointOfContactGovtAlt",
      "zip4PointOfContactGovtAlt",
      "codeCountryPointOfContactGovtAlt",
      "statePointOfContactGovtAlt",
      "telephonePointOfContactGovtAlt",
      "telephoneExtensionPointOfContactGovtAlt",
      "telephoneNonUSPointOfContactGovtAlt",
      "faxPointOfContactGovtAlt",
      "emailPointOfContactGovtAlt",

      "nameFirstPointOfContactPastPerformance",
      "nameMiddlePointOfContactPastPerformance",
      "nameLastPointOfContactPastPerformance",
      "titlePointOfContactPastPerformance",
      "addressStreet1PointOfContactPastPerformance",
      "addressStreet2PointOfContactPastPerformance",
      "cityPointOfContactPastPerformance",
      "zipcodePointOfContactPastPerformance",
      "zip4PointOfContactPastPerformance",
      "codeCountryPointOfContactPastPerformance",
      "statePointOfContactPastPerformance",
      "telephonePointOfContactPastPerformance",
      "telephoneExtensionPointOfContactPastPerformance",
      "telephoneNonUSPointOfContactPastPerformance",
      "faxPointOfContactPastPerformance",
      "emailPointOfContactPastPerformance",

      "nameFirstPointOfContactPastPerformanceAlt",
      "nameMiddlePointOfContactPastPerformanceAlt",
      "nameLastPointOfContactPastPerformanceAlt",
      "titlePointOfContactPastPerformanceAlt",
      "addressStreet1PointOfContactPastPerformanceAlt",
      "addressStreet2PointOfContactPastPerformanceAlt",
      "cityPointOfContactPastPerformanceAlt",
      "zipcodePointOfContactPastPerformanceAlt",
      "zip4PointOfContactPastPerformanceAlt",
      "codeCountryPointOfContactPastPerformanceAlt",
      "statePointOfContactPastPerformanceAlt",
      "telephonePointOfContactPastPerformanceAlt",
      "telephoneExtensionPointOfContactPastPerformanceAlt",
      "telephoneNonUSPointOfContactPastPerformanceAlt",
      "faxPointOfContactPastPerformanceAlt",
      "emailPointOfContactPastPerformanceAlt",

      "nameFirstPointOfContactElectronicBusiness",
      "nameMiddlePointOfContactElectronicBusiness",
      "nameLastPointOfContactElectronicBusiness",
      "titlePointOfContactElectronicBusiness",
      "addressStreet1PointOfContactElectronicBusiness",
      "addressStreet2PointOfContactElectronicBusiness",
      "cityPointOfContactElectronicBusiness",
      "zipcodePointOfContactElectronicBusiness",
      "zip4PointOfContactElectronicBusiness",
      "codeCountryPointOfContactElectronicBusiness",
      "statePointOfContactElectronicBusiness",
      "telephonePointOfContactElectronicBusiness",
      "telephoneExtensionPointOfContactElectronicBusiness",
      "telephoneNonUSPointOfContactElectronicBusiness",
      "faxPointOfContactElectronicBusiness",
      "emailPointOfContactElectronicBusiness",
      "nameFirstPointOfContactElectronicBusinessAlternate",
      "nameMiddlePointOfContactElectronicBusinessAlternate",
      "nameLastPointOfContactElectronicBusinessAlternate",
      "titlePointOfContactElectronicBusinessAlternate",
      "addressStreet1PointOfContactElectronicBusinessAlternate",
      "addressStreet2PointOfContactElectronicBusinessAlternate",
      "cityPointOfContactElectronicBusinessAlternate",
      "zipcodePointOfContactElectronicBusinessAlternate",
      "zip4PointOfContactElectronicBusinessAlternate",
      "codeCountryPointOfContactElectronicBusinessAlternate",
      "statePointOfContactElectronicBusinessAlternate",
      "telephonePointOfContactElectronicBusinessAlternate",
      "telephoneExtensionPointOfContactElectronicBusinessAlternate",
      "telephoneNonUSPointOfContactElectronicBusinessAlternate",
      "faxPointOfContactElectronicBusinessAlternate",
      "emailPointOfContactElectronicBusinessAlternate",
      "countNAICSExceptions",
      "dataNAICSExceptions",
      "hasDebtSubjectToOffsetClause",
      "slugExclusionStatusFlag",
      "countSBATypes",
      "dataSBACodes",
      "slugPublicDisplay",
      "countDisasterResponseCodes",
      "dataDisasterResponse"
    )
  )
}

#' SAM Business Type Codes
#'
#' Dictionary of the SAM business types
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_business_types()
dictionary_sam_business_types <- function() {
  data <-
    structure(
      list(
        c(
          "2R",
          "2F",
          "12",
          "3I",
          "CY",
          "A7",
          "20",
          "1D",
          "LJ",
          "XS",
          "MF",
          "2X",
          "A8",
          "2U",
          "HK",
          "A3",
          "A5",
          "QF",
          "A2",
          "23",
          "FR",
          "QZ",
          "OY",
          "PI",
          "NB",
          "8W",
          "27",
          "8E",
          "8C",
          "8D",
          "NG",
          "QW",
          "C8",
          "C7",
          "ZR",
          "MG",
          "C6",
          "H6",
          "TW",
          "UD",
          "8B",
          "86",
          "KM",
          "T4",
          "H2",
          "6D",
          "M8",
          "G6",
          "G7",
          "G8",
          "HB",
          "1A",
          "1R",
          "ZW",
          "GW",
          "OH",
          "HS",
          "QU",
          "G3",
          "G5",
          "BZ",
          "80",
          "FY",
          "HQ",
          "05",
          "OW",
          "XY",
          "8U",
          "1B",
          "FO",
          "TR",
          "G9",
          "JX",
          "V2",
          "VW"
        ),
        c(
          "UNITED STATES FEDERAL GOVERNMENT",
          "UNITED STATES STATE GOVERNMENT",
          "UNITED STATES LOCAL GOVERNMENT",
          "TRIBAL GOVERNMENT",
          "FOREIGN GOVERNMENT",
          "ABILITYONE NON PROFIT AGENCY",
          "FOREIGN OWNED",
          "SMALL AGRICULTURAL COOPERATIVE",
          "LIMITED LIABILITY COMPANY",
          "SUBCHAPTER S CORPORATION",
          "MANUFACTURER OF GOODS",
          "FOR PROFIT ORGANIZATION",
          "NON-PROFIT ORGANIZATION",
          "OTHER NOT FOR PROFIT ORGANIZATION",
          "COMMUNITY DEVELOPMENT CORPORATION OWNED FIRM",
          "LABOR SURPLUS AREA FIRM",
          "VETERAN OWNED BUSINESS",
          "SERVICE DISABLED VETERAN OWNED BUSINESS",
          "WOMAN OWNED BUSINESS",
          "MINORITY OWNED BUSINESS",
          "ASIAN-PACIFIC AMERICAN OWNED",
          "SUBCONTINENT ASIAN (ASIAN-INDIAN) AMERICAN OWNED",
          "BLACK AMERICAN OWNED",
          "HISPANIC AMERICAN OWNED",
          "NATIVE AMERICAN OWNED",
          "WOMAN OWNED SMALL BUSINESS",
          "SELF CERTIFIED SMALL DISADVANTAGED BUSINESS",
          "ECONOMICALLY DISADVANTAGED WOMEN SMALL OWNED BUSINESS",
          "JOINT VENTURE WOMEN OWNED SMALL BUSINESS",
          "JOINT VENTURE ECONOMICALLY DISADVANTAGED WOMEN SMALL OWNED BUSINESS",
          "FEDERAL AGENCY",
          "FEDERALLY FUNDED RESEARCH AND DEVELOPMENT CENTER",
          "CITY",
          "COUNTY",
          "INTER-MUNICIPAL",
          "LOCAL GOVERNMENT OWNED",
          "MUNICIPALITY",
          "SCHOOL DISTRICT",
          "TRANSIT AUTHORITY",
          "COUNCIL OF GOVERNMENTS",
          "HOUSING AUTHORITIES PUBLIC/TRIBAL",
          "INTERSTATE ENTITY",
          "PLANNING COMMISSION",
          "PORT AUTHORITY",
          "COMMUNITY DEVELOPMENT CORPORATION",
          "DOMESTIC SHELTER",
          "EDUCATIONAL INSTITUTION",
          "1862 LAND GRANT COLLEGE",
          "1890 LAND GRANT COLLEGE",
          "1994 LAND GRANT COLLEGE",
          "HISTORICALLY BLACK COLLEGE OR UNIVERSITY",
          "MINORITY INSTITUTION",
          "PRIVATE UNIVERSITY OR COLLEGE",
          "SCHOOL OF FORESTRY",
          "HISPANIC SERVICING INSTITUTION",
          "STATE CONTROLLED INSTITUTION OF HIGHER LEARNING",
          "TRIBAL COLLEGE",
          "VETERINARY COLLEGE",
          "ALASKAN NATIVE SERVICING INSTITUTION",
          "NATIVE HAWAIIAN SERVICING INSTITUTION",
          "FOUNDATION",
          "HOSPITAL",
          "VETERINARY HOSPITAL",
          "DOT CERTIFIED DBE",
          "ALASKAN NATIVE CORPORATION OWNED FIRM",
          "AMERICAN INDIAN OWNED",
          "INDIAN TRIBE - FEDERALLY RECOGNIZED",
          "NATIVE HAWAIIAN ORGANIZATION OWNED FIRM",
          "TRIBALLY OWNED FIRM",
          "TOWNSHIP",
          "AIRPORT AUTHORITY",
          "OTHER THAN ONE OF THE PROCEEDING",
          "SELF-CERTIFIED HUBZONE JOINT VENTURE",
          "GRANTS",
          "CONTRACTS AND GRANTS"
        )
      ),
      .Names = c("slugBusinessType", "typeBusinessSAM"),
      class = c("tbl_df",
                "tbl", "data.frame"),
      row.names = c(NA, -75L)
    )

  ### add types
  data

}

#' SAM Extract Codes
#'
#' Returns information about
#' SAM extract fields, ie, what type of
#' record is most recent.
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_extract_types()
dictionary_sam_extract_types <-
  function() {
    tibble(
      slugSAMExtract = c("A", "E", "1", "2", "3", "4"),
      typeSAMExtract = c("Active", "Expired", "Deleted", "New", "Update", "Expired") %>% str_to_upper()
    )
  }


#' SAM Registration Code Dictionary
#'
#' Information about the reasons for
#' the SAM entity registration
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_registration_purpose_codes()
dictionary_sam_registration_purpose_codes <-
  function() {
    tibble(
      slugRegistrationPurpose = c("Z1", "Z2", "Z3", "Z4", "Z5"),
      typeRegistrationPurpose = c(
        "Federal Assistance",
        "All Awards",
        "IGT-Only",
        "IGT | Federal Assistance",
        "All Awards | IGT"
      ) %>% str_to_upper()
    )
  }

#' SBA Setaside Codes for SAM
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_sam_sba_set_asides()
dictionary_sam_sba_set_asides <-
  function() {
    structure(
      list(
        c("HUBZONE CERTIFIED", "8A PARTICPANT", "8A JOINT VENTURE"),
        c("XX", "A6", "JT")
      ),
      class = c("tbl_df", "tbl", "data.frame"),
      row.names = c(NA, -3L),
      .Names = c("typeSetAside", "slugSetAside")
    )

  }

#' Dictionary SAM Bulk Entity API URL
#'
#' @return
#' @export
#'
#' @examples
dictionary_sam_bulk_entity_urls <- function() {
  json <-
    "https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=&domain=Entity%20Information" %>% fromJSON(simplifyDataFrame = T)

  data <- json[[1]]$customS3ObjectSummaryList
  urls <- data$`_links` %>% unlist() %>% as.character()
  data <-
    data[, 1:(ncol(data) - 1)] %>%
    as_tibble() %>%
    mutate(urlFile = urls)

  data %>%
    mutate(
      dateModified = mdy(dateModified),
      typeEntitiesFile = case_when(
        displayKey %>% str_detect("SAM_PUBLIC") ~ "SAM",
        TRUE ~ "ENTITY"
      ),
      dateData = case_when(
        typeEntitiesFile == "ENTITY" ~
          displayKey %>% str_remove_all(".csv|entityinformation") %>% mdy(),
        TRUE ~
          displayKey %>% str_remove_all(".ZIP|SAM_PUBLIC_MONTHLY_") %>%
          ymd()
      )
    ) %>%
    select(dateData, dateModified, urlFile)
}


# general -----------------------------------------------------------------

# # https://open.gsa.gov/api/

# entities ----------------------------------------------------------------


### https://beta.sam.gov/data-services?domain=none


.dl_zip <-
  function(url) {
    tmp <-
      tempfile()

    file <- curl_download(URLencode(url), tmp)
    unz_files <- unzip(file, exdir = "xml")

    data <-
      unz_files %>%
      fread(header = F, verbose = F,showProgress = FALSE, quote = "")

    unz_files %>% unlink()
    file %>% unlink()
    unlink("xml", recursive = T)
    gc()

    data <-
      data %>%
      as_tibble()

    data
  }

## https://open.gsa.gov/api/sam-entity-extracts-api/v1/public_extract_layout.pdf



.parse_bulk_sam_entities <-
  function(url = "https://s3.amazonaws.com/falextracts/Entity Information/SAM_PUBLIC_MONTHLY_20191201.ZIP") {
    options(scipen = 999999)
    data <-
      .dl_zip(url = url) %>%
      select(-150)

    data <- data %>%
      setNames(.dictionary_sam_extract_names() %>% pull(nameColumn))

    data <- data %>%
      mutate(idDUNS = as.integer(idDUNS))


    data <-
      data %>% mutate_if(is.character,
                         list(function(x) {
                           x %>%
                             str_squish()
                         }))
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x %in% c("", " ") ~ NA_character_,
                            TRUE ~ x)
                }))

    logical_cols <-
      data %>% select(matches("^is|^has")) %>% names()

    data <- data %>%
      mutate_at(logical_cols,
                list(function(x) {
                  case_when(x == "N" ~ FALSE,
                            x == "Y" ~ TRUE,
                            TRUE ~ NA)
                }))

    data <-
      data %>%
      left_join(dictionary_sam_parent_entity_types(), by = "slugEntityStructure")

    data <-
      data %>%
      mutate(slugSAMExtract = as.character(slugSAMExtract)) %>%
      left_join(dictionary_sam_extract_types(), by = "slugSAMExtract")

    data <- data %>%
      left_join(dictionary_sam_registration_purpose_codes(), by = "slugRegistrationPurpose")

    data <- data %>%
      mutate(isNonPublicSAMRegistered = case_when(slugPublicDisplay == "NPDY" ~ TRUE,
                                                  TRUE ~ FALSE)) %>%
      select(-slugPublicDisplay)

    data <-
      data %>%
      mutate(isExpiredSAMEntity = typeSAMExtract == "EXPIRED") %>%
      select(idDUNS, isExpiredSAMEntity, everything())

    df_types <-
      data %>%
      filter(!is.na(dataBusinessTypes)) %>%
      select(idDUNS, dataBusinessTypes) %>%
      separate_rows("dataBusinessTypes", sep = "~") %>%
      rename(slugBusinessType = dataBusinessTypes) %>%
      filter(!is.na(slugBusinessType))

    df_types <-
      df_types %>%
      left_join(dictionary_sam_business_types(), by = "slugBusinessType") %>%
      nest(dataBusinessTypes = c(slugBusinessType, typeBusinessSAM))

    data <-
      data %>%
      select(-dataBusinessTypes) %>%
      left_join(df_types, by = "idDUNS") %>%
      select(one_of(names(data)), everything())

    rm(df_types)
    gc()

    df_response_codes <-
      data %>%
      filter(!is.na(dataDisasterResponse)) %>%
      select(idDUNS, dataDisasterResponse) %>%
      rename(slugDisasterResponseCode = dataDisasterResponse) %>%
      separate_rows(slugDisasterResponseCode, sep = "\\~") %>%
      filter(!is.na(slugDisasterResponseCode)) %>%
      mutate(
        typeAreaResponse = case_when(
          slugDisasterResponseCode == "ANY" ~ "ANY AREA",
          slugDisasterResponseCode %>% str_detect("^STA") ~ "STATE",
          slugDisasterResponseCode %>% str_detect("^CTY") ~ "COUNTY",
          slugDisasterResponseCode %>% str_detect("^MSA") ~ "METROPOLITAN SERVICE AREA"
        ),
        codeAreaResponse =
          case_when(
            slugDisasterResponseCode == "ANY" ~  NA_character_,
            TRUE ~ substr(slugDisasterResponseCode,
                          4,
                          nchar(slugDisasterResponseCode))
          )
      )

    df_response_codes <-
      df_response_codes %>%
      nest(
        dataDisasterResponse = c(slugDisasterResponseCode,
                                 typeAreaResponse,
                                 codeAreaResponse)
      )

    data <-
      data %>%
      select(-dataDisasterResponse) %>%
      left_join(df_response_codes, by = "idDUNS")

    rm(df_response_codes)
    gc()

    dict_naics <-
      dictionary_naics_codes()

    data <-
      data %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    data <-
      data %>%
      left_join(dict_naics %>% select(idNAICSPrimary = idNAICS, nameNAICSPrimary = nameNAICS),
                by = "idNAICSPrimary") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics <-
      data %>%
      select(idDUNS, idNAICSPrimary, dataNAICS) %>%
      filter(!is.na(dataNAICS)) %>%
      separate_rows(dataNAICS, sep = "\\~") %>%
      rename(codeNAICS = dataNAICS) %>%
      filter(!is.na(codeNAICS)) %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics <-
      df_naics %>%
      mutate(hasSmallBusinessIndicator = codeNAICS %>% str_detect("Y$")) %>%
      mutate(idNAICS = parse_number(codeNAICS),
             isPrimaryNAICS = idNAICS == idNAICSPrimary) %>%
      select(idDUNS, isPrimaryNAICS, idNAICS, hasSmallBusinessIndicator)

    df_naics <-
      df_naics %>%
      left_join(dict_naics, by = "idNAICS")

    df_naics <- df_naics %>% select(-yearCodeBookNAICS)


    data <-
      data %>% select(-dataNAICS) %>%
      left_join(df_naics %>%
                  nest(
                    dataNAICS = c(
                      isPrimaryNAICS,
                      idNAICS,
                      hasSmallBusinessIndicator,
                      nameNAICS,
                      isPrimaryNAICS
                    )
                  ), by = "idDUNS") %>%
      distinct() %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics_exceptions <-
      data %>%
      select(idDUNS, dataNAICSExceptions) %>%
      filter(!is.na(dataNAICSExceptions)) %>%
      separate_rows(dataNAICSExceptions, sep = "\\~") %>%
      rename(slugNAICS = dataNAICSExceptions) %>%
      mutate_if(is.character, str_squish) %>%
      filter(slugNAICS != "") %>%
      mutate(idNAICS = slugNAICS %>% substr(1, 6) %>% as.integer()) %>%
      mutate(isException = T) %>%
      left_join(dict_naics %>% select(idNAICS, nameNAICS), by = "idNAICS")

    df_naics_exceptions <-
      df_naics_exceptions %>%
      mutate(
        slugExceptions = slugNAICS %>% str_remove_all("[0-9]"),
        countExceptions = slugExceptions %>% nchar()
      ) %>%
      nest(
        dataNAICSExceptions = c(
          slugNAICS,
          idNAICS,
          isException,
          nameNAICS,
          slugExceptions,
          countExceptions
        )
      )

    data <-
      data %>%
      select(-dataNAICSExceptions) %>%
      left_join(df_naics_exceptions, by = "idDUNS") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    rm(df_naics)
    rm(dict_naics)
    gc()


    dict_psc <-
      dictionary_psc_active(only_active = T)

    df_psc <-
      data %>%
      filter(!is.na(dataProductServiceCodes)) %>%
      select(idDUNS, dataProductServiceCodes) %>%
      separate_rows(dataProductServiceCodes, sep = "\\~") %>%
      rename(codeProductService = dataProductServiceCodes)

    df_psc <-
      df_psc %>%
      filter(!codeProductService == "") %>%
      mutate(
        letternumber = codeProductService %>% substr(1, 1),
        isNumber = letternumber %>% str_detect("[0-9]")
      ) %>%
      mutate(
        idSolicitationGroup = case_when(
          isNumber ~ codeProductService %>% substr(1, 2),
          TRUE ~ codeProductService %>% substr(1, 1)
        )
      ) %>%
      select(-c(letternumber, isNumber))


    dict_psc <-
      dict_psc %>% select(typePSC,
                          codeProductService,
                          nameProductService)

    df_psc <-
      df_psc %>%
      left_join(dict_psc, by = c("codeProductService"))

    df_psc <-
      df_psc %>%
      nest(
        dataProductServiceCodes = c(
          typePSC,
          nameProductService,
          idSolicitationGroup,
          codeProductService
        )
      )

    data <-
      data %>%
      select(-dataProductServiceCodes) %>%
      left_join(df_psc, by = "idDUNS") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    if (data %>% hasName("urlCompany")) {
      data <- data %>%
        mutate(
          urlCompany = case_when(
            urlCompany %>% str_detect("http:|https:") ~ urlCompany,
            is.na(urlCompany) ~ NA_character_,
            TRUE ~ glue("https://{urlCompany}") %>% as.character()
          )
        )
    }

    rm(df_psc)
    gc()

    df_sba <-
      data %>%
      filter(!is.na(dataSBACodes)) %>%
      select(idDUNS, dataSBACodes) %>%
      separate_rows(dataSBACodes, sep = "\\~") %>%
      rename(codeSetAside = dataSBACodes)

    df_sba <- df_sba %>%
      mutate(
        typeSetAside = case_when(
          codeSetAside == "XX" ~ "HUBZONE CERTIFIED",
          codeSetAside %>% str_detect("^A6") ~ "8A PARTICPANT",
          codeSetAside %>% str_detect("^JT") ~ "8A JOINT VENTURE"
        ),
        slugSetAside = codeSetAside %>% substr(1, 2)
      )

    df_sba <- df_sba %>%
      mutate(
        dateExpirationSBA = case_when(
          slugSetAside == "XX" ~ NA_character_,
          TRUE ~ codeSetAside %>% substr(3, nchar(codeSetAside))
        ) %>% as.numeric() %>% ymd()
      ) %>%
      select(-codeSetAside)

    df_sba <- df_sba %>%
      nest(dataSBA = c(typeSetAside, slugSetAside, dateExpirationSBA))

    data <-
      data %>%
      select(-dataSBACodes) %>%
      left_join(df_sba, by = "idDUNS")

    date_names <-
      data %>%
      select(matches("date")) %>%
      select(-matches("dateCompanyStart")) %>%
      names()

    data <-
      data %>%
      mutate_at(date_names,
                list(function(x) {
                  x %>% ymd()
                }))


    data <- data %>%
      mutate(
        charDate = nchar(dateCompanyStart),
        ymdDate = case_when(
          charDate == 7 ~ dateCompanyStart %>% substr(2, 7) %>% as.integer(),
          TRUE ~ dateCompanyStart
        ),
        date = ymd(ymdDate),
        year = year(date),
        ymdDate = case_when(
          year <= 1120 ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
          idDUNS %in% c(
            942278250,
            807634428,
            81559260,
            54766010,
            64402055,
            78443143,
            967454070
          ) ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
          TRUE ~ as.character(ymdDate)
        ),
        date = ymd(ymdDate),
        year = year(date)
      ) %>%
      select(-c(dateCompanyStart, year, charDate, ymdDate)) %>%
      rename(dateCompanyStart = date) %>%
      select(one_of(names(data)), everything())


    data <-
      data %>%
      mutate_at(c("nameCompanyLegal",
                  "nameCompanyDivision",
                  "nameCompanyDBA"),
                list(function(x) {
                  x %>% str_replace_all("\\, INC.|\\, INC|\\,INCORPORATED", " INC") %>%
                    str_replace_all("\\, LLC|\\, L.L.C|\\, L.L.C.", " LLC") %>%
                    str_replace_all("\\, LLP|\\, L.L.P|\\, L.L.P.", " LLP") %>%
                    str_replace_all("\\, LP|\\, L.P|\\, L.P.", " LP") %>%
                    str_replace_all("\\, LTD.|\\, LTD", " LTD") %>%
                    str_replace_all("\\, COMPANY", " COMPANY") %>%
                    str_replace_all("\\,CORP", "  CORP") %>%
                    str_remove_all("\\*") %>%
                    str_remove_all('\\"')

                }))


    data <-
      data %>%
      mutate(
        countDaysPostFoundingToSAMRegistration = (dateActivation - dateCompanyStart) %>% as.integer(),
        countDaysExistence = (Sys.Date()-dateCompanyStart) %>% as.integer()
      )

    lower_names <-
      data %>% select_if(is.character) %>%
      select(matches("^email|^url")) %>%
      names()

    data <- data %>%
      mutate_at(lower_names, str_to_lower)

    upper_names <-
      data %>% select_if(is.character) %>% select(-one_of(lower_names)) %>% names()

    data <-
      data %>%
      mutate_at(upper_names, str_to_upper)

    data <-
      data %>%
      mutate(
        nameContactGovernment = case_when(
          !is.na(nameFirstPointOfContactGovt) &
            !is.na(nameLastPointOfContactGovt) ~ str_c(
              nameFirstPointOfContactGovt,
              nameLastPointOfContactGovt,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactGovernmentAlt = case_when(
          !is.na(nameFirstPointOfContactGovtAlt) &
            !is.na(nameLastPointOfContactGovtAlt) ~ str_c(
              nameFirstPointOfContactGovtAlt,
              nameLastPointOfContactGovtAlt,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactPastPerformance = case_when(
          !is.na(nameFirstPointOfContactPastPerformance) &
            !is.na(nameLastPointOfContactPastPerformance) ~ str_c(
              nameFirstPointOfContactPastPerformance,
              nameLastPointOfContactPastPerformance,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactElectronic = case_when(
          !is.na(nameFirstPointOfContactElectronicBusiness) &
            !is.na(nameLastPointOfContactElectronicBusiness) ~ str_c(
              nameFirstPointOfContactElectronicBusiness,
              nameLastPointOfContactElectronicBusiness,
              sep = " "
            ),
          TRUE ~ NA_character_
        )
      )

    data <- data %>%
      mutate(
        typeEntity =
          case_when(
            typeEntityStructure == "OTHER" ~ "OTHER",
            typeEntityStructure %>% str_detect("GOVERNMENT") ~ "GOVERNMENT",
            TRUE ~ "CORPORATION"
          ),
        isTaxExemptEntity = typeEntityStructure %in% c(
          "FOREIGN COUNTRY OR GOVERNMENT",
          "TAX EXEMPT CORPORATE ENTITY",
          "UNITED STATES GOVERNMENT ENTITY"
        )
      )

    data <- data %>%
      mutate(
        isForeignLocatedCompany = codeCountryCompany != "USA",
        isForeignIncorporatedcompany = codeCountryIncorporation != "USA"
      )


    data <- data %>%
      mutate(
        hasBusinessTypes = dataBusinessTypes %>% map_dbl(length) > 0,
        hasSBA = dataSBA %>% map_dbl(length) > 0,
        hasProductServiceCodes = dataProductServiceCodes %>% map_dbl(length) > 0,
        hasNAICS = dataNAICS %>% map_dbl(length) > 0,
        hasDisasterResponse = dataDisasterResponse %>% map_dbl(length) > 0,
        hasNAICSExceptions = dataNAICSExceptions %>% map_dbl(length) > 0
      )
    gc()
    data
  }


.bulk_entities_api_old <-
  function(file_type = "ENTITY",
           sensitivity = "PUBLIC",
           frequency = "daily",
           date = Sys.Date() - 1,
           api_key = "O4kzViWGVYNumPqhAzUhYGiZZZwW3RKUEYJOI6ii",
           only_active = F,
           sam_extract_filter = NULL,
           psc_filter = NULL,
           naics_filter = NULL,
           incorporation_company_filter = NULL,
           company_state_filter = NULL,
           company_zipcode_filter = NULL,
           company_country_filter = NULL,
           set_aside_filter = NULL,
           business_type_filter = NULL,
           return_message = T) {
    date <- ymd(date)
    date_year <- year(date)
    month_date <- month(date)
    day_date <- day(date)
    month_date <-
      case_when(nchar(month_date) == 1 ~ str_c("0", month_date),
                TRUE ~ as.character(month_date))
    day_slug <-
      case_when(nchar(day_date) == 1 ~ str_c("0", day_date),
                TRUE ~ as.character(day_date))

    period_slug <-
      glue("{month_date}/{day_slug}/{year(date)}") %>% as.character()

    url <-
      glue(
        "https://api.sam.gov/prod/dataservices/v1/extracts?api_key={api_key}&fileType={str_to_upper(file_type)}&sensitivity={str_to_upper(sensitivity)}&frequency={str_to_upper(frequency)}&date={period_slug}"
      )

    if (return_message) {
      glue("Parsing {url}") %>% message()
    }

    data <- .parse_bulk_sam_entities(url = url)

    data <-
      data %>%
      mutate(dateData = date,
             frequencyData = str_to_upper(frequency)) %>%
      select(frequencyData, dateData, everything())

    if (length(sam_extract_filter) > 0 &&
        str_to_upper(frequency) == "DAILY") {
      if (return_message) {
        glue(
          "Including only {str_c(sam_extract_filter, collapse = ',')} SAM entity registration(s)"
        )
      }
      data <- data %>%
        filter(typeSAMExtract %in% str_to_upper(sam_extract_filter))
    }

    if (only_active) {
      if (return_message) {
        "Filtering for only active SAM companies" %>% message()
      }
      data <- data %>%
        filter(!isExpiredSAMEntity)
    }

    if (length(psc_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a registered PSC:\n{str_c(psc_filters, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasProductServiceCodes) %>%
        select(idDUNS, dataProductServiceCodes) %>%
        unnest_legacy() %>%
        filter(codeProductService %in% psc_filters) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(naics_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a registered naics:\n{str_c(naics_filters, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasNAICS) %>%
        select(idDUNS, dataNAICS) %>%
        unnest_legacy() %>%
        filter(idNAICS %in% naics_filters) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(set_aside_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a set aside code of:\n{str_c(set_aside_filter, collapse = '\n')}"
        ) %>% message()
      }

      filter_duns <-
        data %>%
        filter(hasSBA) %>%
        select(idDUNS, dataSBA) %>%
        unnest_legacy() %>%
        filter(slugSetAside %in% str_to_upper(set_aside_filter)) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(incorporation_company_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies domiciled in:\n{str_c(incorporation_company_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <-
        data %>%
        filter(codeCountryIncorporation %in% str_to_upper(incorporation_company_filter))
    }

    if (length(company_country_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies head-quartered in:\n{str_c(company_country_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <-
        data %>%
        filter(codeCountryCompany %in% str_to_upper(company_country_filter))
    }

    if (length(company_state_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies headquarters in:\n{str_c(company_state_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <- data %>%
        filter(stateCompany %in% str_to_upper(company_state_filter))
    }

    if (length(company_zipcode_filter)) {
      if (return_message) {
        glue(
          "Filtering for companies located in zipcode(s):\n{str_c(company_zipcode_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <- data %>%
        filter(zipcodeCompany %in% str_to_upper(as.character(company_zipcode_filter)))


    }

    if (length(business_type_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for business types:\n{str_c(business_type_filter, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasBusinessTypes) %>%
        select(idDUNS, dataBusinessTypes) %>%
        unnest_legacy() %>%
        filter(slugBusinessType %in% str_to_upper(business_type_filter)) %>%
        pull(idDUNS) %>%
        unique()

      data <-
        .filter_duns(data = data, duns = filter_duns)
    }

    name_cols <- data %>% select(matches("name")) %>% names()

    data <- data %>%
      mutate_at(name_cols,
                list(function(x) {
                  x %>% str_replace_all(" OF, ", " OF ")
                }))

    if (data %>% hasName("slugDUNS")) {
      data <- data %>% mutate(slugDUNS = as.character(slugDUNS))
    }

    data <- data %>%
      mutate_if(is.numeric, as.numeric)
    data
  }


.bulk_sam_entities_old <-
  function(frequencies = c("monthly"),
           dates = c(Sys.Date()),
           api_key = "O4kzViWGVYNumPqhAzUhYGiZZZwW3RKUEYJOI6ii",
           only_active = F,
           sam_extract_filter = NULL,
           psc_filter = NULL,
           naics_filter = NULL,
           incorporation_company_filter = NULL,
           company_state_filter = NULL,
           company_zipcode_filter = NULL,
           company_country_filter = NULL,
           set_aside_filter = NULL,
           business_type_filter = NULL,
           file_type = "ENTITY",
           sensitivity = "PUBLIC",
           return_message = T) {
    df_input <-
      expand_grid(date = dates,
                  frequency = str_to_upper(frequencies)) %>%
      as_tibble()

    df_input <-
      df_input %>% filter(frequency == "MONTHLY") %>% filter(date == max(date)) %>%
      bind_rows(df_input %>% filter(frequency == "DAILY"))

    all_data <-
      1:nrow(df_input) %>%
      map_dfr(function(x) {
        df_row <-
          df_input %>% dplyr::slice(x)

        .bulk_entities_api(
          frequency = df_row$frequency,
          date = df_row$date,
          api_key = api_key,
          only_active = only_active,
          psc_filter = psc_filter,
          sam_extract_filter = sam_extract_filter,
          naics_filter = naics_filter,
          file_type = file_type,
          sensitivity = sensitivity,
          incorporation_company_filter = incorporation_company_filter,
          company_state_filter = company_state_filter,
          company_zipcode_filter = company_zipcode_filter,
          company_country_filter = company_country_filter,
          set_aside_filter = set_aside_filter,
          business_type_filter = business_type_filter,
          return_message = return_message
        )
      })

    if (return_message) {
      "Generating FAR and DFAR link urls" %>% message()
    }

    df_duns <- all_data %>%
      distinct(idDUNS) %>%
      mutate(char = nchar(idDUNS),
             zeros = 9 - char)


    df_duns <- df_duns %>%
      filter(zeros != 0) %>%
      mutate(
        slugZero = zeros %>% map_chr(function(x) {
          rep("0", times = x) %>% str_c(collapse = "")
        }),
        slugDUNS = glue("{slugZero}{idDUNS}") %>% as.character()
      ) %>%
      select(idDUNS, slugDUNS) %>%
      bind_rows(df_duns %>% filter(zeros == 0) %>%
                  select(idDUNS) %>%
                  mutate(slugDUNS = as.character(idDUNS))) %>%
      mutate(
        urlFARPDF = glue(
          "https://www.sam.gov/SAM/filedownload?pdfType=1&duns={slugDUNS}"
        ) %>% as.character(),
        urlDFARPDF = glue(
          "https://www.sam.gov/SAM/filedownload?pdfType=2&duns={slugDUNS}"
        ) %>% as.character()
      ) %>%
      select(-slugDUNS)

    all_data <-
      all_data %>%
      left_join(df_duns, by = "idDUNS")

    all_data

  }

#' SAM Active Contract Opportunities
#'
#' Returns all active contract information
#' from SAM contract opportunity data store.
#'
#' @param exclude_archived if \code{TRUE} excludes archived listings
#' @param exclude_awards if \code{TRUE} excludes awards
#' @param only_active if \code{TRUE} only
#' @param include_unknown_responses if \code{TRUE} keep opportunities with unknown response dates
#' @param snake_names
#' @param join_address
#'
#' @return
#' @export
#'
#' @examples
#' bulk_sam_contract_opportunities()
bulk_sam_contract_opportunities <-
  function(exclude_archived = T,
           only_active = T,
           include_unknown_responses = F,
           snake_names = F,
           join_address = T,
           exclude_awards = T) {
    data <-
      "https://s3.amazonaws.com/falextracts/Contract%20Opportunities/datagov/ContractOpportunitiesFullCSV.csv" %>%
      read_csv(guess_max = 100000)

    data <-
      data %>%
      .munge_biz_opps_names()

    data <-
      data %>%
      filter(!idNotice %>% str_detect("<"))

    data <-
      data %>%
      mutate(amountAward = currency(parse_number(as.character(amountAward)), digits = 0))


    upper_cols <-
      data %>% select_if(is.character) %>% select(-matches("email|idNotice|url")) %>% names()

    lower_cols <-
      data %>% select_if(is.character) %>% select(matches("email|url")) %>% names()

    data <- data %>%
      mutate_at(upper_cols, str_to_upper)

    data <-
      data %>% mutate_at(lower_cols, str_to_lower)

    data <-
      data %>% mutate_if(is.character, str_squish)

    date_cols <-
      data %>%
      select_if(is.character) %>%
      select(matches("^datetime[A-Z]|^date[A-Z]")) %>% names()

    if (data %>% hasName("nameSolicitation")) {
      data <-
        data %>%
        mutate(
          nameSolicitation = nameSolicitation %>% str_remove_all("^[A-Z]--") %>%
            str_remove_all("^[0-9][0-9]--") %>%
            str_squish() %>% str_to_upper()
        )
    }

    if (length(date_cols) > 0) {
      data <-
        data %>%
        mutate_at(date_cols,
                  list(function(x) {
                    case_when(is.na(x) ~ NA_character_,
                              TRUE ~ x %>% substr(1, 10)) %>% ymd()
                  }))
    }

    date_cols <-
      data %>% select(matches("^date")) %>% names()

    df_years <-
      data %>%
      select(date_cols) %>%
      transmute_at(date_cols, year)

    names(df_years) <-
      names(df_years) %>% str_replace_all("datetime", "year") %>% str_replace_all("date", "year")

    data <-
      data %>%
      bind_cols(df_years)

    data <-
      data %>%
      mutate(
        hasNoticeUpdates = typeNotice != typeNoticeBase,
        nameAwardee = case_when(
          nameAwardee == "NULL" ~ NA_character_,
          nameAwardee == "VARIOUS" ~ "MULTIPLE",
          TRUE ~ nameAwardee
        ),
        isAward = !is.na(nameAwardee),
        hasAwardeeDUNS = nameAwardee %>% str_detect("DUNS:") %>% coalesce(FALSE),
        isActive = case_when(isActive == "YES" ~ TRUE,
                             TRUE ~ FALSE)
      )

    data <- data %>%
      separate(
        typeSetAside,
        sep = "\\(FAR",
        into = c("typeSetAside", "codeFAR"),
        extra = "drop",
        fill = "right"
      ) %>%
      mutate_at(c("typeSetAside", "codeFAR"),
                list(function(x) {
                  x %>% str_remove_all("\\(|\\)") %>%
                    str_remove_all("\\HUBZONE|\\IEE|\\EDWOSB|\\SDVOSB") %>%
                    str_squish()
                })) %>%
      mutate(
        hasSetAside = !is.na(typeSetAside) %>% coalesce(FALSE),
        isSoleSource = typeSetAside %>% str_detect("SOLE SOURCE") %>% coalesce(FALSE)
      )

    df_solicitation_groups <-
      data %>%
      filter(!is.na(idSolicitationGroup)) %>%
      count(idSolicitationGroup) %>%
      mutate(
        char = nchar(idSolicitationGroup),
        hasPSCExact = char > 2,
        typePSC = case_when(
          idSolicitationGroup %>% str_detect("[A-Z]") ~ "SERVICE",
          TRUE ~ "PRODUCT"
        ),
        codeProductService = idSolicitationGroup,
        idSolicitationGroupActual  = case_when(
          typePSC == "PRODUCT" ~ idSolicitationGroup %>% substr(1, 2),
          TRUE ~ idSolicitationGroup %>% substr(1, 1)
        )
      ) %>%
      select(-c(n, char)) %>%
      left_join(
        dictionary_psc_active() %>%
          select(
            codeProductService,
            nameSolicitationGroup,
            nameProductService
          ),
        by = "codeProductService"
      )

    data <- data %>%
      left_join(df_solicitation_groups, by = "idSolicitationGroup") %>%
      select(-idSolicitationGroup) %>%
      rename(idSolicitationGroup = idSolicitationGroupActual) %>%
      select(one_of(names(data)), everything()) %>%
      select(idNotice:idSolicitation,
             names(
               df_solicitation_groups %>% select(-idSolicitationGroupActual)
             ),
             everything())


    data <-
      data %>%
      mutate(
        isActive = as.Date(datetimeResponse) > Sys.Date(),
        dateResponse = as.Date(datetimeResponse),
        countDaysToRespond =  (as.Date(datetimeResponse) - Sys.Date()) %>% as.numeric(),
        countDaysToRespond = case_when(is.na(countDaysToRespond) ~ 0,
                                       TRUE ~ countDaysToRespond),
        countDaysToRespond = pmax(0, countDaysToRespond, na.rm = T),
        countDaysOnline = (Sys.Date() - as.Date(datetimePublished)) %>% as.integer(),
        countDaysOpenToRespond = (as.Date(datetimeResponse) - as.Date(datetimePublished)) %>% as.numeric(),
        countDaysOpenToRespond = pmax(0, countDaysOpenToRespond, na.rm = T),
        urlOpportunityAPI = glue(
          "https://beta.sam.gov/api/prod/opps/v2/opportunities/{idNotice}"
        ) %>% as.character(),
        urlOpportunityAttachmentAPI = glue(
          "https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources?excludeDeleted=false&withScanResult=false"
        ) %>% as.character(),
        urlOpportunityAttachmentZIP = glue(
          "https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources/download/zip"
        ) %>% as.character()
      )

    if (data %>% hasName("hasPSCExact")) {
      data <- data %>%
        mutate(hasPSCExact = hasPSCExact %>% coalesce(FALSE))
    }

    data <-
      data %>%
      mutate_at(c("nameSolicitation", "descriptionSolicitation"),
                list(function(x) {
                  x %>% stringi::stri_enc_toascii() %>%
                    stri_unescape_unicode() %>%
                    str_remove_all("\032") %>%
                    str_replace_all("\\|", " ") %>%
                    str_squish()
                }))


    # df_parents <-
    #   data %>%
    #   group_by(nameSolicitation) %>%
    #   filter(datetimePublished == min(datetimePublished)) %>%
    #   dplyr::slice(1) %>%
    #   ungroup() %>%
    #   select(
    #     nameSolicitation,
    #     datetimePublishedInitial = datetimePublished,
    #     idSolicitationParent = idSolicitation
    #   ) %>%
    #   group_by(idSolicitationParent) %>%
    #   dplyr::slice(1) %>%
    #   ungroup()
    #
    # data %>%
    #   select(countDaysOnline,datetimeResponse) %>%
    #   mutate(isPossibleActive = is.na(datetimeResponse) & countDaysOnline %>% between(1, 365))


    data <-
      .clean_usg_organizations(data = data, column = "nameDepartment")

    data <-
      .clean_usg_organizations(data = data, column = "nameCommandSub")

    data <-
      data %>% .clean_usg_organizations("nameOffice")

    data <-
      data %>% fix_usg_organization_col(org_col = "nameDepartment")
    data <-
      data %>% fix_usg_organization_col(org_col = "nameOffice")
    data <-
      data %>% fix_usg_organization_col(org_col = "nameCommandSub")

    data <- distinct(data)

    if (exclude_awards) {
      data <-
        data %>%
        filter(is.na(nameAwardee)) %>%
        filter(is.na(dateAward))
    }

    if (only_active) {
      if (include_unknown_responses) {
        data <-
          data %>%
          filter(isActive | is.na(isActive))
      } else {
        data <-
          data %>%
          filter(isActive)
      }


    }

    if (exclude_archived) {
      data <-
        data %>%
        filter(dateArchive >= Sys.Date() | is.na(dateArchive))
    }

    df_descriptions <-
      data %>%
      filter(!is.na(descriptionSolicitation)) %>%
      distinct(descriptionSolicitation) %>%
      mutate(html = glue("<html>{descriptionSolicitation}</html>") %>% as.character())

    descriptions <-
      seq_along(df_descriptions$html) %>%
      map_chr(function(x) {
        df_descriptions$html[[x]] %>%
          stri_enc_toascii() %>%
          read_html() %>%
          html_text() %>%
          stringi::stri_trans_general("Latin-ASCII") %>%
          stri_enc_toascii() %>% str_replace_all(" \032 ", " ") %>% str_to_upper() %>%
          str_squish()
      })

    df_descriptions <-
      df_descriptions %>%
      mutate(descriptionSolicitationActual = descriptions) %>%
      select(-html)

    data <-
      data %>%
      left_join(df_descriptions, by = "descriptionSolicitation") %>%
      select(-descriptionSolicitation) %>%
      rename(descriptionSolicitation = descriptionSolicitationActual)
    rm(df_descriptions)
    rm(descriptions)
    gc()

    data <-
      data %>%
      .remove_na()

    data <- data %>%
      mutate(
        namePrimaryContact = namePrimaryContact %>% stringi::stri_enc_toascii() %>%
          stri_unescape_unicode() %>%
          str_remove_all("\032") %>%
          str_replace_all("\\|", " ") %>%
          str_squish() %>%
          str_to_upper()
      )


    df_naics <-
      data %>%
      filter(!is.na(idNAICS)) %>%
      distinct(idNAICS)

    dict_naics <-
      dictionary_naics_codes() %>%
      group_by(idNAICS) %>%
      slice(1) %>%
      ungroup() %>%
      mutate(idNAICS = as.numeric(idNAICS))

    data <-
      data %>%
      mutate(idNAICS = as.numeric(idNAICS)) %>%
      left_join(dict_naics, by = "idNAICS") %>%
      group_by(idNotice) %>%
      slice(1) %>%
      ungroup()

    if (join_address) {
      data <-
        data %>%
        build_address()
    }

    data <- data %>%
      mutate(hasResponseDate = !is.na(dateResponse),
           hasAwardID = !is.na(idAward),
           idSolicitationClean = idSolicitation %>% str_remove_all("\\-|\\_|\\(|\\)"))

    if (snake_names) {
      data <-
        data %>%
        clean_names()
    }

    data
  }



# historic_contracts ------------------------------------------------------


#' Historic USA Spending extracts
#'
#' Links for annual
#' contract solicitations by
#' year.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' dictionary_usa_spending_contract_archive()
dictionary_usa_spending_contract_archive <-
  memoise::memoise(function() {
    json <-
      "https://beta.sam.gov/api/prod/fileextractservices/v1/api/listfiles?random=1578940065681&domain=Contract%20Opportunities/Archived%20Data" %>%
      fromJSON(simplifyDataFrame = T)

    df <- json[[1]][[1]]

    urlCSV <- df[["_links"]]$self$href %>% map_chr(URLencode)

    data <-
      df[, 1:4] %>%
      as_tibble() %>%
      select(1:2) %>%
      setNames(c("slugFile", "dateModified")) %>%
      mutate(dateModified = mdy(dateModified)) %>%
      mutate(yearData = slugFile %>% substr(3, 6) %>% as.numeric()) %>%
      select(yearData, everything()) %>%
      mutate(urlCSV)

    data
  })


.parse_contract_archive_url <-
  function(url = "https://s3.amazonaws.com/falextracts/Contract Opportunities/Archived Data/FY2012_archived_opportunities.csv",
           join_address = T) {

    data <-
      url %>%
      URLencode() %>%
      vroom()

    data <-
      data %>%
      .munge_biz_opps_names()

    upper_cols <-
      data %>% select_if(is.character) %>% select(-matches("email|idNotice|url")) %>% names()

    lower_cols <-
      data %>% select_if(is.character) %>% select(matches("email|url")) %>% names()

    data <-
      data %>%
      mutate_at(upper_cols,
                list(function(x) {
                  x %>%
                    stringi::stri_enc_toascii() %>%
                    str_remove_all("\032") %>%
                    str_replace_all("\\|", " ") %>%
                    str_squish() %>%
                    str_to_upper() %>%
                    str_remove_all("\\_|\\~|--") %>%
                    # stri_unescape_unicode() %>%
                    stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")

                }))

    data <- data %>%
      mutate(
        namePrimaryContact = namePrimaryContact %>%
          stringi::stri_enc_toascii() %>%
          str_remove_all("\032") %>%
          str_replace_all("\\|", " ") %>%
          str_squish() %>%
          str_to_upper() %>%
          str_remove_all("\\_|\\~|--") %>%
          stri_unescape_unicode() %>%
          stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
      )

    data <-
      data %>%
      mutate_at(lower_cols,
                list(function(x) {
                  x %>%
                    stringi::stri_enc_toascii() %>%
                    str_remove_all("\032") %>%
                    str_replace_all("\\|", " ") %>%
                    str_squish() %>%
                    str_to_upper() %>%
                    str_remove_all("\\_|\\~|--") %>%
                    stri_unescape_unicode() %>%
                    stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
                    str_to_lower()
                }))

    data <-
      data %>% mutate_if(is.character, str_squish)

    if (data %>% hasName("zipcodeOrganization")) {
      data <- data %>%
        mutate(zipcodeOrganization = as.character(zipcodeOrganization))
    }

    if (data %>% hasName("nameSolicitation")) {
      data <-
        data %>%
        mutate(
          nameSolicitation = nameSolicitation %>% str_remove_all("^[A-Z]--") %>%
            str_remove_all("^[0-9][0-9]--") %>%
            str_squish() %>% str_to_upper()
        )
    }

    date_cols <-
      data %>%
      select_if(is.character) %>%
      select(matches("^datetime[A-Z]|^date[A-Z]")) %>% names()

    data <- data %>%
      mutate(amountAward = parse_number(as.character(amountAward)))

    if (length(date_cols) > 0) {
      data <-
        data %>%
        mutate_at(date_cols,
                  list(function(x) {
                    case_when(is.na(x) ~ NA_character_,
                              TRUE ~ x %>% substr(1, 10)) %>% ymd()
                  }))
    }

    date_cols <-
      data %>% select(matches("^date")) %>% names()

    df_years <-
      data %>%
      select(date_cols) %>%
      transmute_at(tidyselect::all_of(date_cols), year)

    names(df_years) <-
      names(df_years) %>% str_replace_all("datetime", "year") %>% str_replace_all("date", "year")

    data <-
      data %>%
      bind_cols(df_years)

    data <-
      data %>%
      mutate(
        hasNoticeUpdates = typeNotice != typeNoticeBase,
        isActive = case_when(isActive == "YES" ~ TRUE,
                             TRUE ~ FALSE)
      ) %>%
      .remove_na()



    if (data %>% hasName("nameAwardee")) {
      data <- data %>%
        mutate(
          nameAwardee = case_when(
            nameAwardee == "NULL" ~ NA_character_,
            nameAwardee == "VARIOUS" ~ "MULTIPLE",
            TRUE ~ nameAwardee
          ),
          isAward = !is.na(nameAwardee),
          hasAwardeeDUNS = nameAwardee %>% str_detect("DUNS:") %>% coalesce(FALSE)
        )
    }


    if (data %>% hasName("typeSetAside")) {
      data <-
        data %>%
        separate(
          typeSetAside,
          sep = "\\(FAR",
          into = c("typeSetAside", "codeFAR"),
          extra = "drop",
          fill = "right"
        ) %>%
        mutate_at(c("typeSetAside", "codeFAR"),
                  list(function(x) {
                    x %>% str_remove_all("\\(|\\)") %>%
                      str_remove_all("\\HUBZONE|\\IEE|\\EDWOSB|\\SDVOSB") %>%
                      str_squish()
                  })) %>%
        mutate(
          hasSetAside = !is.na(typeSetAside) %>% coalesce(FALSE),
          isSoleSource = typeSetAside %>% str_detect("SOLE SOURCE") %>% coalesce(FALSE)
        )
    }


    df_solicitation_groups <-
      data %>%
      filter(!is.na(idSolicitationGroup)) %>%
      count(idSolicitationGroup) %>%
      mutate(
        char = nchar(idSolicitationGroup),
        hasPSCExact = char > 2,
        typePSC = case_when(
          idSolicitationGroup %>% str_detect("[A-Z]") ~ "SERVICE",
          TRUE ~ "PRODUCT"
        ),
        codeProductService = idSolicitationGroup,
        idSolicitationGroupActual  = case_when(
          typePSC == "PRODUCT" ~ idSolicitationGroup %>% substr(1, 2),
          TRUE ~ idSolicitationGroup %>% substr(1, 1)
        )
      ) %>%
      select(-c(n, char)) %>%
      mutate_at(c("codeProductService", "idSolicitationGroup"),
                as.character) %>%
      left_join(
        dictionary_psc_active() %>%
          select(
            codeProductService,
            nameSolicitationGroup,
            nameProductService
          ),
        by = "codeProductService"
      )

    data <-
      data %>%
      mutate_at(c("idSolicitationGroup"),
                as.character) %>%
      left_join(df_solicitation_groups, by = "idSolicitationGroup") %>%
      select(-idSolicitationGroup) %>%
      rename(idSolicitationGroup = idSolicitationGroupActual) %>%
      select(one_of(names(data)), everything()) %>%
      select(idNotice:idSolicitation,
             names(
               df_solicitation_groups %>% select(-idSolicitationGroupActual)
             ),
             everything())


    data <-
      data %>%
      mutate(
        isActive = as.Date(datetimeResponse) > Sys.Date(),
        dateResponse = as.Date(datetimeResponse),
        countDaysToRespond =  (as.Date(datetimeResponse) - Sys.Date()) %>% as.numeric(),
        countDaysToRespond = case_when(is.na(countDaysToRespond) ~ 0,
                                       TRUE ~ countDaysToRespond),
        countDaysToRespond = pmax(0, countDaysToRespond, na.rm = T),
        countDaysOnline = (Sys.Date() - as.Date(datetimePublished)) %>% as.integer(),
        countDaysOpenToRespond = (as.Date(datetimeResponse) - as.Date(datetimePublished)) %>% as.numeric(),
        countDaysOpenToRespond = pmax(0, countDaysOpenToRespond, na.rm = T),
        urlOpportunityAPI = glue(
          "https://beta.sam.gov/api/prod/opps/v2/opportunities/{idNotice}"
        ) %>% as.character(),
        urlOpportunityAttachmentAPI = glue(
          "https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources?excludeDeleted=false&withScanResult=false"
        ) %>% as.character(),
        urlOpportunityAttachmentZIP = glue(
          "https://beta.sam.gov/api/prod/opps/v3/opportunities/{idNotice}/resources/download/zip"
        ) %>% as.character()
      )

    if (data %>% hasName("hasPSCExact")) {
      data <- data %>%
        mutate(hasPSCExact = hasPSCExact %>% coalesce(FALSE))
    }


    if (data %>% hasName("descriptionSolicitation")) {
      data <-
        data %>%
        filter(!descriptionSolicitation %>% str_detect("New Action IEJ Amendment")) %>%
        mutate_at(c("nameSolicitation", "descriptionSolicitation"),
                  list(function(x) {
                    x %>%
                      stringi::stri_enc_toascii() %>%
                      str_remove_all("\032") %>%
                      str_replace_all("\\|", " ") %>%
                      str_squish() %>%
                      str_remove_all("\\_|\\~|--") %>%
                      stri_replace_all_charclass("\\p{WHITE_SPACE}", " ")
                  }))

      df_descriptions <-
        data %>%
        filter(!is.na(descriptionSolicitation)) %>%
        distinct(descriptionSolicitation) %>%
        mutate(html = glue("<html>{descriptionSolicitation}</html>") %>% as.character())

      .parse_html_description_safe <-
        possibly(.parse_html_description, NA)

      descriptions <-
        seq_along(df_descriptions$html) %>%
        map_chr(function(x) {
          x %>% message()
          description <-
            df_descriptions$html[[x]] %>%
            .parse_html_description_safe()
          description
        })

      df_descriptions <-
        df_descriptions %>%
        mutate(descriptionSolicitationActual = descriptions) %>%
        select(-html)


      data <-
        data %>%
        left_join(df_descriptions, by = "descriptionSolicitation") %>%
        select(-descriptionSolicitation) %>%
        rename(descriptionSolicitation = descriptionSolicitationActual) %>%
        mutate(urlCSV = url)

      rm(df_descriptions)
      rm(descriptions)

    }

    data <-
      .clean_usg_organizations(data = data, column = "nameDepartment")

    data <-
      .clean_usg_organizations(data = data, column = "nameCommandSub")

    data <-
      data %>% .clean_usg_organizations("nameOffice")

    data <- distinct(data) %>%
      .remove_na()


    if (data %>% hasName("idNAICS")) {
      dict_naics <-
        dictionary_naics_codes() %>%
        group_by(idNAICS) %>%
        slice(1) %>%
        ungroup()

      data <-
        data %>%
        left_join(dict_naics, by = "idNAICS") %>%
        group_by(idNotice) %>%
        slice(1) %>%
        ungroup()

    }

    if (join_address) {
      data <- data %>%
        build_address()
    }

    data <- data %>%
      mutate(
        hasResponseDate = !is.na(dateResponse),
        hasAwardID = !is.na(idAward),
        idSolicitationClean = idSolicitation %>% str_remove_all("\\-|\\_|\\(|\\)")
      )


    gc()

    data
  }

#' Parse vector of contract
#' archive urls
#'
#' @param urls vector of SAM bulk contract urls
#' @param snake_names if \code{TRUE} returns snaked names
#'
#' @return
#' @export
#'
#' @examples
parse_contract_archive_urls <-
  function(urls = c(
    "https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1970_archived_opportunities.csv",
    "https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1980_archived_opportunities.csv",
    "https://s3.amazonaws.com/falextracts/Contract%20Opportunities/Archived%20Data/FY1998_archived_opportunities.csv"
  ),snake_names = F,
  join_address = T) {
    options(future.globals.maxSize = 500 * 1024 ^ 10)
    all_data <-
      urls %>%
      map_dfr(function(url) {
        parts <- url %>% str_split("/") %>% flatten_chr()
        yearData <-
          parts[length(parts)] %>% substr(3,6) %>%
          as.numeric()

        .parse_contract_archive_url(url = url, join_address = join_address) %>%
          mutate(yearData) %>%
          select(yearData, everything())
      })

    if (snake_names) {
      all_data <-
        all_data %>%
        clean_names()
    }

    all_data
  }

#' Bulk Download of SAM Bulk Contract Opportunities
#'
#' @param contract_years vector of contract years available from \link{dictionary_usa_spending_contract_archive}
#' @param snake_names if `true` snake names
#' @param join_address if true joins address
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
bulk_sam_contract_archives <-
  function(contract_years = NULL,
           snake_names = F,
           join_address = T
           ) {
    dict_archives <- dictionary_usa_spending_contract_archive()

    if (length(contract_years) == 0) {

      stop("Please Enter contract years")

    }
    if (contract_years %>% length() > 0) {
      dict_archives <-
        dict_archives %>%
        filter(yearData %in% contract_years)
    }

    data <-
      dict_archives$urlCSV %>%
      parse_contract_archive_urls(snake_names = snake_names, join_address = join_address)

    data
  }

#' Cached SAM Contracts
#'
#' @param munge
#' @param snake_names
#'
#' @return
#' @export
#'
#' @examples
cached_active_sam_contracts <-
  function(munge = T, snake_names = T) {
    data <-
      read_rda("https://asbcllc.com/r_packages/govtrackR/sheldon/data/active_contracts.rda")

    if (munge) {
      data <- data %>%
        munge_lite(snake_names = snake_names)
    }

    data
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.