R/entities_stuff.R

Defines functions bis_entities .munge_bis_names dictionary_bis_names

Documented in bis_entities

# https://www.export.gov/article?id=Consolidated-Screening-List

dictionary_bis_names <-
  function(){
    tibble(nameBIS = c("source", "entity_number", "type", "programs", "name", "title",
                       "addresses", "federal_register_notice", "start_date", "end_date",
                       "standard_order", "license_requirement", "license_policy", "call_sign",
                       "vessel_type", "gross_tonnage", "gross_registered_tonnage", "vessel_flag",
                       "vessel_owner", "remarks", "source_list_url", "alt_names", "citizenships",
                       "dates_of_birth", "nationalities", "places_of_birth", "source_information_url",
                       "ids",
                       "Classification",
                       "Name",
                       "Prefix",
                       "First",
                       "Middle",
                       "Last",
                       "Suffix",
                       "Address 1",
                       "Address 2",
                       "Address 3",
                       "Address 4",
                       "City",
                       "State / Province",
                       "Country",
                       "Zip Code",
                       "DUNS",
                       "Exclusion Program",
                       "Excluding Agency",
                       "CT Code",
                       "Exclusion Type",
                       "Additional Comments",
                       "Active Date",
                       "Termination Date",
                       "Record Status",
                       "Cross-Reference",
                       "SAM Number",
                       "CAGE",
                       "NPI",
                       "Creation_Date"

                       ),
           nameActual = c("nameSource", "idEntity", "typeEntity", "programBIS", "nameParty", "titleParty",
                          "addressesParty", "detailsFederalRegisterNotice",
                          "dateStart", "dateEnd",
                          "isStandardOrder", "detailsLicenseRequirement", "detailsLicensePolicy",
                          "slugCallSign",
                          "typeVesel", "amountGrossTonnage", "amountGrossTonnageRegistered", "flagVessel",
                          "ownerVessel", "detailsRemarks", "urlSources", "namesAlternative", "countriesCitizenship",
                          "detailsDatesOfBirth", "detailsNationalities", "detailsPlacesOfBirth", "urlSourceInformation",
                          "detailsIdentifiers",

                          "typeClassification",
                          "nameEntity",
                          "prefixEntity",
                          "nameFirst",
                          "nameMiddle",
                          "nameLast",
                          "suffixEntity",
                          "address1",
                          "address2",
                          "address3",
                          "address4",
                          "city",
                          "state",
                          "country",
                          "zipcode",
                          "idDUNS",
                          "typeExclusionProgram",
                          "slugAgencyExcluding",
                          "codeCauseTreatment",
                          "typeExclusion",
                          "commentsExclusion",
                          "dateActive",
                          "dateTermination",
                          "statusRecord",
                          "descriptionCrossReference",
                          "idSAM",
                          "slugCAGE",
                          "idNationalProvider",
                          "dateCreated"
                          )
    )
  }

.munge_bis_names <-
  function(data) {
    dict_names <- dictionary_bis_names()
    fdps_names <-
      names(data)

    actual_names <-
      fdps_names %>%
      map_chr(function(name) {
        df_row <-
          dict_names %>% filter(nameBIS == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }

# entities ----------------------------------------------------------------

.dl_bis <-
  memoise::memoise(function() {
    data <-
      "https://api.trade.gov/consolidated_screening_list/search.csv?api_key=OHZYuksFHSFao8jDXTkfiypO" %>%
      data.table::fread(verbose = F, showProgress = FALSE) %>%
      as_tibble()

    data
  })

#' Bureau of Industry and Security Entity List
#'
#' Consolidated list of entities and individuals that
#' may be banned from doing business in the United States or
#' with U.S. parties.
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
bis_entities <-
  function(snake_names = F){
    data <- .dl_bis()
    data <-.munge_bis_names(data)

    data <- data %>%
      separate(nameSource, into = c("nameSource", "nameAgencyResponsible"), sep = "\\ - ",extra = "merge", fill = "right") %>%
      separate(nameSource, into  = c("nameSource", "slugSource"), sep = "\\(",extra = "merge", fill = "right") %>%
      mutate(slugSource = slugSource %>% str_remove_all("\\)")) %>%
      mutate_if(is.character, str_trim)

    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  ifelse(x == "", NA, x)
                })) %>%
      mutate(
        dateStart = dateStart %>% str_replace_all("0097", "1997") %>% ymd(),
        dateEnd = dateEnd %>% ymd()
      )

    data <- data %>%
      .munge_data(clean_address = F)

    data <- data %>%
      mutate_at(c("amountGrossTonnage", "amountGrossTonnageRegistered"),list(function(x){
        x %>% as.numeric() %>% formattable::comma(digits = 0)
      }))

    data <- data %>%
      mutate(idRow = 1:n())

    df_people <- data %>% filter(typeEntity == "INDIVIDUAL")

    df_people <-
      df_people %>% distinct(nameParty) %>%
      separate(
        nameParty,
        sep = "\\, ",
        into = c("nameLast", "nameFirst"),
        remove = F,
        extra = "merge"
      ) %>%
      mutate(namePartyClean = case_when(
        is.na(nameFirst) ~ nameLast,
        TRUE ~ str_c(nameFirst, nameLast, sep = " ")
      )) %>%
      select(-c(nameFirst, nameLast)) %>%
      left_join(df_people, by = "nameParty") %>%
      select(-nameParty) %>%
      rename(nameParty = namePartyClean) %>%
      select(one_of(names(df_people)))

    data <-
      data %>%
      filter(typeEntity != "INDIVIDUAL" | is.na(typeEntity)) %>%
      bind_rows(df_people) %>%
      arrange(idRow)

    data <- data %>%
      mutate_if(is.character,
                list(function(x) {
                  x %>% str_replace_all("\\; ", "\\ | ")
                }))

    df_birthdays <- data %>%
      distinct(idRow, detailsDatesOfBirth) %>%
      separate_rows(detailsDatesOfBirth, sep = "\\| ") %>%
      mutate_if(is.character, str_trim) %>%
      mutate(detailsDatesOfBirth = case_when(
        nchar(detailsDatesOfBirth) == 4 ~ glue("{detailsDatesOfBirth}-01-01"),
        TRUE ~ detailsDatesOfBirth
      )) %>%
      filter(!is.na(detailsDatesOfBirth)) %>%
      group_by(idRow) %>%
      summarise(detailsDatesOfBirth = as.character(detailsDatesOfBirth) %>% str_c(collapse =  " | ")) %>%
      ungroup()

    data <- data %>%
      select(-detailsDatesOfBirth) %>%
      left_join(df_birthdays, by = "idRow") %>%
      select(names(data), everything())

    data <- data %>%
      mutate(
        detailsLicensePolicy = case_when(
          detailsLicensePolicy %>% str_detect("PRESUMPTION OF DENIAL") ~ "PRESUMPTION OF DENIAL",
          detailsLicensePolicy %>% str_detect("CASE-BY-CASE BASIS") ~ "CASE-BY-CASE BASIS",
          detailsLicensePolicy %>% str_detect("032746.5 OF THE EAR") ~ "032746.5 OF THE EAR",
          TRUE ~ detailsLicensePolicy
        )
      )

    data <- data %>%
      mutate(
        detailsLicenseRequirement =
          case_when(
            detailsLicenseRequirement %in% c(
              "FOR ALL ITEMS SUBJECT TO THE EAR .",
              "FOR ALL ITEMS SUBJECT TO THE EAR. .",
              "FOR ALL ITEMS SUBJECT TO THE EAR",
              "FOR ALL ITEMS SUBJECT TO THE EAR."
            ) ~ "ITEMS SUBJECT TO EAR",
            TRUE ~ detailsLicenseRequirement
          )
      )

    if (snake_names){
      data <- clean_names(data)

    }

    data
  }

#' BIS Denied Persons List
#'
#' @param clean_address
#'
#' @return
#' @export
#'
#' @examples
bis_denied_persons <-
  memoise::memoise(function(clean_address = T) {
    data <- "https://www.bis.doc.gov/dpl/dpl.txt" %>% read_tsv()
    data <- data %>% clean_names()
    data <- data %>%
      rename(
        party = name,
        is_standard_order = standard_order,
        type_action  = action,
        zipcode = postal_code,
        date_last_updated = last_update,
        date_effective = effective_date,
        date_expiration = expiration_date
      ) %>%
      munge_data(clean_address = clean_address) %>%
      mutate(date_data = Sys.Date()) %>%
      clean_names()

    data
  })





# sam_entities ------------------------------------------------------------

#' SAM Entitiy URLS
#'
#' @return
#' @export
#'
#' @examples
dictionary_sam_entities <-
  memoise::memoise(function() {
    page <- "https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf" %>%
      read_html()
    nodes <- page %>% html_nodes(".overrideResetCSS+ .overrideResetCSS b~ a")
    urls <-
      nodes %>% html_attr("href") %>%
      str_c("https://www.sam.gov",.)

    files <-
      nodes %>% html_text() %>% str_to_upper()

    julian_dates <-
      urls %>%
      str_remove_all(".ZIP") %>%
      str_split("MONTHLY_") %>%
      map_chr(function(x){
        x[[2]]
      }) %>%
      as.numeric()


    tibble(
      dateData = ymd(julian_dates),
      nameFile = files,
      urlZIP = urls
    ) %>%
      mutate(typeFile = "SAM BULK",
             isASCII = nameFile %>% str_detect("ASCII"))
  })

.parse_sam_entity_zip <- function(url = "https://www.sam.gov/SAM/extractfiledownload?role=SAM-PUBLIC-UTF8&version=SAM&filename=SAM_PUBLIC_UTF-8_MONTHLY_20200607.ZIP") {
  outfile <- tempfile("download", fileext = ".zip")
  file <- download.file(url, outfile)
  unz_files <- outfile %>% unzip(exdir = "zip")

  data <-
    unz_files %>%
    fread()


  unz_files %>% unlink()
  file %>% unlink()
  outfile %>% unlink()
  unlink("zip", recursive = T, force = T)

  data <-
    data %>%
    select(-150)

  actual_names <- .dictionary_sam_extract_names() %>% pull(nameColumn)

  data <- data %>% setNames(actual_names)

  data
}

.sam_bulk_entities <-
  function(clean_address = T, return_message =T) {
    df_urls <-
      dictionary_sam_entities()

    df_urls <- df_urls %>% filter(!isASCII)
    url <- df_urls$urlZIP
    dateData <- df_urls$dateData
    data <- .parse_sam_entity_zip(url = url)
    data <- data %>%
      mutate(idDUNS = as.integer(idDUNS))


    data <-
      data %>% mutate_if(is.character,
                         list(function(x) {
                           x %>%
                             str_squish()
                         }))
    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x %in% c("", " ") ~ NA_character_,
                            TRUE ~ x)
                }))

    logical_cols <-
      data %>% select(matches("^is|^has")) %>% names()

    data <- data %>%
      mutate_at(logical_cols,
                list(function(x) {
                  case_when(x == "N" ~ FALSE,
                            x == "Y" ~ TRUE,
                            TRUE ~ NA)
                }))

    data <-
      data %>%
      left_join(dictionary_sam_parent_entity_types(), by = "slugEntityStructure")

    data <-
      data %>%
      mutate(slugSAMExtract = as.character(slugSAMExtract)) %>%
      left_join(dictionary_sam_extract_types(), by = "slugSAMExtract")

    data <- data %>%
      left_join(dictionary_sam_registration_purpose_codes(), by = "slugRegistrationPurpose")

    data <- data %>%
      mutate(isNonPublicSAMRegistered = case_when(slugPublicDisplay == "NPDY" ~ TRUE,
                                                  TRUE ~ FALSE)) %>%
      select(-slugPublicDisplay)

    data <-
      data %>%
      mutate(isExpiredSAMEntity = typeSAMExtract == "EXPIRED") %>%
      select(idDUNS, isExpiredSAMEntity, everything())

    df_types <-
      data %>%
      filter(!is.na(dataBusinessTypes)) %>%
      select(idDUNS, dataBusinessTypes) %>%
      separate_rows("dataBusinessTypes", sep = "~") %>%
      rename(slugBusinessType = dataBusinessTypes) %>%
      filter(!is.na(slugBusinessType))

    df_types <-
      df_types %>%
      left_join(dictionary_sam_business_types(), by = "slugBusinessType") %>%
      nest(dataBusinessTypes = c(slugBusinessType, typeBusinessSAM))

    data <-
      data %>%
      select(-dataBusinessTypes) %>%
      left_join(df_types, by = "idDUNS") %>%
      select(one_of(names(data)), everything())

    rm(df_types)
    gc()

    df_response_codes <-
      data %>%
      filter(!is.na(dataDisasterResponse)) %>%
      select(idDUNS, dataDisasterResponse) %>%
      rename(slugDisasterResponseCode = dataDisasterResponse) %>%
      separate_rows(slugDisasterResponseCode, sep = "\\~") %>%
      filter(!is.na(slugDisasterResponseCode)) %>%
      mutate(
        typeAreaResponse = case_when(
          slugDisasterResponseCode == "ANY" ~ "ANY AREA",
          slugDisasterResponseCode %>% str_detect("^STA") ~ "STATE",
          slugDisasterResponseCode %>% str_detect("^CTY") ~ "COUNTY",
          slugDisasterResponseCode %>% str_detect("^MSA") ~ "METROPOLITAN SERVICE AREA"
        ),
        codeAreaResponse =
          case_when(
            slugDisasterResponseCode == "ANY" ~  NA_character_,
            TRUE ~ substr(slugDisasterResponseCode,
                          4,
                          nchar(slugDisasterResponseCode))
          )
      )

    df_response_codes <-
      df_response_codes %>%
      nest(
        dataDisasterResponse = c(slugDisasterResponseCode,
                                 typeAreaResponse,
                                 codeAreaResponse)
      )

    data <-
      data %>%
      select(-dataDisasterResponse) %>%
      left_join(df_response_codes, by = "idDUNS")

    rm(df_response_codes)
    gc()

    dict_naics <-
      dictionary_naics_codes()

    data <-
      data %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    data <-
      data %>%
      left_join(dict_naics %>% select(idNAICSPrimary = idNAICS, nameNAICSPrimary = nameNAICS),
                by = "idNAICSPrimary") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics <-
      data %>%
      select(idDUNS, idNAICSPrimary, dataNAICS) %>%
      filter(!is.na(dataNAICS)) %>%
      separate_rows(dataNAICS, sep = "\\~") %>%
      rename(codeNAICS = dataNAICS) %>%
      filter(!is.na(codeNAICS)) %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics <-
      df_naics %>%
      mutate(hasSmallBusinessIndicator = codeNAICS %>% str_detect("Y$")) %>%
      mutate(idNAICS = parse_number(codeNAICS),
             isPrimaryNAICS = idNAICS == idNAICSPrimary) %>%
      select(idDUNS, isPrimaryNAICS, idNAICS, hasSmallBusinessIndicator)

    df_naics <-
      df_naics %>%
      left_join(dict_naics, by = "idNAICS")

    df_naics <- df_naics %>% select(-yearCodeBookNAICS)


    data <-
      data %>% select(-dataNAICS) %>%
      left_join(df_naics %>%
                  nest(
                    dataNAICS = c(
                      isPrimaryNAICS,
                      idNAICS,
                      hasSmallBusinessIndicator,
                      nameNAICS,
                      isPrimaryNAICS
                    )
                  ), by = "idDUNS") %>%
      distinct() %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    df_naics_exceptions <-
      data %>%
      select(idDUNS, dataNAICSExceptions) %>%
      filter(!is.na(dataNAICSExceptions)) %>%
      separate_rows(dataNAICSExceptions, sep = "\\~") %>%
      rename(slugNAICS = dataNAICSExceptions) %>%
      mutate_if(is.character, str_squish) %>%
      filter(slugNAICS != "") %>%
      mutate(idNAICS = slugNAICS %>% substr(1, 6) %>% as.integer()) %>%
      mutate(isException = T) %>%
      left_join(dict_naics %>% select(idNAICS, nameNAICS), by = "idNAICS")

    df_naics_exceptions <-
      df_naics_exceptions %>%
      mutate(
        slugExceptions = slugNAICS %>% str_remove_all("[0-9]"),
        countExceptions = slugExceptions %>% nchar()
      ) %>%
      nest(
        dataNAICSExceptions = c(
          slugNAICS,
          idNAICS,
          isException,
          nameNAICS,
          slugExceptions,
          countExceptions
        )
      )

    data <-
      data %>%
      select(-dataNAICSExceptions) %>%
      left_join(df_naics_exceptions, by = "idDUNS") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    rm(df_naics)
    rm(dict_naics)
    gc()


    dict_psc <-
      dictionary_psc_active(only_active = T)

    df_psc <-
      data %>%
      filter(!is.na(dataProductServiceCodes)) %>%
      select(idDUNS, dataProductServiceCodes) %>%
      separate_rows(dataProductServiceCodes, sep = "\\~") %>%
      rename(codeProductService = dataProductServiceCodes)

    df_psc <-
      df_psc %>%
      filter(!codeProductService == "") %>%
      mutate(
        letternumber = codeProductService %>% substr(1, 1),
        isNumber = letternumber %>% str_detect("[0-9]")
      ) %>%
      mutate(
        idSolicitationGroup = case_when(
          isNumber ~ codeProductService %>% substr(1, 2),
          TRUE ~ codeProductService %>% substr(1, 1)
        )
      ) %>%
      select(-c(letternumber, isNumber))


    dict_psc <-
      dict_psc %>% select(typePSC,
                          codeProductService,
                          nameProductService)

    df_psc <-
      df_psc %>%
      left_join(dict_psc, by = c("codeProductService"))

    df_psc <-
      df_psc %>%
      nest(
        dataProductServiceCodes = c(
          typePSC,
          nameProductService,
          idSolicitationGroup,
          codeProductService
        )
      )

    data <-
      data %>%
      select(-dataProductServiceCodes) %>%
      left_join(df_psc, by = "idDUNS") %>%
      group_by(idDUNS) %>%
      slice(1) %>%
      ungroup()

    if (data %>% hasName("urlCompany")) {
      data <- data %>%
        mutate(
          urlCompany = case_when(
            urlCompany %>% str_detect("http:|https:") ~ urlCompany,
            is.na(urlCompany) ~ NA_character_,
            TRUE ~ glue("https://{urlCompany}") %>% as.character()
          )
        )
    }

    rm(df_psc)
    gc()

    df_sba <-
      data %>%
      filter(!is.na(dataSBACodes)) %>%
      select(idDUNS, dataSBACodes) %>%
      separate_rows(dataSBACodes, sep = "\\~") %>%
      rename(codeSetAside = dataSBACodes)

    df_sba <- df_sba %>%
      mutate(
        typeSetAside = case_when(
          codeSetAside == "XX" ~ "HUBZONE CERTIFIED",
          codeSetAside %>% str_detect("^A6") ~ "8A PARTICPANT",
          codeSetAside %>% str_detect("^JT") ~ "8A JOINT VENTURE"
        ),
        slugSetAside = codeSetAside %>% substr(1, 2)
      )

    df_sba <- df_sba %>%
      mutate(
        dateExpirationSBA = case_when(
          slugSetAside == "XX" ~ NA_character_,
          TRUE ~ codeSetAside %>% substr(3, nchar(codeSetAside))
        ) %>% as.numeric() %>% ymd()
      ) %>%
      select(-codeSetAside)

    df_sba <- df_sba %>%
      nest(dataSBA = c(typeSetAside, slugSetAside, dateExpirationSBA))

    data <-
      data %>%
      select(-dataSBACodes) %>%
      left_join(df_sba, by = "idDUNS")

    date_names <-
      data %>%
      select(matches("date")) %>%
      select(-matches("dateCompanyStart")) %>%
      names()

    data <-
      data %>%
      mutate_at(date_names,
                list(function(x) {
                  x %>% ymd()
                }))


    data <-
      data %>%
      mutate(
        charDate = nchar(dateCompanyStart),
        dateCompanyStart = as.integer(dateCompanyStart),
        ymdDate = case_when(
          charDate == 7 ~ dateCompanyStart %>% substr(2, 7) %>% as.integer(),
          TRUE ~ dateCompanyStart
        ),
        date = ymd(ymdDate),
        year = year(date),
        ymdDate = case_when(
          year <= 1120 ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
          idDUNS %in% c(
            942278250,
            807634428,
            81559260,
            54766010,
            64402055,
            78443143,
            967454070
          ) ~ ymdDate %>% substr(3, nchar(ymdDate)) %>% as.character(),
          TRUE ~ as.character(ymdDate)
        ),
        date = ymd(ymdDate),
        year = year(date)
      ) %>%
      select(-c(dateCompanyStart, year, charDate, ymdDate)) %>%
      rename(dateCompanyStart = date) %>%
      select(one_of(names(data)), everything())


    data <-
      data %>%
      mutate_at(c("nameCompanyLegal",
                  "nameCompanyDivision",
                  "nameCompanyDBA"),
                list(function(x) {
                  x %>% str_replace_all("\\, INC.|\\, INC|\\,INCORPORATED", " INC") %>%
                    str_replace_all("\\, LLC|\\, L.L.C|\\, L.L.C.", " LLC") %>%
                    str_replace_all("\\, LLP|\\, L.L.P|\\, L.L.P.", " LLP") %>%
                    str_replace_all("\\, LP|\\, L.P|\\, L.P.", " LP") %>%
                    str_replace_all("\\, LTD.|\\, LTD", " LTD") %>%
                    str_replace_all("\\, COMPANY", " COMPANY") %>%
                    str_replace_all("\\,CORP", "  CORP") %>%
                    str_remove_all("\\*") %>%
                    str_remove_all('\\"')

                }))

    data <-
      data %>%
      mutate(
        countDaysPostFoundingToSAMRegistration = (dateActivation - dateCompanyStart) %>% as.integer(),
        countDaysExistence = (Sys.Date()-dateCompanyStart) %>% as.integer()
      )

    lower_names <-
      data %>% select_if(is.character) %>%
      select(matches("^email|^url")) %>%
      names()

    data <- data %>%
      mutate_at(lower_names, str_to_lower)

    upper_names <-
      data %>% select_if(is.character) %>% select(-one_of(lower_names)) %>% names()

    data <-
      data %>%
      mutate_at(upper_names, str_to_upper)

    data <-
      data %>%
      mutate(
        nameContactGovernment = case_when(
          !is.na(nameFirstPointOfContactGovt) &
            !is.na(nameLastPointOfContactGovt) ~ str_c(
              nameFirstPointOfContactGovt,
              nameLastPointOfContactGovt,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactGovernmentAlt = case_when(
          !is.na(nameFirstPointOfContactGovtAlt) &
            !is.na(nameLastPointOfContactGovtAlt) ~ str_c(
              nameFirstPointOfContactGovtAlt,
              nameLastPointOfContactGovtAlt,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactPastPerformance = case_when(
          !is.na(nameFirstPointOfContactPastPerformance) &
            !is.na(nameLastPointOfContactPastPerformance) ~ str_c(
              nameFirstPointOfContactPastPerformance,
              nameLastPointOfContactPastPerformance,
              sep = " "
            ),
          TRUE ~ NA_character_
        ),
        nameContactElectronic = case_when(
          !is.na(nameFirstPointOfContactElectronicBusiness) &
            !is.na(nameLastPointOfContactElectronicBusiness) ~ str_c(
              nameFirstPointOfContactElectronicBusiness,
              nameLastPointOfContactElectronicBusiness,
              sep = " "
            ),
          TRUE ~ NA_character_
        )
      )

    data <- data %>%
      mutate(
        typeEntity =
          case_when(
            typeEntityStructure == "OTHER" ~ "OTHER",
            typeEntityStructure %>% str_detect("GOVERNMENT") ~ "GOVERNMENT",
            TRUE ~ "CORPORATION"
          ),
        isTaxExemptEntity = typeEntityStructure %in% c(
          "FOREIGN COUNTRY OR GOVERNMENT",
          "TAX EXEMPT CORPORATE ENTITY",
          "UNITED STATES GOVERNMENT ENTITY"
        )
      )

    data <- data %>%
      mutate(
        isForeignLocatedCompany = codeCountryCompany != "USA",
        isForeignIncorporatedcompany = codeCountryIncorporation != "USA"
      )


    data <- data %>%
      mutate(
        hasBusinessTypes = dataBusinessTypes %>% map_dbl(length) > 0,
        hasSBA = dataSBA %>% map_dbl(length) > 0,
        hasProductServiceCodes = dataProductServiceCodes %>% map_dbl(length) > 0,
        hasNAICS = dataNAICS %>% map_dbl(length) > 0,
        hasDisasterResponse = dataDisasterResponse %>% map_dbl(length) > 0,
        hasNAICSExceptions = dataNAICSExceptions %>% map_dbl(length) > 0
      )

    data <-
      data %>%
      mutate(dateSAMData = dateData) %>%
      select(dateSAMData, everything())

    if (clean_address) {
      data <- data %>%
        build_address(return_message = return_message)
    }
    data
  }

#' Bulk SAM Entity Registration Data
#'
#' Downloads bulk SAM registered entities based on
#' user specifications and filters from SAM monthly release
#'
#'
#' @param only_active
#' @param sam_extract_filter
#' @param psc_filter
#' @param naics_filter
#' @param incorporation_company_filter
#' @param company_state_filter
#' @param company_zipcode_filter
#' @param company_country_filter
#' @param set_aside_filter
#' @param business_type_filter
#' @param return_message
#' @param snake_names
#' @param clean_address
#'
#' @return `tibble`
#' @export
#'
#' @examples
bulk_sam_entities <-
  function(only_active = F,
           sam_extract_filter = NULL,
           psc_filter = NULL,
           naics_filter = NULL,
           incorporation_company_filter = NULL,
           company_state_filter = NULL,
           company_zipcode_filter = NULL,
           company_country_filter = NULL,
           set_aside_filter = NULL,
           business_type_filter = NULL,
           snake_names = F,
           clean_address = T,
           return_message = T) {

    data <-
      .sam_bulk_entities(clean_address = clean_address, return_message = return_message)

    if (only_active) {
      if (return_message) {
        "Filtering for only active SAM companies" %>% message()
      }
      data <- data %>%
        filter(!isExpiredSAMEntity)
    }

    if (length(psc_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a registered PSC:\n{str_c(psc_filters, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasProductServiceCodes) %>%
        select(idDUNS, dataProductServiceCodes) %>%
        unnest_legacy() %>%
        filter(codeProductService %in% psc_filters) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(naics_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a registered naics:\n{str_c(naics_filters, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasNAICS) %>%
        select(idDUNS, dataNAICS) %>%
        unnest_legacy() %>%
        filter(idNAICS %in% naics_filters) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(set_aside_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies with a set aside code of:\n{str_c(set_aside_filter, collapse = '\n')}"
        ) %>% message()
      }

      filter_duns <-
        data %>%
        filter(hasSBA) %>%
        select(idDUNS, dataSBA) %>%
        unnest_legacy() %>%
        filter(slugSetAside %in% str_to_upper(set_aside_filter)) %>%
        pull(idDUNS)

      data <- .filter_duns(data = data, duns = filter_duns)
    }

    if (length(incorporation_company_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies domiciled in:\n{str_c(incorporation_company_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <-
        data %>%
        filter(codeCountryIncorporation %in% str_to_upper(incorporation_company_filter))
    }

    if (length(company_country_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies head-quartered in:\n{str_c(company_country_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <-
        data %>%
        filter(codeCountryCompany %in% str_to_upper(company_country_filter))
    }

    if (length(company_state_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for companies headquarters in:\n{str_c(company_state_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <- data %>%
        filter(stateCompany %in% str_to_upper(company_state_filter))
    }

    if (length(company_zipcode_filter)) {
      if (return_message) {
        glue(
          "Filtering for companies located in zipcode(s):\n{str_c(company_zipcode_filter, collapse = '\n')}"
        ) %>% message()
      }

      data <- data %>%
        filter(zipcodeCompany %in% str_to_upper(as.character(company_zipcode_filter)))


    }

    if (length(business_type_filter) > 0) {
      if (return_message) {
        glue(
          "Filtering for business types:\n{str_c(business_type_filter, collapse = '\n')}"
        ) %>% message()
      }
      filter_duns <-
        data %>%
        filter(hasBusinessTypes) %>%
        select(idDUNS, dataBusinessTypes) %>%
        unnest_legacy() %>%
        filter(slugBusinessType %in% str_to_upper(business_type_filter)) %>%
        pull(idDUNS) %>%
        unique()

      data <-
        .filter_duns(data = data, duns = filter_duns)
    }

    name_cols <- data %>% select(matches("name")) %>% names()

    data <- data %>%
      mutate_at(name_cols,
                list(function(x) {
                  x %>% str_replace_all(" OF, ", " OF ")
                }))

    if (data %>% hasName("slugDUNS")) {
      data <- data %>% mutate(slugDUNS = as.character(slugDUNS))
    }

    data <- data %>%
      mutate_if(is.numeric, as.numeric)

    if (return_message) {
      "Generating FAR and DFAR link urls" %>% message()
    }

    df_duns <- data %>%
      distinct(idDUNS) %>%
      mutate(char = nchar(idDUNS),
             zeros = 9 - char)


    df_duns <- df_duns %>%
      filter(zeros != 0) %>%
      mutate(
        slugZero = zeros %>% map_chr(function(x) {
          rep("0", times = x) %>% str_c(collapse = "")
        }),
        slugDUNS = glue("{slugZero}{idDUNS}") %>% as.character()
      ) %>%
      select(idDUNS, slugDUNS) %>%
      bind_rows(df_duns %>% filter(zeros == 0) %>%
                  select(idDUNS) %>%
                  mutate(slugDUNS = as.character(idDUNS))) %>%
      mutate(
        urlFARPDF = glue(
          "https://www.sam.gov/SAM/filedownload?pdfType=1&duns={slugDUNS}"
        ) %>% as.character(),
        urlDFARPDF = glue(
          "https://www.sam.gov/SAM/filedownload?pdfType=2&duns={slugDUNS}"
        ) %>% as.character()
      ) %>%
      select(-slugDUNS)

    data <-
      data %>%
      left_join(df_duns, by = "idDUNS")


    if (snake_names) {
      data <-
        janitor::clean_names(data)
    }

    data
  }



# sam_exclusions ----------------------------------------------------------


### https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf
###

#' SAM Exclusion URLs
#'
#' List of the most recent SAM exclusion URLS
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' dictionary_sam_exclusion_urls()
dictionary_sam_exclusion_urls <-
  function() {
    page <- "https://www.sam.gov/SAM/pages/public/extracts/samPublicAccessData.jsf" %>%
      read_html()

    urls <- page %>% html_nodes("tr~ tr+ tr a") %>% html_attr("href") %>%
      str_c("https://www.sam.gov",.)

    julian_dates <- urls %>%
      str_remove_all(".ZIP") %>%
      str_split("Extract_") %>%
      map_chr(function(x){
        x[[2]]
      }) %>%
      as.numeric()

    year <-
      substr(julian_dates,1,2) %>% str_c("20") %>% as.numeric()

    day_of_year <-
      julian_dates %>% substr(3, nchar(julian_dates)) %>% as.numeric()

    start_date <- glue("{year}-01-01") %>% ymd() %>% unique()

    dates <- start_date + day_of_year

    tibble(dateData = dates, urlZIP = urls) %>%
      mutate(typeFile = "SAM EXCLUSIONS")

  }

.parse_sam_exclusion_zip <- function(url = "https://www.sam.gov/SAM/extractfiledownload?role=WW&version=EPLSPUB&filename=SAM_Exclusions_Public_Extract_20050.ZIP") {
  outfile <- tempfile("download", fileext = ".zip")
  file <- download.file(url, outfile)
  unz_files <- outfile %>% unzip(exdir = "zip")

  data <-
    unz_files %>% fread(showProgress = FALSE) %>% as_tibble()

  data <-
    data %>%
    .munge_bis_names()

  unz_files %>% unlink()
  file %>% unlink()
  unlink("attachments", recursive = T, force = T)


  data %>% select(matches("date"))

  data <- data %>%
    mutate_at(c("dateTermination", "dateActive"),
              mdy) %>%
    mutate_at("dateCreated", ymd)

  data <- data %>%
    .munge_data(parse_dates = F, clean_address = F)

  data <- data %>%
    mutate(
      idNationalProvider = as.numeric(idNationalProvider),
      idNationalProvider = case_when(idNationalProvider == 0 ~ NA_real_,
                                     TRUE ~ idNationalProvider),
      namePerson = case_when(
        namePerson == "" ~ NA_character_,
        TRUE ~ namePerson
      ),
      nameParty = case_when(
        is.na(nameEntity) ~ namePerson,
        TRUE ~ nameEntity
      )
    ) %>%
    select(typeClassification, nameParty, everything()) %>%
    .remove_na()

  data <-
    data %>%
    mutate(descriptionCrossReference = descriptionCrossReference %>% str_remove_all("^ALSO"),
           urlZIP = url)

  data

}

#' SAM Entity Exclusion data
#'
#' Returns information about people and entities
#' on the most recent SAM exclusion list
#'
#' @param snake_names
#'
#' @return
#' @export
#'
#' @examples
#' sam_exclusions()
sam_exclusions <-
  function(snake_names = T, clean_address = T) {
    df_urls <-
      dictionary_sam_exclusion_urls()

    df_today <-
      df_urls %>%
      filter(dateData == max(dateData))

    data <-
      df_today$urlZIP %>% .parse_sam_exclusion_zip()

    data <- data %>%
      left_join(df_today, by = "urlZIP")

    data <- data %>% munge_data(snake_names = snake_names, clean_address = clean_address)

    data
  }


#' Join SAM data
#'
#' @param data
#' @param duns_column
#' @param snake_names
#' @param sam_select_columns
#'
#' @return
#' @export
#'
#' @examples
tbl_sam_data <-
  function(data, duns_column = "id_duns",
           sam_select_columns = c(
             "dateSAMData",
             "idDUNS",
             "slugCAGE",
             "nameCompanyLegalClean",
             "nameCompanyLegal",
             "isExpiredSAMEntity",
             "slugDeptDefenseAddressCode",
             "dateRegistrationInitial",
             "dateExpiration",
             "dateLastUpdate",
             "dateActivation",
             "dateCompanyStart",
             "monthdayFiscalYearEnd",
             "urlCompany",
             "nameCompanyDBA",
             "nameCompanyDivision",
             "countSBATypes",
             "typeEntityStructure",
             "typeSAMExtract",
             "typeRegistrationPurpose",
             "isNonPublicSAMRegistered",
             "nameNAICSPrimary",
             "nameSectorNAICS",
             "nameSubSectorNAICS",
             "nameIndustryGroupNAICS",
             "nameIndustryNAICS",
             "nameContactGovernment",
             "nameContactGovernmentAlt",
             "nameContactPastPerformance",
             "nameContactElectronic",
             "typeEntity",
             "isTaxExemptEntity",
             "isForeignLocatedCompany",
             "isForeignIncorporatedcompany",
             "locationCompany",
             "addressStreet1Company",
             "addressStreet2Company",
             "cityStateCompany",
             "locationMailingCompany",
             "cityStateMailingCompany",
             "cityCompany",
             "stateCompany",
             "zipcodeCompany",
             "zipcode4Company",
             "codeCountryCompany",
             "idCongressionalDistrictCompany",
             "slugEntityStructure",
             "stateIncorporation",
             "codeCountryIncorporation",
             "countBusinessTypes",
             "idNAICSPrimary",
             "countNAICS",
             "countProductServiceCodes",
             "emailPointOfContactGovt",
             "dataBusinessTypes",
             "dataDisasterResponse",
             "dataNAICS",
             "dataNAICSExceptions",
             "dataProductServiceCodes",
             "dataSBA",
             "urlFARPDF",
             "urlDFARPDF"
           ),
           snake_names = T) {

    if (snake_names) {
      data <- data %>% clean_names()
      duns_column <- make_clean_names(duns_column)
    }

    matched_duns <-
      data %>%
      distinct(!!sym(duns_column)) %>%
      filter(!is.na(!!sym(duns_column))) %>%
      pull()

    df_sam <- bulk_sam_entities()

    tbl_sam <-
      df_sam %>% filter(idDUNS %in% matched_duns) %>%
      select(one_of(sam_select_columns), matches("^data"))

    if (snake_names) {
      tbl_sam <- tbl_sam %>%
        clean_names()
      if (tbl_sam %>% hasName("data_naics")) {
        df_naics <-
          tbl_sam %>%
          select(id_duns, data_naics) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_naics = data)
        tbl_sam <- tbl_sam %>%
          select(-data_naics) %>%
          left_join(df_naics, by = "id_duns") %>%
          select(-matches("data"), everything())
      }

      if (tbl_sam %>% hasName("data_naics_exceptions")) {
        df_naics_ex <-
          tbl_sam %>%
          select(id_duns, data_naics_exceptions) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_naics_exceptions = data)
        tbl_sam <- tbl_sam %>%
          select(-data_naics_exceptions) %>%
          left_join(df_naics_ex, by = "id_duns") %>%
          select(-matches("data"), everything())
      }

      if (tbl_sam %>% hasName("data_disaster_response")) {
        df_response <-
          tbl_sam %>%
          select(id_duns, data_disaster_response) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_disaster_response = data)
        tbl_sam <- tbl_sam %>%
          select(-data_disaster_response) %>%
          left_join(df_response, by = "id_duns") %>%
          select(-matches("data"), everything())
      }

      if (tbl_sam %>% hasName("data_product_service_codes")) {
        df_psc <- tbl_sam %>%
          select(id_duns, data_product_service_codes) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_product_service_codes = data)
        tbl_sam <- tbl_sam %>%
          select(-data_product_service_codes) %>%
          left_join(df_psc, by = "id_duns") %>%
          select(-matches("data"), everything())
      }

      if (tbl_sam %>% hasName("data_business_types")) {
        df_bt <-
          tbl_sam %>%
          select(id_duns, data_business_types) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_business_types = data)
        tbl_sam <-
          tbl_sam %>%
          select(-data_business_types) %>%
          left_join(df_bt, by = "id_duns") %>%
          select(-matches("data"), everything())
      }

      if (tbl_sam %>% hasName("data_sba")) {
        df_sba <-
          tbl_sam %>%
          select(id_duns, data_sba) %>%
          unnest() %>%
          clean_names() %>%
          group_by(id_duns) %>%
          nest() %>%
          ungroup() %>%
          rename(data_sba = data)
        tbl_sam <-
          tbl_sam %>%
          select(-data_sba) %>%
          left_join(df_sba, by = "id_duns") %>%
          select(-matches("data"), everything())
      }
    }

    data_cols <-
      data %>% select(-matches(duns_column)) %>% names()

    if (length(data_cols) > 0) {
      remove_cols <-
        data_cols[data_cols %in% names(tbl_sam)]
      if (length(remove_cols) > 0) {
        data <-
          data %>% select(-one_of(remove_cols))
      }
    }

  data <-
    data %>%
    left_join(tbl_sam, by = duns_column)

  data

  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.