R/ppp.R

Defines functions .ppp_foia_url ppp_foia .parse_ppp_folders .munge_ppp_names .dictionary_ppp_names

Documented in ppp_foia

.dictionary_ppp_names <-
  function() {
    tibble(nameSBA = c("LoanRange", "BusinessName", "Address", "City", "State", "Zip",
                       "NAICSCode", "BusinessType", "RaceEthnicity", "Gender", "Veteran",
                       "NonProfit", "JobsRetained", "DateApproved", "Lender", "CD",
                       "LoanAmount", "nameFile"),
           nameActual = c("rangeLoan", "nameEntitySBA", "addressStreetEntity", "cityEntity", "stateEntity", "zipcodeEntity",
                          "idNAICS", "typeBusiness", "typeEthnicity", "typeGenderPrincipal", "isVeteranOwned",
                          "isNonProfit", "countJobsRetained", "dateApprovalPPP", "nameLenderSBA", "codeCongressionalDistrict",
                          "amountLoan", "nameFile")

    )

  }

.munge_ppp_names <-
  function(data) {
    dict_names <- .dictionary_ppp_names()
    ppp_names <-
      names(data)

    actual_names <-
      ppp_names %>%
      map_chr(function(x) {
        df_row <-
          dict_names %>%
          filter(nameSBA == x) %>%
          distinct() %>%
          slice(1)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {x}") %>% message()
          return(x)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }

.parse_ppp_folders <-
  function(folder) {
    options(scipen = 9999999)
    folders <-
      list.files() %>%
      discard(function(x) {
        x %>% str_detect("pdf")
      })

    data <-
      folders %>%
      map_dfr(function(x) {
        file <- dir(x) %>%
          keep(function(x) {
            x %>% str_detect("csv")
          })
        glue("{x}/{file}") %>%
          read_csv(
            col_types = cols(
              LoanRange = col_character(),
              BusinessName = col_character(),
              Address = col_character(),
              City = col_character(),
              State = col_character(),
              Zip = col_character(),
              NAICSCode = col_double(),
              BusinessType = col_character(),
              RaceEthnicity = col_character(),
              Gender = col_character(),
              Veteran = col_character(),
              NonProfit = col_character(),
              JobsRetained = col_double(),
              DateApproved = col_character(),
              Lender = col_character(),
              CD = col_character()
            )
          ) %>%
          mutate(nameFile = x) %>%
          select(nameFile, everything())
      }) %>%
      as_tibble()

    data <-
      data %>%
      .munge_ppp_names()

    data <- data %>%
      mutate(isLoanEstimate = !is.na(rangeLoan)) %>%
      select(nameFile,
             isLoanEstimate,
             nameEntitySBA,
             rangeLoan,
             amountLoan,
             everything())

    data <-
      data %>%
      mutate(rangeLoan = rangeLoan %>% str_replace("\\ ", "\\|")) %>%
      separate(rangeLoan,
               into = c("codeRangeLoan", "rangeLoan"),
               sep = "\\|") %>%
      separate(
        codeCongressionalDistrict,
        into = c("stateCongressionalDistrict", "numberCongressionalDistrict"),
        sep = "\\ - ",
        convert = F,
        remove = F
      )

    data <-
      data %>%
      mutate(
        rangeLoan = case_when(
          rangeLoan == "$1-2 million" ~ "1.00M-2.00M",
          rangeLoan == "$150,000-350,000" ~ ".150M-.350M",
          rangeLoan == "$2-5 million" ~ "2.000M-5.000M",
          rangeLoan == "$350,000-1 million" ~ ".350M-1.000M",
          rangeLoan == "$5-10 million" ~ "5.000M-10.000M",
          TRUE ~ "0.001M-0.150M"
        ) %>% forcats::as_factor()
      )

    data <- data %>%
      mutate(
        typeBusiness = case_when(
          typeBusiness == "Limited  Liability Company(LLC)" ~ "Limited Liability Company",
          typeBusiness == "Corporation" ~ "C Corporation",
          typeBusiness == "Subchapter S Corporation" ~ "S Corporation",
          typeBusiness %>% str_detect("Employee Stock Ownership Plan") ~ "Employee Stock Ownership Plan",
          typeBusiness %>% str_detect("Rollover as Business") ~ "Rollover as Business Startup",
          typeBusiness == "Non-Profit Organization" ~ "Non Profit Organization",
          typeBusiness == "Self-Employed Individuals" ~ "Self Employed Individuals",
          typeBusiness == "Non-Profit Childcare Center" ~ "Non Profit Childcare Center",
          typeBusiness %>% str_detect("Independent Contractors") ~ "Independent Contractor",
          is.na(typeBusiness) ~ "Unknown",
          TRUE ~ typeBusiness
        )
      )

    data <- data %>%
      mutate(
        hasEntityName = !is.na(nameEntitySBA),
        codeRangeLoan = codeRangeLoan %>% coalesce("F") %>% str_to_upper()
      ) %>%
      select(nameFile, nameEntitySBA, everything())

    data <- data %>%
      mutate(
        isSBAIndividual = typeBusiness %>% str_detect("Individual|Contractor"),
        isSBACorporation = typeBusiness %>% str_detect("Corporation"),
        isSBAPartnership = typeBusiness %>% str_detect("Partnership|Proprietorship"),
        isSBANonProfit = typeBusiness %>% str_detect("Non Profit"),
        isSBAEntity = !typeBusiness %>% str_detect("Unknown|Self Employ")
      )

    tbl_ranges <- data %>%
      distinct(codeRangeLoan, rangeLoan) %>%
      filter(codeRangeLoan != "F") %>%
      mutate(
        amountLoanMinimum = case_when(
          codeRangeLoan == "A" ~ 5000000,
          codeRangeLoan == "B" ~ 2000000,
          codeRangeLoan == "C" ~ 1000000,
          codeRangeLoan == "D" ~ 350000,
          codeRangeLoan == "E" ~ 150000
        ),
        amountLoanMaximum = case_when(
          codeRangeLoan == "A" ~ 10000000,
          codeRangeLoan == "B" ~ 5000000 -
            1,
          codeRangeLoan == "C" ~ 2000000 -
            1,
          codeRangeLoan == "D" ~ 1000000 -
            1,
          codeRangeLoan == "E" ~ 3500000 - 1
        ),
        amountLoanMean = (amountLoanMinimum + (amountLoanMaximum + 1)) / 2
      ) %>%
      select(-rangeLoan)

    data <- data %>%
      left_join(tbl_ranges, by = "codeRangeLoan") %>%
      mutate(
        amountLoanMean = case_when(is.na(amountLoanMean) ~ amountLoan,
                                   TRUE ~ amountLoanMean),
        amountLoanMinimum = case_when(is.na(amountLoanMinimum) ~ amountLoan,
                                      TRUE ~ amountLoanMinimum),
        amountLoanMaximum = case_when(is.na(amountLoanMaximum) ~ amountLoan,
                                      TRUE ~ amountLoanMaximum)
      )

    data <- data %>%
      mutate(
        dateApprovalPPP = mdy(dateApprovalPPP),
        isNonProfit = isNonProfit == "Y",
        isVeteranOwned = case_when(
          isVeteranOwned == "Non-Veteran" ~ FALSE,
          isVeteranOwned == "Veteran" ~ TRUE,
          TRUE ~ NA
        ),
        isJobsCountError = countJobsRetained < 0,
        countJobsRetainedCleaned = case_when(
          !is.na(countJobsRetained) ~ abs(countJobsRetained),
          TRUE ~ countJobsRetained
        )
      )

    data <- data %>%
      mutate(
        typeGenderPrincipal = case_when(
          typeGenderPrincipal == "Female Owned" ~ "Female",
          typeGenderPrincipal == "Male Owned" ~ "Male",
          TRUE ~ "Unstated"
        ),
        typeEthnicity = case_when(
          typeEthnicity %>% str_detect("American Indian") ~ "American Indian",
          typeEthnicity %>% str_detect("Eskimo") ~ "Eskimo",
          typeEthnicity %>% str_detect("Black") ~ "Black",
          typeEthnicity == "Unanswered" ~ "Unstated",
          typeEthnicity == "Multi Group" ~ "Multiracial",
          TRUE ~ typeEthnicity
        )
      )

    data <- data %>%
      mutate_if(is.character,
                list(function(x) {
                  case_when(x == "" ~ NA_character_,
                            TRUE ~ x)
                }))

    data <-
      data %>%
      mutate_if(is.character,
                list(function(x) {
                  x %>% str_to_upper() %>% stri_enc_toutf8() %>% str_squish()
                }))

    data <- data %>%
      mutate(
        amountLoanPerEmployee = case_when(
          is.na(amountLoan) ~ NA_real_,
          amountLoan == 0 ~ NA_real_,
          countJobsRetainedCleaned == 0 ~ NA_real_,
          TRUE ~ amountLoan / countJobsRetainedCleaned
        ),
        amountLoanMeanPerEmployee = case_when(
          is.na(amountLoanMean) ~ NA_real_,
          amountLoanMean == 0 ~ NA_real_,
          countJobsRetainedCleaned == 0 ~ NA_real_,
          TRUE ~ amountLoanMean / countJobsRetainedCleaned
        ),
        amountLoanMinPerEmployee = case_when(
          is.na(amountLoanMinimum) ~ NA_real_,
          amountLoanMinimum == 0 ~ NA_real_,
          countJobsRetainedCleaned == 0 ~ NA_real_,
          TRUE ~ amountLoanMinimum / countJobsRetainedCleaned
        ),
        amountLoanMaxPerEmployee = case_when(
          is.na(amountLoanMaximum) ~ NA_real_,
          amountLoanMaximum == 0 ~ NA_real_,
          countJobsRetainedCleaned == 0 ~ NA_real_,
          TRUE ~ amountLoanMaximum / countJobsRetainedCleaned
        )
      )

    data <-
      data %>%
      mutate(nameEntity = nameEntitySBA,
             nameLender = nameLenderSBA)


    data <- data %>%
      mutate_at(vars(nameEntity, nameLender), list(function(x) {
        x %>%  str_replace_all(
          "^_|^, |^:|^\\.|^\\,|\\'|\\,$|\\{|\\#|\\*|\\.|\\,|\\(|\\)|\\^-|\\@|\\[|\\]|^&|^+|^%|^%-|^-",
          ""
        ) %>% str_squish()
      })) %>%
      mutate(nameEntity = case_when(nameEntity == "-" ~ NA_character_,
                                    TRUE ~ nameEntity))

    data <- data %>%
      mutate(
        nameLender = nameLender %>% str_remove_all("NATIONAL ASSOCIATION") %>% str_replace_all("D/B/A", "\\ | ")
      ) %>%
      separate(
        nameLender,
        into = c("nameLender", "nameLenderDBA"),
        sep = "\\|",
        extra = "merge",
        fill  = "right"
      ) %>%
      mutate_if(is.character, str_squish)

    data <- data %>%
      mutate(hasLenderDBA = !is.na(nameLenderDBA))

    dict_naics <-
      dictionary_naics_codes() %>%
      select(idNAICS:nameIndustryNAICS) %>%
      group_by(idNAICS) %>%
      slice(1) %>%
      ungroup()

    data <- data %>%
      mutate(id = 1:n()) %>%
      left_join(dict_naics, by = "idNAICS") %>%
      distinct() %>%
      group_by(id) %>%
      slice(1) %>%
      ungroup() %>%
      select(-id)



    data <- data %>%
      entities::refine_columns(entity_columns = c("nameLender", "nameEntity"))

    data <- data %>%
      mutate(
        cityEntity = cityEntity %>% str_replace_all("^_|^, |^-|^:|^\\.|^\\,|^\\'|\\,$|\\{|\\#|\\*|`", "")
      ) %>%
      entities::refine_columns(entity_columns = c("cityEntity")) %>%
      select(-c(cityEntity, slugSoundexcityEntity)) %>%
      rename(cityEntity = cityEntityClean) %>%
      select(one_of(names(data)), everything())


    data <- data %>%
      mutate(stateEntity = case_when(stateEntity == "FI" ~ "FL",
                                     TRUE ~ stateEntity))
    data <-
      data %>%
      build_address()

    data <- data %>%
      mutate(isLLC = nameEntity %>% str_detect("LLC"),
             isINC = nameEntitySBA %>% str_detect("INC$"))
    data

  }

#' PPP Data
#'
#' FOIA PPP data
#'
#' @param unformat
#' @param snake_names
#' @param use_local
#'
#' @return
#' @export
#'
#' @examples
ppp_foia <-
  function(unformat = F,
           snake_names = T, use_local = F) {

    file_nos <- 1:10

    if (use_local) {
      oldwd <- getwd()
      setwd("~")
      urls <- glue(
        "Desktop/abresler.github.io/r_packages/govtrackR/data/ppp/ppp/ppp_{file_nos}.rda"
      )
    } else {
      urls <- glue("https://asbcllc.com/r_packages/govtrackR/data/ppp/ppp/ppp_{file_nos}.rda")
    }


    data <-
      urls %>%
      map_dfr(function(url){
        read_rda(file = url)
      })

    if (!unformat) {
      amt_names <- data %>% select(matches("amount")) %>% names()
      data <-
        data %>%
        mutate_at(amt_names,
                  list(function(x){
                    currency(x, digits = 0)
                  }))
    }

    data <- data %>%
      select(matches("nameFile|nameEntity|nameLender"), everything())

    if (snake_names) {
      data <- data %>% janitor::clean_names()
    }

    if (use_local) {
      if (getwd() != oldwd) {
        setwd(oldwd)
      }
    }
    data

  }

.ppp_foia_url <-
  function() {
    file_nos <- 1:10
    urls <- glue("https://asbcllc.com/r_packages/govtrackR/data/ppp/ppp/ppp_{file_nos}.rda")

    data <-
      urls %>%
      map_dfr(function(url){
        read_rda(file = url)
      })

    amt_names <- data %>% select(matches("amount")) %>% names()

    data <-
      data %>%
      mutate_at(amt_names,
                list(function(x){
                  currency(x, digits = 0)
                })) %>%
      mutate(nameAgencyParent = "NATIONAL SCIENCE FOUNDATION")

    name_cols <- data %>% select(matches("name")) %>% names()

    data <- data %>%
      mutate_at(name_cols, str_to_upper)

    data <- data %>%
      separate(
        nameDivisionFunding,
        extra = "merge",
        into = c("nameDivisionFunding", "slugDivisionFunding"),
        sep = "\\("
      ) %>%
      mutate(slugDivisionFunding = slugDivisionFunding %>% str_remove_all("\\)")) %>%
      mutate_if(is.character, str_squish)

    data <- data %>%
      mutate(nameAgencyParent = "NATIONAL SCIENCES FOUNDATION")

    data

  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.