R/usa_spending_mungers.R

Defines functions .resolve_cage_codes add_department_codes .add_department_award_codes .add_department_funding_codes resolve_listed_duns .add_original_dates .resolve_parent_duns_listed .resolve_duns_listed .allocate_federal_accounts .add_agency_cgacs .add_analysis_contract .fix_sam_exceptions .fix_foreign_reference .fix_duns .add_dod_type .add_budget_year .guess_duns_type

Documented in add_department_codes resolve_listed_duns

.guess_duns_type <-
  function(data, metric = 5) {
    if (!data %>% hasName("idDUNS")) {
      return(data)
    }

    if (!data %>% hasName("idDUNSParent")) {
      return(data)
    }

    if (data %>% hasName("idDUNSAnalysis")) {
      return(data)
    }

    col_length <- data %>%
      dplyr::select(idDUNS, idDUNSParent, matches("^is")) %>%
      dplyr::select(idDUNS,
             idDUNSParent,
             matches("Government|NotForProfit|Federal|^isState")) %>%
      names() %>%
      length()

    if (col_length < 6) {
      return(data)
    }

    df_test <-
      data %>%
      dplyr::select(idDUNS, idDUNSParent, matches("^is")) %>%
      dplyr::select(-matches("FBO|Vendor|MissingDUNS")) %>%
      group_by(idDUNS, idDUNSParent) %>%
      dplyr::select(idDUNS,
             idDUNSParent,
             matches(
               "Government|NotForProfit|Federal|isState|College|School|University|Transit|International|isInternationalOrganization|Hospital|Airport|Institution|College"
             )) %>%
      summarise_all(sum) %>%
      ungroup() %>%
      gather(item, value, -c(idDUNS, idDUNSParent)) %>%
      group_by(idDUNS, idDUNSParent) %>%
      summarise(value = sum(value)) %>%
      ungroup()

    df_test <-
      df_test %>%
      mutate(has_parent = idDUNS != idDUNSParent) %>%
      mutate(idDUNSAnalysis = case_when(value <= metric ~ idDUNSParent,
                                        TRUE ~ idDUNS)) %>%
      dplyr::select(idDUNS, idDUNSParent, idDUNSAnalysis)

    data %>%
      left_join(df_test, by = c("idDUNS", "idDUNSParent")) %>%
      dplyr::select(one_of(names(df_test)), everything())

  }


.add_budget_year <- function(data) {
  if (!data %>% hasName("dateObligation")) {
    return(data)
  }

  data %>%
    mutate(
      yearBudget = case_when(
        lubridate::month(dateObligation) %>% as.numeric() >= 10 ~ lubridate::year(dateObligation) %>% as.numeric() + 1,
        TRUE ~ lubridate::year(dateObligation) %>% as.numeric()
      )
    )

}

.add_dod_type <-
  function(data) {
    has_columns <-
      data %>% hasName("codeDepartmentAward") &&
      data %>% hasName("codeDepartmentFunding")

    if (!has_columns) {
      return(data)
    }

    data <-
      data %>%
      mutate(
        typeDODAward = case_when(
          codeDepartmentAward == 97 & codeDepartmentFunding == 97 ~ "AF",
          codeDepartmentAward != 97 &
            codeDepartmentFunding == 97 ~ "F",
          codeDepartmentAward == 97 &
            codeDepartmentFunding != 97 ~ "A",
          codeDepartmentAward == 97 &
            is.na(codeDepartmentFunding) != 97 ~ "A"
        )
      )

    data
  }


.fix_duns <-
  function(data) {
    duns_names <-
      data %>% hasName("idDUNS") && data %>% hasName("idDUNSParent")
    if (!duns_names) {
      return(data)
    }

    data <-
      data %>%
      mutate_at(c("idDUNS", "idDUNSParent"), as.numeric) %>%
      mutate(
        idDUNS = case_when(idDUNS == 0 ~ NA_real_,
                           TRUE ~ idDUNS),
        idDUNSParent = case_when(idDUNSParent == 0 ~ NA_real_,
                                 TRUE ~ idDUNSParent),
        hasDUNSParent = !is.na(idDUNSParent),
        isMissingDUNS = is.na(idDUNS) & is.na(idDUNS),
        idDUNSParent = case_when(is.na(idDUNSParent) ~ idDUNS,
                                 TRUE ~ idDUNSParent)
      )

    data
  }

.fix_foreign_reference <-
  function(data) {
    has_code <- data %>% hasName("codeForeignFunding")
    has_type <- data %>% hasName("typeForeignFunding")

    if (!has_code | !has_type) {
      return(data)
    }

    data %>%
      mutate(
        typeForeignFunding = case_when(
          codeForeignFunding == "A" ~ "FOREIGN FUNDS FMS",
          codeForeignFunding == "B" ~ "FOREIGN FUNDS NON-FMS",
          typeForeignFunding == "N" ~ "NOT APPLICABLE",
          codeForeignFunding %in% c("X", "N") ~ "NOT APPLICABLE",
          TRUE ~ typeForeignFunding
        ),
        codeForeignFunding = case_when(
          typeForeignFunding == "NOT APPLICABLE" ~ "N",
          typeForeignFunding == "FOREIGN FUNDS NON-FMS" ~ "B",
          typeForeignFunding == "FOREIGN FUNDS FMS" ~  "A",
          TRUE ~ codeForeignFunding
        )
      )

  }

.fix_sam_exceptions <-
  function(data) {
    has_code <- data %>% hasName("typeSAMException")
    has_type <- data %>% hasName("idSAMException")

    if (!has_code | !has_type) {
      return(data)
    }

    data <- data %>%
      mutate(
        typeSAMException = case_when(
          typeSAMException == "C" ~ "CLASSIFIED CONTRACTS",
          typeSAMException == "A" ~ "AWARDS TO FOREIGN VENDORS FOR WORK PERFORMED OUTSIDE THE UNITED STATES",
          typeSAMException == "G" ~ "GOVERNMENT - WIDE COMMERCIAL PURCHASE CARD",
          typeSAMException == "M" ~ "MICRO-PURCHASES THAT DO NOT USE THE EFT",
          TRUE ~ typeSAMException
        )
      )

    data <- data %>%
      mutate(
        idSAMException = case_when(
          typeSAMException %in% c(
            "GOVERNMENT - WIDE COMMERCIAL PURCHASE CARD",
            "GOVERNMENT-WIDE COMMERCIAL PURCHASE CARD"
          ) ~ 1,
          typeSAMException == "CLASSIFIED CONTRACTS" ~ 2,
          typeSAMException == "CONTRACTING OFFICERS DEPLOYED IN THE COURSE OF MILITARY OPERATIONS" ~ 3,
          typeSAMException == "CONTRACTING OFFICERS CONDUCTING EMERGENCY OPERATIONS" ~ 4,
          typeSAMException == "CONTRACTS TO SUPPORT UNUSUAL OR COMPELLING NEEDS" ~ 5,
          typeSAMException == "AWARDS TO FOREIGN VENDORS FOR WORK PERFORMED OUTSIDE THE UNITED STATES" ~ 6,
          typeSAMException == "MICRO-PURCHASES THAT DO NOT USE THE EFT" ~ 7,
          TRUE ~ idSAMException
        )
      )
  }


.add_analysis_contract <- function(data) {
  if (data %>% hasName("idContractAnalysis")) {
    return(data)
  }

  has_contract <- data %>% hasName("idContract")
  has_parent <-
    data %>% hasName("idContractParent") |
    data %>% hasName("idContractIDV")

  if (!has_contract | !has_parent) {
    if (data %>% hasName("idContract")) {
      data <- data %>%
        mutate(idContractAnalysis = idContract) %>%
        dplyr::select(idContractAnalysis, everything())
    }

    return(data)
  }

  if (!data %>% hasName("idContractParent")) {
    data <- data %>%
      rename(idContractParent = idContractIDV)
  }
  data <- data %>%
    mutate(
      countCharsContract = nchar(idContract),
      countCharsParent = nchar(idContractParent),
      lettersContract = idContract %>% str_count("[A-Z]"),
      idContractAnalysis = case_when(
        countCharsContract <= 5 &
          !is.na(idContractParent) ~ idContractParent,
        is.na(idContractParent) ~ idContract,
        !is.na(idContractParent) &
          lettersContract <= 1 ~ idContractParent,
        TRUE ~ idContract
      )
    ) %>%
    mutate(
      typeContractIDAnalysis = case_when(idContractAnalysis == idContractParent ~ "IDV",
                                         TRUE ~ "Contract")
    ) %>%
    mutate(idContractAnalysis = case_when(
      is.na(idContractAnalysis) ~ idContractParent,
      TRUE ~ idContractAnalysis
    )) %>%
    dplyr::select(idContract, idContractAnalysis , everything()) %>%
    dplyr::select(-c(countCharsContract, countCharsParent, lettersContract))

  data <- data %>%
    rename(idContractIDV = idContractParent)
  data
}

.add_agency_cgacs <-
  function(data) {

    cleaned_names <- data %>% clean_names() %>% names()
    needs_agency_cgac_funding <- str_detect(cleaned_names, "name_agency_cgac_funding") %>% sum(na.rm = T) == 0

    needs_agency_cgac_award <- str_detect(cleaned_names, "name_agency_cgac_award") %>% sum(na.rm = T) == 0

    needs_agency_cgac_idv <- str_detect(cleaned_names, "name_cgac_agency_idv") %>% sum(na.rm = T) == 0
    none_needed <- needs_agency_cgac_idv + needs_agency_cgac_award + needs_agency_cgac_funding == 0
    if (none_needed) {
      return(data)
    }

    df_cgac <-
      .cgac_codes()

   if (needs_agency_cgac_funding)  {
    if (data %>% hasName("idAgencyFunding") &
        !data %>% hasName("idCGACFunding")) {

      df_agencies <-
        data %>%
        dplyr::select(idAgencyFunding) %>%
        distinct() %>%
        filter(!is.na(idAgencyFunding)) %>%
        mutate(idCGACFunding = case_when(
          !is.na(idAgencyFunding) ~ idAgencyFunding %>% substr(1, 2) %>% as.numeric(),
          TRUE ~ NA_real_
        )) %>%
        left_join(
          df_cgac %>% dplyr::select(
            idCGACFunding = idCGAC,
            nameAgencyCGACFunding = nameAgencyCGAC,
            slugCGACFunding = slugCGAC
          ),
          by = "idCGACFunding"
        ) %>%
        distinct()

      data <-
        data %>%
        left_join(df_agencies, by = "idAgencyFunding")
    }

   }

    if (needs_agency_cgac_award) {
      if (data %>% hasName("idAgencyAward") &
          !data %>% hasName("idCGACAward")) {
        df_agencies <-
          data %>%
          dplyr::select(idAgencyAward) %>%
          distinct() %>%
          filter(!is.na(idAgencyAward)) %>%
          mutate(idCGACAward = case_when(
            !is.na(idAgencyAward) ~ idAgencyAward %>% substr(1, 2) %>% as.numeric(),
            TRUE ~ NA_real_
          ))

        df_ids <-
          df_cgac %>% dplyr::select(
            idCGACAward = idCGAC,
            nameAgencyCGACAward = nameAgencyCGAC,
            slugCGACAward = slugCGAC
          ) %>%
          filter(idCGACAward %in% (df_agencies$idCGACAward %>% unique())) %>%
          group_by(idCGACAward) %>%
          dplyr::slice(1) %>%
          ungroup()

        df_agencies <-
          df_agencies %>%
          left_join(df_ids,
                    by = "idCGACAward") %>%
          distinct()

        data <- data %>%
          left_join(df_agencies, by  = "idAgencyAward")
      }
    }

    if (needs_agency_cgac_idv) {
    if (data %>% hasName("idAgencyAwardIDV") &
        !data %>% hasName("idCGACIDV")) {

      df_agencies <-
        data %>%
        dplyr::select(idAgencyAwardIDV) %>%
        distinct() %>%
        filter(!is.na(idAgencyAwardIDV)) %>%
        mutate(
          idCGACAgencyIDV = case_when(
            !is.na(idAgencyAwardIDV) ~ idAgencyAwardIDV %>% substr(1, 2) %>% as.numeric(),
            TRUE ~ NA_real_
          )
        ) %>%
        left_join(
          df_cgac %>% dplyr::select(
            idCGACAgencyIDV = idCGAC,
            nameCGACAgencyIDV = nameAgencyCGAC,
            slugCGACAgencyIDV = slugCGAC
          ),
          by = "idCGACAgencyIDV"
        ) %>%
        distinct()

      data <-
        data %>%
        left_join(df_agencies, by = "idAgencyAwardIDV")


    }
    }
    data
  }

.allocate_federal_accounts <-
  function(data) {

    if (!data %>% hasName("dateObligation")) {
      return(data)
    }
    if (data %>% hasName("idFederalAccounts")) {
      data <-
        data %>%
        rename(idFederalAccount = idFederalAccounts)
    }

    if (!data %>% hasName("idFederalAccount")) {
      return(data)
    }

    if (!data %>% hasName("idContractAnalysis")) {
      return(data)
    }

    if (data %>% hasName("idFederalAccountResolved")) {
      return(data)
    }

    df_accounts <-
      data %>%
      dplyr::select(idContractAnalysis, idFederalAccount, amountObligation) %>%
      filter(!is.na(idContractAnalysis)) %>%
      filter(!is.na(idFederalAccount)) %>%
      mutate(countAccounts = idFederalAccount %>% str_count("\\;") + 1)

    df_account_resolved <-
      df_accounts %>% dplyr::select(idContractAnalysis, idFederalAccount) %>%
      separate_rows(idFederalAccount, sep = "\\;") %>%
      group_by(idContractAnalysis) %>%
      summarise(
        idFederalAccountResolved = unique(idFederalAccount) %>% sort() %>% str_c(collapse = " | "),
        countFederalAccounts = n_distinct(idFederalAccount)
      ) %>%
      ungroup()

    df_transactions <-
      data %>%
      dplyr::select(dateObligation,
             idContractAnalysis,
             idFederalAccount,
             amountObligation) %>%
      left_join(df_account_resolved, by = "idContractAnalysis") %>%
      dplyr::select(dateObligation,
             idFederalAccount,
             idFederalAccountResolved,
             everything()) %>%
      mutate(
        countAccounts = idFederalAccount %>% str_count("\\;") + 1,
        countAccountsResolved = idFederalAccountResolved %>% str_count("\\|") + 1,
        amountObligationAllocated = case_when(
          is.na(idFederalAccount) &
            is.na(idFederalAccountResolved) ~ amountObligation,
          !is.na(idFederalAccount) ~ (amountObligation * (1 / countAccounts)),
          TRUE ~ (amountObligation * (1 / countAccountsResolved))
        )
      )

    df_transactions <-
      df_transactions %>%
      dplyr::select(
        dateObligation,
        idContractAnalysis,
        amountObligation,
        idFederalAccountResolved,
        amountObligationAllocated
      ) %>%
      mutate(hasFederalAccount = !is.na(idFederalAccountResolved))

    data <-
      data %>%
      left_join(df_transactions,
                by = c("idContractAnalysis", "dateObligation", "amountObligation")) %>%
      distinct()

    data

  }

.resolve_duns_listed <-
  function(data, exclude_bloat = F) {

    if (exclude_bloat) {
      return(data)
    }

    if (!data %>% hasName("nameVendor")) {
      return(data)
    }

    if (!data %>% hasName("idDUNS")) {
      return(data)
    }

    if (data %>% hasName("namesVendorListed")) {
      return(data)
    }

    if (data %>% filter(!is.na(idDUNS)) %>% filter(!is.na(nameVendor)) %>% nrow() == 0) {
      return(data)
    }

    data <- data %>%
      dplyr::select(-one_of(
        "countVendorsListed",
        "hasMultipleVendors",
        "namesVendorListed"
      )) %>%
      suppressWarnings()



    df_duns <-
       data %>% count(idDUNS, nameVendor, name = "count", sort = T)

    df_duns <-
      df_duns %>%
      group_by(idDUNS) %>%
      summarise(namesVendorListed = unique(nameVendor) %>% str_c(collapse = " | ")) %>%
      ungroup() %>%
      mutate(
        countVendorsListed =  namesVendorListed %>% str_count("\\|") + 1,
        hasMultipleVendors = countVendorsListed > 1
      ) %>%
      ungroup()

    data <- data %>%
      left_join(df_duns, by = "idDUNS")

    data
  }

.resolve_parent_duns_listed <-
  function(data, exclude_bloat = F) {

    if (exclude_bloat) {
      return(data)
    }

    if (!data %>% hasName("nameVendorParent")) {
      return(data)
    }

    if (!data %>% hasName("idDUNSParent")) {
      return(data)
    }

    if (data %>% hasName("namesVendorParentListed")) {
      return(data)
    }

    data <- data %>%
      dplyr::select(
        -one_of(
          "countVendorsParentListed",
          "hasMultipleParentVendors",
          "namesVendorParentListed"
        )
      ) %>%
      suppressWarnings()

    if (data %>% filter(!is.na(idDUNSParent)) %>% filter(!is.na(nameVendorParent)) %>% nrow() == 0) {
      return(data)
    }

    df_duns <-
      data %>% count(idDUNSParent,
                     nameVendorParent,
                     name = "count",
                     sort = T) %>%
      group_by(idDUNSParent) %>%
      summarise(namesVendorParentListed = unique(nameVendorParent) %>% str_c(collapse = " | ")) %>%
      ungroup() %>%
      mutate(
        countVendorsParentListed =  namesVendorParentListed %>% str_count("\\|") + 1,
        hasMultipleParentVendors = countVendorsParentListed > 1
      ) %>%
      ungroup()

    data <- data %>%
      left_join(df_duns, by = "idDUNSParent")
    rm(df_duns)
    gc()
    data
  }

.add_original_dates <-
  function(data) {
    if (!data %>% hasName("idContractAnalysis")) {
      return(data)
    }

    if (!data %>% hasName("dateObligation")) {
      return(data)
    }

    if (!data %>% hasName("dateContractCompletionCurrent")) {
      return(data)
    }

    if (!data %>% hasName("amountBaseAllOptionTotal")) {
      return(data)
    }

    if (!data %>% hasName("amountBaseAllOptionTotalOriginal")) {
      return(data)
    }

    df_first_dates <-
      data %>%
      group_by(idContractAnalysis) %>%
      filter(dateObligation == min(dateObligation)) %>%
      ungroup() %>%
      group_by(idContractAnalysis) %>%
      summarise(
        amountBaseAllOptionTotalOriginal = max(amountBaseAllOptionTotal),
        dateContractCompletionOriginal = max(dateContractCompletionCurrent)
      ) %>%
      ungroup()

    data <- data %>%
      left_join(df_first_dates, by = c("idContractAnalysis")) %>%
      mutate(
        hasChangedBaseAmount = amountBaseAllOptionTotalOriginal != amountBaseAllOptionTotal,
        hasChangedCompletionDate = dateContractCompletionCurrent != dateContractCompletionOriginal
      )

    data
  }

#' Resolve DUNS numbers for listed data
#'
#' Attempts to resolve for DUNS and Parent DUNS when listed within a tibble
#'
#' @param data
#'
#' @return
#' @export
#'
#' @examples
resolve_listed_duns <-
  function(data, exclude_bloat = F) {

    data <-
      .resolve_duns_listed(data = data, exclude_bloat = exclude_bloat)

    data <-
      .resolve_parent_duns_listed(data = data, exclude_bloat = exclude_bloat)

    data <-
      .resolve_cage_codes(data = data)

    data
  }

.add_department_funding_codes <-
  function(data) {
    if (data %>% hasName("nameDepartmentFunding")) {
      return(data)
    }

    if (!data %>% hasName("idAgencyFunding")) {
      return(data)
    }

    df_fpds_offices <- .fpds_departments()

    df_agencies <-
      data %>%
      filter(!is.na(idAgencyFunding)) %>%
      distinct(idAgencyFunding)

    df_agencies <- df_agencies %>%
      left_join(
        df_fpds_offices %>% distinct(
          idAgencyFunding = idAgency,
          idDepartmentFunding = idDepartment,
          nameDepartmentFunding = nameDepartment
        ),
        by = "idAgencyFunding"
      ) %>%
      group_by(idAgencyFunding) %>%
      dplyr::slice(1) %>%
      ungroup() %>%
      mutate(
        idDepartmentFunding = case_when(idAgencyFunding == "9700" ~ "9700",
                                        TRUE ~ idDepartmentFunding),
        nameDepartmentFunding = case_when(
          idAgencyFunding == "9700" ~ "DEPARTMENT OF DEFENSE",
          TRUE ~ nameDepartmentFunding
        )
      ) %>%
      mutate(codeDepartmentFunding = idDepartmentFunding %>% substr(1, 2)  %>% as.numeric())

    data <-
      data %>%
      left_join(df_agencies, by = "idAgencyFunding")

    data
  }

.add_department_award_codes <-
  function(data) {
    if (data %>% hasName("nameDepartmentAward")) {
      return(data)
    }

    if (!data %>% hasName("idAgencyAward")) {
      return(data)
    }

    df_fpds_offices <- .fpds_departments()

    df_agencies <-
      data %>%
      select(idAgencyAward) %>%
      filter(!is.na(idAgencyAward)) %>%
      distinct()

    df_offices <-
      df_fpds_offices %>% distinct(
        idAgencyAward = idAgency,
        idDepartmentAward = idDepartment,
        nameDepartmentAward = nameDepartment
      ) %>%
      filter(idAgencyAward %in% (df_agencies$idAgencyAward))
    df_agencies <-
      df_agencies %>%
      left_join(df_offices,
                by = "idAgencyAward") %>%
      group_by(idAgencyAward) %>%
      dplyr::slice(1) %>%
      ungroup() %>%
      mutate(
        idDepartmentAward = case_when(idAgencyAward == "9700" ~ "9700",
                                      TRUE ~ idDepartmentAward),
        nameDepartmentAward = case_when(
          idAgencyAward == "9700" ~ "DEPARTMENT OF DEFENSE",
          TRUE ~ nameDepartmentAward
        )
      ) %>%
      mutate(codeDepartmentAward = idDepartmentAward %>% substr(1, 2)  %>% as.numeric())

    data <-
      data %>%
      left_join(df_agencies, by = "idAgencyAward")

    data
  }

#' Adds missing FPDS department award and funding codes
#'
#' @param data
#'
#' @return
#' @export
#'
#' @examples
add_department_codes <-
  function(data) {
    data <-
      .add_department_award_codes(data = data)
    data <-
      .add_department_funding_codes(data = data)

    data
  }

.resolve_cage_codes <-
  function(data) {
    if (!data %>% hasName("cageVendor")) {
      return(data)
    }

    if (data %>% hasName("idDUNS")) {
      return(data)
    }

    df_cage <- data %>%
      dplyr::select(idDUNS, cageVendor) %>%
      filter(!is.na(cageVendor)) %>%
      group_by(idDUNS) %>%
      summarise(cageVendorResolved = unique(cageVendor) %>% sort() %>% str_c(sep = " | ")) %>%
      ungroup() %>%
      mutate(countCAGEResolved = cageVendorResolved %>% str_count("\\ | ") + 1)

    data <-
      data %>%
      left_join(df_cage, by = "idDUNS")

    data
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.