R/fpds_bulk.R


# sql ---------------------------------------------------------------------

## https://aws.amazon.com/blogs/aws/new-usaspending-gov-on-an-amazon-rds-snapshot/

# urls --------------------------------------------------------------------



.fpds_years <- function(url = "https://www.fpds.gov/ddps/directory_browser/index.php") {
  page <- url %>% read_html()
  links <- page %>% html_nodes("td a")
  urls <-
    links %>%
    html_attr("href") %>%
    discard(function(x) {x %>% str_detect("/ddps")}) %>%
    str_c("https://www.fpds.gov/ddps/directory_browser/index.php", .)
  years <-
    links %>%
    html_text() %>%
    str_trim() %>%
    discard(function(x) {x == ""})

  tibble(years, urlFPDSYear = urls) %>%
    separate(years, sep = "\\-", into = c("yearFiscal", "versionFPDS"), extra = "merge") %>%
    mutate(yearFiscal = yearFiscal %>% str_replace_all("FY", "20") %>% as.numeric())
}
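
# Usage sketch (untested): .fpds_years() scrapes the archive directory index and
# should return one row per fiscal-year directory, with columns yearFiscal,
# versionFPDS, and urlFPDSYear.
# df_years <- .fpds_years()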

.parse_fpds_year <-
  function(url = "https://www.fpds.gov/ddps/directory_browser/index.php?somepath=../FY05-V1.4", return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }
    page <- url %>% read_html()
    links <- page %>% html_nodes("td a")
    urls <-
      links %>%
      html_attr("href") %>%
      discard(function(x) {x %>% str_detect("/ddps")}) %>%
      str_c("https://www.fpds.gov/ddps/directory_browser/index.php", .)
    offices <-
      links %>%
      html_text() %>%
      str_trim() %>%
      discard(function(x) {x == ""})

    tibble(offices, urlOfficeYear = urls) %>%
      separate(offices,
               sep = "\\-",
               extra = "merge",
               into = c("idOffice", "slugOffice")) %>%
      mutate(
        slugOffice = ifelse(is.na(slugOffice), idOffice, slugOffice),
        idOffice = ifelse(slugOffice == idOffice, NA_character_, idOffice)
      ) %>%
      mutate(urlFPDSYear = url)
  }
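
# Usage sketch (untested): the default URL points at the FY05-V1.4 directory.
# The result lists each contracting office for that year with idOffice,
# slugOffice, and urlOfficeYear; those office URLs can then be passed to
# .dl_fpds_xml() below.
# df_offices <- .parse_fpds_year()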

#' FPDS Archive XML Links
#'
#' Acquires all links to the FPDS bulk XML archive directories, by fiscal year and contracting office
#'
#' @param join_office_features if \code{TRUE} joins agency and office features for each matching office id
#' @param return_message if \code{TRUE} returns messages
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
#' fpds_xml_urls()
fpds_xml_urls <-
  function(join_office_features = T, return_message = T) {
    df_years <- .fpds_years()
    .parse_fpds_year_safe <- possibly(.parse_fpds_year, tibble())

    all_data <-
      1:nrow(df_years) %>%
      map_dfr(function(x){
        df_row <- df_years %>% dplyr::slice(x)
        fy <- df_row$yearFiscal
        url <- df_row$urlFPDSYear
        if (return_message) {
          glue("Acquiring all FPDS agency urls for {fy}") %>% message()
        }
        data <- .parse_fpds_year_safe(url = url, return_message = return_message)
        data %>%
          mutate(yearFiscal = fy) %>%
          select(yearFiscal, everything())
      })

    if (join_office_features) {
      dict_agencies <- dictionary_government_agencies()
      d <-
        dict_agencies %>%
        select(idOffice, nameAgency, nameOffice, idDepartment,
               descriptionMission, url, urlCongressionalJustification)
      all_data <- all_data %>%
        left_join(d, by = "idOffice") %>%
        select(yearFiscal, idOffice, all_of(names(d)), everything())
    }

    all_data
  }
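
# Usage sketch (untested, network-heavy): crawl every archive year and keep the
# Peace Corps links for FY2019. The idOffice value "1145" is taken from the
# default URL used by .dl_fpds_xml() below and is illustrative only.
# df_urls <- fpds_xml_urls(join_office_features = FALSE)
# urls_peace_corps_2019 <- df_urls %>%
#   filter(yearFiscal == 2019, idOffice == "1145") %>%
#   pull(urlOfficeYear)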



# fpds_xml ----------------------------------------------------------------

.parse_fpds_xml <-
  function(doc) {
    contract_nodes <-
      doc %>% xml_children() %>% length()

    if (contract_nodes < 2) {
      return(tibble())
    }

    all_data <- 2:contract_nodes %>%
      map_dfr(function(x) {
        glue("Parsing {x-1} of {contract_nodes-1} expenditures") %>% message()
        xml_contract <-
          doc %>% xml_child(x) %>% xml_contents()
        contract_node_names <-
          xml_name(xml_contract)
        data <-
          1:length(contract_node_names) %>%
          map_dfr(function(table_no) {
            node_contents <-
              xml_contract[[table_no]] %>% as_list() %>% unlist()

            if (length(node_contents) == 0) {
              return(invisible())
            }
            tibble(item = names(node_contents),
                   value = as.character(node_contents)) %>%
              mutate(nameNode = contract_node_names[[table_no]]) %>%
              select(nameNode, everything())
          })

        data %>%
          mutate(idTransactionOfficeYear = x - 1) %>%
          select(idTransactionOfficeYear, everything())
      })

    df_items <-
      all_data %>%
      distinct(item)

    dict_items <-
      df_items$item %>%
      map_dfr(function(item) {
        tibble(
          item = item,
          itemActual =
            item %>% str_remove_all(
              "\\^placeOfPerformance.|vendorHeader.|vendorBusinessTypes.|federalGovernment.|vendorSiteDetails.|vendorSiteDetails.vendorLineOfBusiness.|vendorSocioEconomicIndicators.|minorityOwned.|vendorHeader.|vendorBusinessTypes.|treasuryAccount.treasuryAccountSymbol.|vendorSiteDetails.|awardContractID.|listOfTreasuryAccounts.|listOfTreasuryAccounts.treasuryAccount.treasuryAccountSymbol.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.vendorCertifications.|vendorSiteDetails.typeOfEducationalEntity.|vendorSiteDetails.vendorOrganizationFactors.|vendorSiteDetails.typeOfGovernmentEntity.|vendorSiteDetails.vendorRelationshipWithFederalGovernment.|vendorSiteDetails.vendorBusinessTypes.businessOrOrganizationType.|vendorSiteDetails.vendorLineOfBusiness.|vendorSiteDetails.vendorBusinessTypes.localGovernment.|vendorSiteDetails.vendorBusinessTypes.federalGovernment.|vendorSiteDetails.vendorSocioEconomicIndicators."
            ) %>% str_remove_all(
              "listOfTreasuryAccounts.|vendorOrganizationFactors.|typeOfEducationalEntity.|localGovernment.|businessOrOrganizationType.|vendorLineOfBusiness.|vendorRelationshipWithFederalGovernment.|typeOfGovernmentEntity.|profitStructure.|vendorCertifications.|treasuryAccountSymbol.|relevantContractDates.|OtherTransactionAwardContractID.|purchaserInformation."
            ) %>% str_remove_all(
              "\\^vendor.|^dollarValues.|^contractData.|^competition.|^transactionInformation."
            )
        )

      })

    dict_names <- dictionary_fpds_names()

    fpds_names <-
      dict_items$itemActual

    actual_names <-
      fpds_names %>%
      map_chr(function(name) {
        df_row <-
          dict_names %>% filter(nameFPDS == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$nameActual
      })

    dict_items <-
      dict_items %>%
      mutate(nameActual = actual_names)

    all_data <-
      all_data %>%
      left_join(dict_items, by = "item")

    all_data <- all_data %>%
      select(-one_of(c("item", "itemActual", "nameNode")))

    col_order <-
      c("idTransactionOfficeYear", unique(all_data$nameActual))

    all_data <-
      all_data %>%
      group_by(idTransactionOfficeYear, nameActual) %>%
      mutate(countItem = 1:n() - 1) %>%
      ungroup() %>%
      mutate(nameActual = case_when(countItem == 0 ~ nameActual,
                                    TRUE ~ str_c(nameActual, countItem))) %>%
      select(-countItem) %>%
      spread(nameActual, value) %>%
      select(one_of(col_order), everything())

    all_data <-
      all_data %>%
      .munge_data(clean_address = F)
    all_data
  }
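
# Usage sketch (untested): parse a single extracted archive file into a wide
# tibble of transactions. The path is hypothetical; real file names come from
# unzipping an FPDS archive zip, as .dl_fpds_xml() below does.
# doc <- read_xml("xml/some_fpds_archive_file.xml")
# df_contracts <- .parse_fpds_xml(doc)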


.dl_fpds_xml <-
  function(url = "https://www.fpds.gov/ddps/directory_browser/index.php?somepath=..%2FFY19-V1.5%2F1145-PEACECORPS&n=4", return_message = T) {
    page <- url %>% read_html()
    url_file <-
      page %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      keep(function(x) {x %>% str_detect("zip")}) %>%
      substr(4, nchar(.)) %>%
      str_c("https://www.fpds.gov/ddps/", .)

    outfile <- tempfile("download", fileext = ".zip")

    file <- curl::curl_download(url_file, outfile)
    unz_files <- unzip(file, exdir = "xml")

    .parse_fpds_xml_safe <- possibly(.parse_fpds_xml, tibble())

    all_data <-
      unz_files %>%
      map_dfr(function(file) {
        if (return_message) {
          glue("Parsing {file}") %>% message()
        }
        doc <- read_xml(file)

        .parse_fpds_xml_safe(doc = doc)
      })

    unz_files %>% unlink()
    file %>% unlink()
    unlink("xml", recursive = T)
    gc()
    all_data
  }
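
# Usage sketch (untested, network-heavy): the default URL downloads and parses
# the FY2019 Peace Corps archive, so this should work as-is provided the FPDS
# directory layout has not changed.
# df_peace_corps <- .dl_fpds_xml()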

.dl_fpds_xml_urls <-
  function(urls = c(
    "https://www.fpds.gov/ddps/directory_browser/index.php?somepath=..%2FFY19-V1.5%2F1145-PEACECORPS&n=4"
  ),
  return_message = T) {
    df <- tibble()

    success <- function(res) {
      url <- res$url

      if (return_message) {
        glue::glue("Parsing {url}") %>%
          message()
      }
      .dl_fpds_xml_safe <-
        purrr::possibly(.dl_fpds_xml, tibble())

      all_data <-
        .dl_fpds_xml_safe(url = url, return_message = return_message)



      df <<-
        df %>%
        bind_rows(all_data)
    }
    failure <- function(msg) {
      tibble()
    }

    urls %>%
      map(function(x) {
        curl_fetch_multi(url = x, success, failure)
      })
    multi_run()
    df
  }

#' Download FPDS Agency URLs
#'
#' Downloads and parses the FPDS XML archives for the specified archive page URLs
#'
#' @param urls vector of FPDS XML archive page URLs, as returned by \code{fpds_xml_urls()}
#' @param include_psc if \code{TRUE} joins Product Service Code (PSC) features
#' @param include_naics if \code{TRUE} joins NAICS code features
#' @param return_message if \code{TRUE} returns messages
#' @param ... extra parameters
#'
#' @return \code{tibble}
#' @export
#'
#' @examples
download_fpds_xml_urls <-
  function(urls = NULL,
           include_psc = T,
           include_naics = T,
           return_message = T,
           ...) {

    if (length(urls) == 0) {
      stop("Please enter vector of FPDS xml URLS")
    }

    all_data <-
      .dl_fpds_xml_urls(urls = urls, return_message = return_message)


    all_data <- all_data %>% .munge_data(clean_address = F)

    dict_agencies <- dictionary_government_agencies()

    if (all_data %>% hasName("idOfficeAward")) {
      all_data <-
        all_data %>%
        left_join(
          dict_agencies %>% select(
            idOfficeAward = idOffice,
            nameAgencyAward = nameAgency,
            nameOfficeAward = nameOffice
          ),
          by = "idOfficeAward"
        )
    }

    if (all_data %>% hasName("idOfficeContracting")) {
      all_data <-
        all_data %>%
        left_join(
          dict_agencies %>% select(
            idOfficeContracting = idOffice,
            nameOfficeContracting = nameOffice
          ),
          by = "idOfficeContracting"
        )
    }


    if (all_data %>% hasName("idOfficeIDV")) {
      all_data <-
        all_data %>%
        left_join(dict_agencies %>% select(idOfficeIDV = idOffice, nameOfficeIDV = nameOffice),
                  by = "idOfficeIDV")
    }

    if (include_naics) {
      all_data <-
        all_data %>%
        left_join(dictionary_naics_codes(), by = "idNAICS")
    }

    if (include_psc) {
      all_data <-
        all_data %>%
        left_join(dictionary_psc_active() %>% select(codeProductService, nameFullPSC),
                  by = "codeProductService")
    }

    all_data
  }
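
# End-to-end sketch (untested, network-heavy): build the URL table, pick one
# office-year link, and download it. Column names follow fpds_xml_urls() above.
# df_urls <- fpds_xml_urls()
# df_awards <- download_fpds_xml_urls(urls = df_urls$urlOfficeYear[1])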

# agency ------------------------------------------------------------------