R/new_fpds_xml.R

Defines functions .parse_xml_entry_new .dict_xml_parent_items

.dict_xml_parent_items <-
  function() {
    parent_items <-
      c(
        "OtherTransactionAward",
        "OtherTransactionIDV",
        "award",
        "awardID",
        "IDV",
        "referencedIDVID",
        "awardContractID",
        "contractMarketingData",
        "treasuryAccount",
        "relevantContractDates",
        "dollarValues",
        "totalDollarValues",
        "purchaserInformation",
        "contractData",
        "listOfTreasuryAccounts",
        "legislativeMandates",
        "productOrServiceInformation",
        "vendor",
        "vendorSocioEconomicIndicators",
        "vendorSiteDetails",
        "businessOrOrganizationType",
        "minorityOwned",
        "vendorBusinessTypes",
        "federalGovernment",
        "localGovernment",
        "vendorHeader",
        "businessOrOrganizationType",
        "vendorLineOfBusiness",
        "vendorRelationshipWithFederalGovernment",
        "typeOfGovernmentEntity",
        "vendorOrganizationFactors",
        "listOfContractAdministrationsDelegated",
        "profitStructure",
        "typeOfEducationalEntity",
        "contractDetail",
        "preferencePrograms",
        "vendorCertifications",
        "vendorLocation",
        "vendorDUNSInformation",
        "ccrRegistrationDetails",
        "placeOfPerformance",
        "principalPlaceOfPerformance",
        "competition",
        "transactionInformation",
        "preferenceProgramssubcontractPlan",
        "NASASpecificAwardElements",
        "agencySpecificAwardElements",
        "agencySpecificIDVElements",
        "NASASpecificIDVElements",
        "principalInvestigator",
        "alternatePrincipalInvestigator"
      )
    tibble(parent_item = parent_items)
  }



.parse_xml_entry_new <- function(xml_entry) {
  xml_entry_nodes <- xml_contents(xml_entry)
  xml_entry_names <- xml_name(xml_entry_nodes)
  parent_items <-
    pull(.dict_xml_parent_items())
  dict_names <-
      dictionary_fpds_lower_names
  seq_along(xml_entry_names) %>%
    map_dfc(function(x) {
      table <-
        xml_entry_names[[x]]

      if (table == "title") {

        df <- tibble(nameContract = xml_text(xml_entry_nodes[1]) %>% str_to_upper() %>% str_squish())
        return(df)
      }

      if (table == "link") {
        df <- tibble(urlFPDSContractAtom = xml_entry_nodes[[x]] %>% html_attr("href"))
        return(df)
      }

      if (table == "modified") {
        value <-
          xml_text(xml_entry_nodes[x])
        if (value == "") {
          return(invisible())
        }
        df <- tibble(datetimeContractModified = lubridate::ymd_hms(value))
        return(df)
      }
      xml_part <- xml_entry_nodes[x]

      xml_nodes <-
        xml_find_all(xml_part, ".//*")

      items <-
        as.character(xml_name(xml_nodes))

      text <-
        xml_text(xml_nodes)

      node_attrs <-
        xml_attrs(xml_nodes)

      df_attrs <-
        seq_along(node_attrs) %>%
        map_dfr(function(x) {
          if (x == 1) {
            return(invisible())
          }


          value <- as.character(node_attrs[[x]])
          if (length(value) == 0) {
            return(invisible())
          }
          item <- names(node_attrs[[x]])
          tibble(item, value) %>%
            mutate(idNode = x) %>%
            select(idNode, everything())
        })

      df_new <-
        tibble(nameFPDS = items, text) %>%
        mutate(idNode = 1:n()) %>%
        filter(text != "")

      df_new <-
        df_new %>%
        filter(!nameFPDS %>% str_detect("genericTags|genericStrings|genericString01")) %>%
        filter(
          !nameFPDS %in% c(
            "award",
            "IDVID",
            "IDV",
            "contractID",
            "OtherTransactionAwardID",
            "OtherTransactionAward",
            "OtherTransactionIDV",
            "OtherTransactionIDVID",
            "OtherTransactionAwardContractID",
            "OtherTransactionIDVContractID"
          )
        ) %>%
        select(idNode, everything()) %>%
        mutate_if(is.character, str_squish)


      df_attrs <- df_attrs %>%
        filter(
          !item %in% c(
            "city",
            "country",
            "county",
            "departmentID",
            "departmentName",
            "productOrServiceType"
          )
        ) %>%
        arrange(idNode, item)

      df_descriptions <-
        df_attrs %>%
        group_by(idNode) %>%
        summarise(description = value %>% str_c(collapse = " | ")) %>%
        mutate(description = str_squish(description))

      df_new <-
        df_new %>%
        left_join(df_descriptions, by = "idNode")



      df_parents <-
        df_new %>%
        filter(nameFPDS %in% parent_items) %>%
        select(idNode, parent = nameFPDS) %>%
        mutate(idNode = idNode + 1)

      df_parents <-
        df_parents %>%
        mutate(
          slug = case_when(
            parent == "principalPlaceOfPerformance" ~ "Performance",
            parent == "vendorLocation" ~ "Vendor",
            parent %in% c("referencedIDVID", "IDV") ~ "IDV",
            parent %in% c("OtherTransactionAward") ~ "OTA",
            parent %in% c("agencySpecificAwardElements") ~ "AgencySpecificAward",
            parent %in% c("principalInvestigator") ~ "Investigator",
            parent %in% c("alternatePrincipalInvestigator") ~ "InvestigatorAlternate",
            TRUE ~ NA_character_
          )
        )

      df_new <-
        df_new %>%
        filter(!nameFPDS %in% c(parent_items)) %>%
        left_join(df_parents, by = "idNode") %>%
        select(parent, everything()) %>%
        select(-slug) %>%
        fill(parent) %>%
        left_join(df_parents %>% select(-idNode), by = "parent")

      tbl_off_id <- df_new %>% filter(nameFPDS == "contractingOfficeID")
      if (nrow(tbl_off_id) > 0) {
        office_id <-
          tbl_off_id %>% pull(text)
        df_new <- df_new %>%
          mutate(description = description %>% str_remove_all(office_id) %>% str_squish())
      }
      df_names <-
        df_new %>%
        distinct(nameFPDS, .keep_all = F) %>%
        mutate(nameFPDSLower = str_to_lower(nameFPDS)) %>%
        left_join(dict_names %>% select(nameFPDSLower, nameActual), by = "nameFPDSLower")

      missing_symbol <-
        df_names %>%
        filter(is.na(nameActual)) %>%
        nrow() > 0

      if (missing_symbol) {
        missing_items <-
          df_names %>%
          filter(is.na(nameActual)) %>%
          pull(nameFPDS)

        missing_items %>%
          walk(function(x) {
            glue("Missing: {x}") %>% message()
          })

        df_names <-
          df_names %>%
          mutate(nameActual = case_when(is.na(nameActual) ~ nameFPDS,
                                        TRUE ~ nameActual))
      }

      df_new <-
        df_new %>%
        left_join(df_names, by = "nameFPDS") %>%
        distinct() %>%
        mutate(nameActual = case_when(is.na(slug) ~ nameActual,
                                      TRUE ~ str_c(nameActual, slug, sep = "")))

      filter_agency <-
        df_new %>% filter(idNode == 4) %>% pull("nameFPDS") == "agencyID"

      if (filter_agency) {
        df_new <- df_new %>%
          filter(idNode != 4)
      }

      df_new  <-
        df_new %>%
        unite(n, parent, nameFPDS, sep = "_", remove = F) %>%
        filter(n != "awardContractID_agencyID") %>%
        select(-n)

      df_new <-
        df_new %>%
        select(idNode, nameFPDS, nameActual,  text, description) %>%
        group_by(nameActual) %>%
        filter(idNode == min(idNode)) %>%
        ungroup()

      df_base <-
        df_new %>%
        select(nameActual, text) %>%
        mutate(
          text = case_when(
            nameActual %>% str_detect('zipcode') ~  text %>% substr(1, 5),
            nameActual %>% str_detect("nameFirst") ~ text %>% str_remove_all("\\,"),
            nameActual %>% str_detect("nameLast") ~ text %>% str_remove_all("\\,"),
            TRUE ~ text
          )
        )

      df_base <-
        spread(df_base, nameActual, text)

      has_desc <-
        filter(df_new, !is.na(description)) %>%  nrow() > 0

      if (has_desc) {
        df_desc <- df_new %>%
          filter(!is.na(description)) %>%
          filter(nameActual != "descriptionObligation") %>%
          select(nameActual, description) %>%
          mutate(
            nameActual = nameActual %>% str_replace_all("^code|^is|^has", "type") %>% str_replace_all("^id", "name") %>%
              str_replace_all("^count", "description") %>%
              str_replace_all("typeCountry", "nameCountry") %>%
              str_replace_all("typeState", "nameState") %>%
              str_replace_all("typeProductService", "nameProductService")
          ) %>%
          mutate(description = str_to_upper(description)) %>%
          filter(
            !nameActual %>% str_detect(
              "nameContract|^transaction|nameCountryVendor|faxVendor|zipcode"
            )
          ) %>%
          mutate(nameActual = case_when(
            nameActual == "typeOrganization" ~ "stateOrganized",
            TRUE ~ nameActual
          )) %>%
          spread(nameActual, description)

        df_new <-
          df_desc %>%
          bind_cols(df_base) %>%
          select(one_of(names(df_base)), everything())
      }

      df_new <-
        df_new %>%
        select(matches("idContract|idAgency|nameAgency|idOffice|nameOffice"),
               everything())

      if (df_new %>% hasName("slugDUNS")) {
        df_new <- df_new %>%
          mutate(idDUNS = as.numeric(slugDUNS))
      }


      if (df_new %>% hasName("slugDUNSParent")) {
        df_new <- df_new %>%
          mutate(idDUNSParent = as.numeric(slugDUNSParent))
      }


      df_new

    })
}
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.