R/bizopps.R

Defines functions fbo_opportunities fbo_ftp_urls fbo_ftp_data parse_fbo_ftp_xmls parse_fbo_ftp_csvs .parse_all_xml .parse_ftp_base .munge_biz_opps_names dictionary_federal_supply_codes dictionary_fbo_solicitation_codes

Documented in dictionary_federal_supply_codes fbo_ftp_data fbo_ftp_urls fbo_opportunities parse_fbo_ftp_csvs parse_fbo_ftp_xmls

# template guide  -------------------------------------------------------------------
#
# https://www.fbo.gov/?static=interface&s=generalinfo


# dict --------------------------------------------------------------------

dictionary_fbo_solicitation_codes <-  function() {
  tibble(
    codeSolicitation = c(
      "PRESOL",
      "SRCSGT",
      "SNOTE",
      "SSALE",
      "COMBINE",
      "AMDCSS",
      "MOD",
      "AWARD",
      "JA",
      "FAIROPP",
      "ARCHIVE",
      "UNARCHIVE",
      "ITB",
      "FSTD",
      "EPSUPLOAD",
      "DELETE"
    ),
    typeSolicitation = c(
      "PRESOLICITATION",
      "SOURCES SOUGHT",
      "SPECIAL NOTICE",
      "SALE OF SURPLUS PROPERTY",
      "COMBINED SYNOPSIS - SOLICITATION",
      "AMENDMENT TO A PREVIOUS COMBINED SOLICITATION",
      "MODIFICATION/AMENDMENT/CANCEL",
      "AWARD",
      "JUSTIFICATION AND APPROVAL",
      "FAIR OPPORTUNITY / LIMITED SOURCES JUSTIFICATION",
      "ARCHIVE NOTICE",
      "UNARCHIVE NOTICE",
      "INTENT TO BUNDLE REQUIREMENTS DOD-FUNDED",
      "FOREIGN GOVERNMENT STANDARD",
      "DOCUMENT UPLOAD",
      "DOCUMENT DELETE"
    )

  )
}

#' Federal Supply Code Dictionary
#'
#' Dictionary of parent solicitation groups
#' used by FBO.gov
#'
#' @return
#' @export
#'
#' @examples
#' dictionary_federal_supply_codes()
dictionary_federal_supply_codes <-
  function() {
    text_product <- "10 -- Weapons
11 -- Nuclear ordnance
12 -- Fire control equipment
13 -- Ammunition & explosives
14 -- Guided missiles
15 -- Aircraft & airframe structural components
16 -- Aircraft components & accessories
17 -- Aircraft launching, landing & ground handling equipment
18 -- Space vehicles
19 -- Ships, small craft, pontoons & floating docks
20 -- Ship and marine equipment
22 -- Railway equipment
23 -- Ground effects vehicles, motor vehicles, trailers & cycles
24 -- Tractors
25 -- Vehicular equipment components
26 -- Tires and tubes
28 -- Engines, turbines & components
29 -- Engine accessories
30 -- Mechanical power transmission equipment
31 -- Bearings
32 -- Woodworking machinery and equipment
34 -- Metalworking machinery
35 -- Service and trade equipment
36 -- Special industry machinery
37 -- Agricultural machinery & equipment
38 -- Construction, mining, excavating & highway maintenance equipment
39 -- Materials handling equipment
40 -- Rope, cable, chain & fittings
41 -- Refrigeration, air-conditioning & air circulating equipment
42 -- Fire fighting, rescue & safety equipment
43 -- Pumps & compressors
44 -- Furnace, steam plant & drying equipment; & nuclear reactors
45 -- Plumbing, heating, & sanitation equipment
46 -- Water purification & sewage treatment equipment
47 -- Pipe, tubing, hose & fittings
48 -- Valves
49 -- Maintenance & repair shop equipment
51 -- Hand tools
52 -- Measuring tools
53 -- Hardware & abrasives
54 -- Prefabricated structures and scaffolding
55 -- Lumber, millwork, plywood & veneer
56 -- Construction & building materials
58 -- Communication, detection, & coherent radiation equipment
59 -- Electrical and electronic equipment components
60 -- Fiber optics materials, components, assemblies & accessories
61 -- Electric wire & power & distribution equipment
62 -- Lighting fixtures & lamps
63 -- Alarm, signal & security detection equipment
65 -- Medical, dental & veterinary equipment & supplies
66 -- Instruments & laboratory equipment
67 -- Photographic equipment
68 -- Chemicals & chemical products
69 -- Training aids & devices
70 -- General purpose information technology equipment
71 -- Furniture
72 -- Household & commercial furnishings & appliances
73 -- Food preparation and serving equipment
74 -- Office machines, text processing systems & visible record equipment
75 -- Office supplies and devices
76 -- Books, maps & other publications
77 -- Musical instruments, phonographs & home-type radios
78 -- Recreational & athletic equipment
79 -- Cleaning equipment and supplies
80 -- Brushes, paints, sealers & adhesives
81 -- Containers, packaging, & packing supplies
83 -- Textiles, leather, furs, apparel & shoe findings, tents & flags
84 -- Clothing, individual equipment & insignia
85 -- Toiletries
87 -- Agricultural supplies
88 -- Live animals
89 -- Subsistence
91 -- Fuels, lubricants, oils & waxes
93 -- Nonmetallic fabricated materials
94 -- Nonmetallic crude materials
95 -- Metal bars, sheets & shapes
96 -- Ores, minerals & their primary products
99 -- Miscellaneous" %>%
      read_lines()

text_service <- "A -- Research & Development
B -- Special studies and analysis - not R&D
C -- Architect and engineering services
D -- Information technology services, including telecommunications services
E -- Purchase of structures & facilities
F -- Natural resources & conservation services
G -- Social services
H -- Quality control, testing & inspection services
J -- Maintenance, repair & rebuilding of equipment
K -- Modification of equipment
L -- Technical representative services
M -- Operation of Government-owned facilities
N -- Installation of equipment
P -- Salvage services
Q -- Medical services
R -- Professional, administrative, and management support services
S -- Utilities and housekeeping services
T -- Photographic, mapping, printing, & publication services
U -- Education & training services
V -- Transportation, travel, & relocation services
W -- Lease or Rental of equipment
X -- Lease or rental of facilities
Y -- Construction of structures and facilities
Z -- Maintenance, repair, and alteration of real property" %>%
  read_lines()

  tibble(text_product) %>%
  separate(text_product, into = c("idSolicitationGroup", "nameSolicitationGroup"), extra = "merge", sep = "--") %>%
      mutate_all(list(function(x){
        x %>% str_squish() %>% str_to_upper()
      })) %>%
      mutate(typeSolicitationGroup = "PRODUCT") %>%
      select(typeSolicitationGroup, everything()) %>%
  bind_rows(
    tibble(text_service) %>%
      separate(text_service, into = c("idSolicitationGroup", "nameSolicitationGroup"), extra = "merge",sep = "--") %>%
      mutate_all(list(function(x){
        x %>% str_squish() %>% str_to_upper()
      })) %>%
      mutate(typeSolicitationGroup = "SERVICE") %>%
      select(typeSolicitationGroup, everything())
  )
    }

.munge_biz_opps_names <-
  function(data) {
    names_dict <- names(data)

    dict <- dictionary_bizopps_names()
    actual_names <-
      names_dict %>%
      map_chr(function(name) {
        df_row <-
          dict %>% filter(nameGovernment == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }

# ftp ---------------------------------------------------------------------

### https://www.fbo.gov/downloads/fbo_web_services_technical_documentation.pdf
.parse_ftp_base <- function(url = "ftp://ftp.fbo.gov/") {
  data <-
    url %>% read_table(col_names = F) %>%
    select(5:9) %>%
    setNames(c("sizeFileKB", "month", "day", "year", "nameFile")) %>%
    suppressMessages()

  data <-
    data %>%
    mutate(
      nameFile = nameFile %>% str_remove_all("\\~"),
      typeFile = nameFile %>% str_remove_all("[0-9]|\\.csv"),
      dateFile = nameFile %>% str_remove_all("FBOFeed|DSCC|FBORecovery|FBORecoveryAwards|ftpVAN") %>% substr(1, 8) %>% ymd(),
      urlFBOData = glue("ftp://ftp.fbo.gov/{nameFile}") %>% as.character(),
      urlFBOData = case_when(
        typeFile == "FBOBuyAmericanAct" ~ "ftp://ftp.fbo.gov/FBOBuyAmericanAct/Buy_American_Act.xls",
        typeFile == "datagov" ~ "ftp://ftp.fbo.gov/datagov/FBOFullXML.xml",
        typeFile == "FBORecoveryAwards" ~ glue("ftp://ftp.fbo.gov/FBORecoveryAwards/{nameFile}") %>% as.character(),
        typeFile == "FBORecovery" ~ glue("ftp://ftp.fbo.gov/FBORecovery/{nameFile}") %>% as.character(),
        TRUE ~ urlFBOData
      )
    ) %>%
    select(-one_of(c("month", "day", "year"))) %>%
    select(typeFile, dateFile, nameFile, everything())

  data

  data

}

.parse_ftp_xml <-
  memoise::memoise(function(url = "ftp://ftp.fbo.gov/FBOFeed20191018",
                            exclude_tags = NULL,
                            return_message = T) {
    parts <-
      url %>% str_split("/") %>% flatten_chr() %>% str_remove_all(".csv")

    dateData <-
      parts[length(parts)] %>% str_remove_all("\\FBORecovery|FBORecoveryAwards|Awards|FBOFeed|ftpVAN") %>% ymd()

    typeFile <-
      parts[length(parts)] %>% str_remove_all("[0-9]")

    fake_xml <- read_lines(url) %>% str_c(collapse = "")

    tags <- c(
      'PRESOL',
      'SRCSGT',
      'SNOTE',
      'SSALE',
      'COMBINE',
      'AMDCSS',
      'MOD',
      'AWARD',
      'JA',
      'FAIROPP',
      'ARCHIVE',
      'UNARCHIVE',
      'ITB',
      'FSTD',
      'EPSUPLOAD',
      'DELETE'
    )

    split_tags <-
      glue("</{tags}>") %>% str_c(collapse = "|")

    remove_tags <-
      glue("<{tags}>") %>% str_c(collapse = "|")

    parts <-
      fake_xml %>%
      str_split(split_tags) %>% flatten_chr() %>%
      str_remove_all(remove_tags)

    html_parts <-
      c(
        'a',
        'abbr',
        'acronym',
        'address',
        'applet',
        'area',
        'article',
        'aside',
        'audio',
        'b',
        'base',
        'basefont',
        'bdi',
        'bdo',
        'bgsound',
        'big',
        'blink',
        'blockquote',
        'body',
        'br',
        'button',
        'canvas',
        'caption',
        'center',
        'cite',
        'code',
        'col',
        'colgroup',
        'command',
        'content',
        'data',
        'datalist',
        'dd',
        'del',
        'details',
        'dfn',
        'dialog',
        'dir',
        'div',
        'dl',
        'dt',
        'element',
        'em',
        'embed',
        'fieldset',
        'figcaption',
        'figure',
        'font',
        'footer',
        'form',
        'frame',
        'frameset',
        'h1',
        'h2',
        'h3',
        'h4',
        'h5',
        'h6',
        'head',
        'header',
        'hgroup',
        'hr',
        'html',
        'i',
        'iframe',
        'image',
        'img',
        'input',
        'ins',
        'isindex',
        'kbd',
        'keygen',
        'label',
        'legend',
        'li',
        'link',
        'listing',
        'main',
        'map',
        'mark',
        'marquee',
        'math',
        'menu',
        'menuitem',
        'meta',
        'meter',
        'multicol',
        'nav',
        'nextid',
        'nobr',
        'noembed',
        'noframes',
        'noscript',
        'object',
        'ol',
        'optgroup',
        'option',
        'output',
        'p',
        'param',
        'picture',
        'plaintext',
        'pre',
        'progress',
        'q',
        'rb',
        'rbc',
        'rp',
        'rt',
        'rtc',
        'ruby',
        's',
        'samp',
        'script',
        'section',
        'select',
        'shadow',
        'slot',
        'small',
        'source',
        'spacer',
        'span',
        'strike',
        'strong',
        'style',
        'sub',
        'summary',
        'sup',
        'svg',
        'table',
        'tbody',
        'td',
        'template',
        'textarea',
        'tfoot',
        'th',
        'thead',
        'time',
        'title',
        'tr',
        'track',
        'tt',
        'u',
        'ul',
        'var',
        'video',
        'wbr',
        'xmp',
        "	r /"
      ) %>% str_c(collapse = "|")

    if (length(exclude_tags) > 0) {
      exclude_tags <- str_to_upper(exclude_tags)
      tags <- tags[!tags %in% exclude_tags]
    }

    all_data <-
      tags %>%
      map_dfr(function(tag) {
        tag %>% message()
        part <- parts[parts %>% str_detect(tag)]
        if (length(parts) == 0) {
          return(invisible())
        }
        all_parts <- seq_along(part)
        df_all <-
          all_parts %>%
          map_dfr(function(x) {
            glue("{x} of {max(all_parts)}") %>% message()
            df <-
              tibble(part = part[[x]] %>% str_split("<") %>% flatten_chr()) %>%
              separate(part, into = c("item", "value"), extra = "merge",sep = ">") %>%
              filter(!is.na(value)) %>%
              mutate_all(function(x) {
                x %>% str_replace_all("&nbsp;|N/A", NA_character_) %>% str_squish()
              })

            df_base <-
              df %>%
              filter(item %>% str_detect("^[A-Z]")) %>%
              filter(!value == "") %>%
              filter(!value %>% is.na())


            col_order <- df_base$item %>% unique()

            df_base <-
              df_base %>%
              group_by(item) %>%
              dplyr::slice(1) %>%
              ungroup() %>%
              spread(item, value) %>%
              mutate(numberItem = x) %>%
              select(numberItem, one_of(col_order), everything())

            has_descriptions <-
              df %>% filter(!item %>% str_detect("[A-Z]")) %>% filter(value != "") %>% nrow() > 0

            if (has_descriptions) {
              description <-
                df %>% filter(!item %>% str_detect("[A-Z]")) %>% filter(value != "") %>%
                pull(value) %>% str_c(collapse = " ") %>%
                str_squish()

              df_base <-
                df_base %>%
                mutate(detailsSolicitation = description)
            }
            df_base
          })

        df_all <-
          df_all %>%
          mutate(tag) %>%
          select(tag, everything())

        if (df_all %>% hasName("DATE")) {
          df_all <-
            df_all %>%
            unite(DATE, DATE, YEAR, sep = "")
        }

        df_all <-
          df_all %>%
          select(-matches("Upon|Enter|Alternate"))

        df_all <-
          df_all %>% .munge_fpds_names()

        df_all <-
          df_all %>%
          mutate_at(df_all %>% select(matches("date")) %>% names(),
                    mdy) %>%
          .munge_data(clean_address = F)

        df_all %>% nest(-tag)

      }) %>%
      rename(codeSolicitation = tag)

    all_data <-
      all_data %>%
      left_join(
        dictionary_fbo_solicitation_codes(),
        by = "codeSolicitation"
      ) %>%
      select(typeSolicitation, everything())

    all_data <- all_data %>%
      unnest() %>%
      select(-numberItem)

    if (all_data %>% hasName("zipcodeAgency")) {
      all_data <- all_data %>%
        rename(zipcodeOffice = zipcodeAgency)
    }


    if (all_data %>% hasName("nameAwardee")) {
      all_data <- all_data %>%
        clean_entity_data(entity_column = "nameAwardee")
    }

    if (return_message) {
      contracts <- nrow(all_data) %>% comma(digits = 0)
      glue("\n\nAcquired {contracts} for {dateData} from {typeFile}\n\n") %>% message()
    }

    all_data <-
      all_data %>%
      mutate(typeAward = ifelse(is.na(typeAward), typeSolicitation, typeAward))

    all_data
  })

.parse_all_xml <-
  function(url = "ftp://ftp.fbo.gov/datagov/FBOFullXML.xml") {
    data <- read_xml(url)

    nodes <-
      xml_contents(data)

    all_nodes <- seq_along(nodes)
    max_nodes <- max(all_nodes)
    all_data <-
      all_nodes %>%
      map_dfr(function(x) {
        glue("{x} of {max_nodes}") %>% message()
        node <- nodes[[x]] %>% xml_children()
        values <- node  %>% xml_text()
        items <- node %>% xml_name()
        tibble(item = items, value = values) %>%
          spread(item, value) %>%
          select(one_of(items))
      })
    rm(data)
    rm(nodes)
    gc()
    all_data <-
      all_data %>%
      .munge_fpds_names()

    all_data <-
      all_data %>%
      mutate_if(is.character,
                function(x) {
                  x %>% str_to_upper() %>% str_replace_all("N/A", NA_character_) %>% str_trim()
                })

    all_data <-
      all_data %>%
      .munge_data(clean_address = F) %>%
      mutate(isAward = !is.na(idAward))

    all_data

  }


.parse_ftp_csv <-
  memoise::memoise(function(url = "ftp://ftp.fbo.gov/FBORecovery/FBORecovery20090926.csv",
                            return_message = T) {
    parts <-
      url %>% str_split("/") %>% flatten_chr() %>% str_remove_all(".csv")
    dateData <-
      parts[length(parts)] %>% str_remove_all("\\FBORecovery|FBORecoveryAwards|Awards") %>% ymd()

    typeFile <- parts[length(parts)] %>% str_remove_all("[0-9]")

    data <-
      url %>%
      read_csv(col_names = T) %>%
      suppressMessages()

    data <-
      data %>%
      .munge_fpds_names() %>%
      .munge_data(clean_address = F) %>%
      mutate(dateData,
             typeFile,
             urlFBOData = url) %>%
      select(dateData, typeFile, everything()) %>%
      suppressWarnings()

    if (return_message) {
      contracts <- nrow(data) %>% formattable::comma(digits = 0)
      glue(("Acquired {contracts} from {typeFile} for {dateData}")) %>% message()
    }


    data



  })

#' Read FBO ftp CSV urls
#'
#' Acquire fbo csv data from FBO ftp
#'
#' @param urls vector of csv urls
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
#' fbo_ftp_csvs(urls = "ftp://ftp.fbo.gov/FBORecoveryAwards/FBORecoveryAwards20190406.csv")
parse_fbo_ftp_csvs <-
  function(urls = NULL,
           return_message = T) {
    if (length(urls) == 0) {
      stop("Please enter FBO ")
    }
    .parse_ftp_csv_safe <-
      possibly(.parse_ftp_csv, tibble())

    urls %>%
      map_dfr(function(url) {
        .parse_ftp_csv_safe(url = url, return_message)
      }) %>%
      suppressWarnings() %>%
      .format_data() %>%
      arrange(dateSolicitation)
  }

#' Parse FBO ftp XML files
#'
#' @param urls vector of urls
#' @param return_message if \code{TRUE} returns a message
#' @param exclude_tags if not \code{NULL} fbo solicitation types to exclude
#'
#' @return a \code{tibble()}
#' @export
#'
#' @examples
#' parse_fbo_ftp_xmls(urls = "ftp://ftp.fbo.gov/FBOFeed20151017")
parse_fbo_ftp_xmls <-
  function(urls, exclude_tags = NULL,
           return_message = T) {
    if (length(urls) == 0) {
      stop("Please enter FBO ")
    }
    .parse_ftp_xml_safe <-
      possibly(.parse_ftp_xml, tibble())

    urls %>%
      map_dfr(function(url) {
        .parse_ftp_xml_safe(url = url, return_message)
      }) %>%
      suppressWarnings() %>%
      .format_data() %>%
      arrange(dateSolicitation)
  }

#' Acquire FBO bulk FTP data
#'
#' @param type type of file \itemize{
#' \item feed - daily feed
#' \item recovery - recovery data
#' \item recovery awards - recovery awards
#' \item buy american
#' \item all - all historic daily feeds
#' }
#' @param start_date if not \code{NULL} start date
#' @param end_date if not \code{NULL} end date
#' @param return_message if \code{TRUE} returns message
#' @param distinct_solicitations if \code{TRUE} returns only the most recent distinct solicitations
#' @param exclude_tags if not \code{NULL} fbo solicitation types to exclude
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' fbo_ftp_data(type = "recovery")
#' }
fbo_ftp_data <-
  function(type = NULL,
           start_date = NULL,
           end_date = NULL,
           distinct_solicitations = TRUE,
           exclude_tags = NULL,
           return_message = T) {
    type_slug <- type %>% str_to_lower()

    if (type_slug == "buy american") {
      data <-
        fbo_buy_american_expenditures(return_message = return_message)
      return(data)
    }

    if (type_slug == "all") {
      data <- .parse_all_xml()
      return(data)
    }
    slugs <-
      case_when(
        type_slug == "feed" ~ c("FBOFeed" , "ftpVAN"),
        type_slug == "recovery" ~ "FBORecovery",
        type_slug ==  "recovery awards" ~ "FBORecoveryAwards"
      )

    df_urls <-
      fbo_ftp_urls()

    df_urls <-
      df_urls %>%
      filter(typeFile %in% slugs)

    if (length(start_date) > 0) {
      df_urls <- df_urls %>%
        filter(dateFile  >= start_date)
    }

    if (length(end_date) > 0) {
      df_urls <- df_urls %>%
        filter(dateFile  <= end_date)
    }

    if (type_slug == "feed") {
      urls <- df_urls$urlFBOData
      data <-
        parse_fbo_ftp_xmls(urls = urls, return_message = return_message, exclude_tags = exclude_tags)

      if (distinct_solicitations) {
        data <-
          data %>%
          group_by(idSolicitation, nameSolicitation, nameAgency, descriptionSolicitation, idAward, amountAward) %>%
          filter(dateSolicitation == max(dateSolicitation)) %>%
          ungroup()
      }
      return(data)
    }

    urls <- df_urls$urlFBOData

    all_data <-
      parse_fbo_ftp_csvs(urls = urls, return_message = return_message)

    all_data

  }

# fbo_functions -----------------------------------------------------------


#' FBO ftp file dictionary
#'
#' Acquires all active files on
#' fbo.gov's fbo file dictionary
#'
#' @return
#' @export
#'
#' @examples
#' fbo_ftp_urls()
fbo_ftp_urls <-
  function() {
    urls <-
      c(
        "ftp://ftp.fbo.gov/",
        "ftp://ftp.fbo.gov/FBORecoveryAwards/",
        "ftp://ftp.fbo.gov/FBORecovery/"
      )

    all_data <-
      urls %>%
      map_dfr(function(url) {
        .parse_ftp_base(url = url)
      })

    all_data <-
      all_data %>%
      filter(!nameFile %in% c("welcome.msg", "FBORecovery", "FBORecoveryAwards")) %>%
      arrange(desc(dateFile))

    all_data
  }


#' FBO Buy American Act data
#'
#' Bulk data from
#'
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
#'  \dontrun{
#'  fbo_buy_american_expenditures()
#'  }
fbo_buy_american_expenditures <-
  memoise::memoise(function(return_message = T) {
    url <- "ftp://ftp.fbo.gov/FBOBuyAmericanAct/Buy_American_Act.xls"

    outfile <- tempfile("buy_american", fileext = ".xls")

    file <- curl::curl_download(url, outfile)
    data <-
      readxl::read_excel(file)

    data <-
      data %>%
      .munge_fpds_names() %>%
      separate(
        "typeDescriptionManufacter",
        into = c("locationManufactor", "descriptionReasonManufacture"),
        sep = "\\ - ",
        extra = "merge"
      )

    if (return_message) {
      start <- data$dateContractSigned %>% min(na.rm = T)
      end <- data$dateContractSigned %>% max(na.rm = T)
      contracts <-
        data %>% distinct(idContract) %>% nrow() %>% formattable::comma(digits = 0)
      glue("Acquired {contracts} from {start} to {end} sourced through the Buy American Act") %>% message()

    }

    data
  })




# options -----------------------------------------------------------------

function() {
  page <-
    "https://www.fbo.gov/index?s=opportunity&tab=search&mode=list" %>%
    read_html()

  naics_codes <-
    page %>%
    html_nodes("#scrollable_checkbox_dnf_class_values_procurement_notice__naics_code___ div") %>%
    html_text()

  classifications <-
    page %>%
    html_nodes(
      "#scrollable_checkbox_dnf_class_values_procurement_notice__classification_code___"
    ) %>%
    html_text() %>%
    str_split("\n") %>%
    flatten_chr()

  classifications <- classifications[!classifications == ""]

}


# agencies ----------------------------------------------------------------


#' FBO Active Opportunities
#'
#' @param sleep_time if not \code{NULL} time in between scrapes
#' @param parse_contract_details if \code{TRUE} parses all detailed contracts
#' @param return_message if \code{TRUE} returns a message()
#'
#' @return
#' @export
#'
#' @examples
fbo_opportunities <-
  function(sleep_time = NULL,
           parse_contract_details = F,
           exclude_awards = F,
           return_message = T) {
    df_urls <-
      .active_urls()

    data <-
      df_urls$urlPage %>%
      .parse.opportunity.urls()

    char_cols <-
      data %>%
      select_if(is.character) %>%
      select(-matches("url")) %>% names()

    data <-
      data %>%
      mutate_at(char_cols,
                list(function(x) {
                  ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .) %>% str_to_upper()
                }))

    if (data %>% tibble::has_name("nameAgency")) {
      dict_logos <- dictionary_agency_logos()

      data <-
        data %>%
        select(-one_of("urlAgencyLogo")) %>%
        left_join(dict_logos, by = "nameAgency")

    }

    data <-
      data %>%
      mutate(statusBid = case_when(
        is.na(idSolicitation) ~ "AWARD",
        TRUE ~ statusBid
      )) %>%
      mutate(isAward = statusBid == "AWARD")

    data <-
      data %>%
      .munge_data(clean_address = F)

    if (exclude_awards) {
      data  <-
        data %>%
        filter(!isAward)
    }



    data
  }

# search ------------------------------------------------------------------

.fbo_cookies <-
  function() {
    structure(
      list(
        "balancer.172.16.121.3",
        "12c6b82f50a58f2490926325bf05cb7e",
        "1"
      ),
      .Names = c("BALANCEID", "PHPSESSID", "sympcsm_cookies_enabled")
    )

  }

.fbo_headers <-
  function() {
    data <-
      structure(
        list(
          "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
          "gzip, deflate, br",
          "en-US,en;q=0.9",
          "max-age=0",
          "close",
          "application/x-www-form-urlencoded",
          "1",
          "https://www.fbo.gov",
          "https://www.fbo.gov/",
          "1",
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.85 Safari/537.36"
        ),
        .Names = c(
          "Accept",
          "Accept-Encoding",
          "Accept-Language",
          "Cache-Control",
          "Connection",
          "Content-Type",
          "DNT",
          "Origin",
          "Referer",
          "Upgrade-Insecure-Requests",
          "User-Agent"
        )
      ) %>%
      flatten_df()
    df_ref <- .generate_url_reference()
    data$Referer <- df_ref$urlReferer
    data$`User-Agent` <- df_ref$userAgent
    as.list(data)
  }

.fbo_params <-
  function() {
    structure(list("list", "opportunity", "list"),
              .Names = c("mode",
                         "s", "tab"))

  }

.fbo_data <-
  function() {
    data <- structure(
      list(
        "",
        "1",
        "365",
        "",
        "procurement_notice",
        "",
        "a",
        "",
        "",
        "search",
        "0",
        "update",
        "",
        "vendor_procurement_notice_filter",
        "1",
        "dnf_"
      ),
      .Names = c(
        "autocomplete_input_dnf_class_values[procurement_notice][agency]",
        "clear_filters_from_home",
        "dnf_class_values[procurement_notice][_posted_date]",
        "dnf_class_values[procurement_notice][agency]",
        "dnf_class_values[procurement_notice][dnf_class_name]",
        "dnf_class_values[procurement_notice][keywords]",
        "dnf_class_values[procurement_notice][procurement_type][]",
        "dnf_class_values[procurement_notice][set_aside][]",
        "dnf_class_values[procurement_notice][zipstate]",
        "dnf_opt_action",
        "dnf_opt_finalize",
        "dnf_opt_mode",
        "dnf_opt_target",
        "dnf_opt_template",
        "dnf_opt_validate",
        "so_form_prefix"
      )
    )


    data
  }

.get_fbo_search_result <-
  function() {
    cookies <-
      .fbo_cookies()

    headers <-
      .fbo_headers()

    params <-
      .fbo_params()

    data <-
      .fbo_data()

    page <-
      Post(
        url = 'https://www.fbo.gov/index',
        headers = headers,
        data = data,
        cookies = cookies,
        params = params
      )

  }

.parse.result.page <-
  function(page) {
    name_solicitation <-
      page %>% .parse_for_text(css = ".solt") %>% str_to_upper()

    href_sol <-
      page %>% html_nodes(".lst-lnk-notice") %>%
      html_attr("href")

    urls_solicitation <-
      href_sol %>%
      str_c("https://www.fbo.gov/index", .)



    page_ids <-
      urls_solicitation %>%
      map_chr(function(url) {
        httr::parse_url(url = url) %>%
          flatten_df() %>%
          pull(id)
      })

    type_solicitation_group <-
      page %>%
      .parse_for_text(css = ".solcc")

    id_solicitation <-
      page %>%
      .parse_for_text(css = ".soln")


    data <-
      seq_along(name_solicitation) %>%
      map_dfr(function(x) {
        name_sol <- name_solicitation[[x]]
        if (name_sol %>% str_detect("\\--")) {
          name_sol <-
            name_sol %>% str_split("\\--") %>% flatten_chr() %>% .[[2]]
        }

        sol_details <-
          type_solicitation_group[[x]] %>% str_split("\\ -- ") %>%
          flatten_chr() %>%
          str_to_upper()
        tibble(
          idRow = x,
          idSolicitationGroup = sol_details[1],
          descriptionSolicitationGroup = sol_details[2],
          nameSolicitation = name_sol
        )
      }) %>%
      mutate(idSolicitation = id_solicitation)

    agency <-
      page %>%
      .parse_for_text(".pagency")

    agency_details <-
      page %>%
      html_nodes(".lst-cl-first+ .lst-cl") %>%
      gsub(pattern = '<.*?>', replacement = "|", .) %>%
      str_trim()

    offices <-
      seq_along(agency) %>%
      map_chr(function(x) {
        actual_name <- agency[[x]]
        actual_details <- agency_details[[x]]
        actual_details %>% str_remove_all(actual_name) %>%
          str_trim() %>%
          str_split("\\|") %>%
          flatten_chr() %>%
          str_trim() %>%
          gsub("\\s+", " ", .) %>%
          discard(function(x) {
            x == ""
          }) %>%
          str_c(collapse = "|")
      })

    data <-
      data %>%
      mutate(detailsAgency = agency_details,
             nameOffice = offices) %>%
      separate(nameOffice,
               into = c("nameOffice", "detailsOffice"),
               extra = "merge",
               sep = "\\|") %>%
      select(-detailsAgency)


    type_setaside <-
      page %>% .parse_for_text(css = ".lst-cl:nth-child(3)")

    posted <-
      page %>% .parse_for_text(css = ".lst-cl:nth-child(4)") %>%
      lubridate::mdy()

    data <-
      data %>%
      mutate(
        nameAgency = agency,
        typeSetAside = type_setaside,
        datePosted = posted,
        idURL = page_ids,
        urlSolicitation = urls_solicitation
      )

    data <-
      data %>%
      mutate(
        isModified = typeSetAside %>% str_detect("(Modified)"),
        isCombinedSolicitation = typeSetAside %>% str_detect("Combined Synopsis/Solicitation"),
        isAward = typeSetAside %>% str_detect("Award"),
        isPreSolicitation = typeSetAside %>% str_detect("Presolicitation"),
        typeSetAside = typeSetAside %>% str_replace_all(
          "Combined Synopsis/Solicitation",
          "Combined Synopsis - Solicitation"
        ) %>% str_replace_all("\\(", "") %>% str_replace_all("\\)", "") %>% str_replace_all("Modified", "")
      ) %>%
      separate(typeSetAside,
                      into = c("statusBid", "typeSetAside"),
               extra = "merge",
                      sep = " / ") %>%
      mutate_if(is.character,
                funs(. %>% str_trim())) %>%
      mutate_if(is.character,
                funs(ifelse(. == "", NA, .))) %>%
      suppressWarnings()

    data <-
      data %>%
      select(-one_of("idRow")) %>%
      dplyr::select(which(colMeans(is.na(.)) < 1)) %>%
      select(idSolicitation, one_of(c(
        "nameAgency", "nameOffice", "detailsOffice"
      )), everything()) %>%
      suppressWarnings()

    data
  }

.parse.opportunity.url <-
  function(url = "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=250") {
    page <-
      url %>%
      read_html()

    data <-
      .parse.result.page(page = page) %>%
      mutate(urlOpportunityPage = url) %>%
      dplyr::select(dplyr::matches("Solicitation"), everything()) %>%
      .munge_data()

    data
  }

.parse.opportunity.urls <-
  function(urls = c(
    "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=25",
    "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=29"
  ),
  sleep_time = NULL,
  return_message = T) {
    df <-
      tibble()

    success <- function(res) {
      url <-
        res$url

      if (return_message) {
        glue::glue("Parsing {url}") %>%
          message()
      }
      .parse.opportunity.url.safe <-
        purrr::possibly(.parse.opportunity.url, tibble())

      all_data <-
        .parse.opportunity.url.safe(url = url)

      if (!sleep_time %>% purrr::is_null()) {
        Sys.sleep(time = sleep_time)
      }


      df <<-
        df %>%
        bind_rows(all_data)
    }
    failure <- function(msg) {
      tibble()
    }

    urls %>%
      map(function(x) {
        curl_fetch_multi(url = x, success, failure)
      })
    multi_run()
    df
  }

.active_urls <-
  function() {
    page <-
      "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID=1" %>%
      read_html()

    active <-
      page %>%
      html_nodes(".lst-head .lst-cnt") %>%
      html_text() %>%
      str_split(" of ") %>%
      flatten_chr() %>%
      .[[2]] %>%
      as.numeric()

    pages <- active %/% 100
    all_pages <- 1:pages
    urls <-
      glue::glue(
        "https://www.fbo.gov/index?s=opportunity&mode=list&tab=list&tabmode=list&pp=100&pageID={all_pages}"
      ) %>%
      as.character()

    tibble(idPage = all_pages,
           urlPage = urls)



  }





# offices -----------------------------------------------------------------


# urlsearch ---------------------------------------------------------------

.generate_url_reference <-
  function() {
    user_agents <-
      c(
        "Mozilla/5.0 (Linux; U; en-US) AppleWebKit/528.5+ (KHTML, like Gecko, Safari/528.5+) Version/4.0 Kindle/3.0 (screen 600x800; rotate)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
        "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36"
      )


    user_agent <-
      user_agents[!user_agents %>% str_detect("bot|slurp")] %>%
      sample(1)

    tl_domain <-
      c('.com', '.gov', '.org', '.mil', '.co') %>%
      sample(1)

    word_length <-
      8:15

    words <-
      word_length %>% sample(1)

    domain_slug <-
      1:words %>%
      map_chr(function(x) {
        sample(letters, 1)
      }) %>%
      paste0(collapse = '')

    url <-
      list('http://', domain_slug, tl_domain) %>%
      purrr::reduce(paste0)
    df <-
      tibble(urlReferer = url,
             userAgent = user_agent)
    return(df)
  }



# contracts ---------------------------------------------------------------

.parse.contract_general <-
  function(page) {
    df_attrs <-
      .get_page_attribute_data(page)

    widgets <-
      df_attrs %>%
      filter(class == "widget") %>%
      pull(id)


    df_widgets <-
      widgets %>%
      map_dfr(function(id) {
        css_id <-
          glue::glue("#{id}") %>% as.character()
        name <-
          id %>% str_replace_all("dnf_class_values_procurement_notice__|__widget", "") %>% str_split("_") %>%
          flatten_chr() %>% discard( ~ .x == "") %>% str_c(collapse = "_")
        text <-
          page %>%
          html_nodes(css = css_id) %>%
          html_text() %>%
          str_trim()

        tibble(id, name, text)

      })

    data <-
      df_widgets %>%
      filter(!name %>% str_detect("packages|additional_info_link")) %>%
      select(-id) %>%
      distinct() %>%
      distinct()

    actual_names <- data$name %>% .resolve_bizopp_ids()

    data <-
      data %>%
      mutate(nameActual = actual_names) %>%
      select(nameActual, text) %>%
      mutate(
        text = if_else(text == "N/A", NA_character_, text),
        text = if_else(text == "-", NA_character_, text)
      ) %>%
      filter(!is.na(text))

    col_order <-
      data$nameActual

    data <-
      data %>%
      distinct() %>%
      spread(nameActual, text) %>%
      select(one_of(col_order))
    data
  }

.parse.contract.url <-
  function(url = "https://www.fbo.gov/index?s=opportunity&mode=form&id=2de499d566906e1f47a49e9ba00d0ea9&tab=core&tabmode=list&=",
           return_message = T) {
    if (return_message) {
      glue::glue("Parsing {url}") %>% message()
    }

    page <-
      url %>%
      read_html()

    name_solicitation <-
      page %>%
      html_nodes("h2") %>%
      html_text() %>%
      str_to_upper() %>%
      str_c(collapse = " ")


    id_solicitiation <-
      page %>%
      html_nodes(".sol-num") %>%
      html_text() %>%
      str_replace_all("Solicitation Number: ", "")

    agency_info <-
      page %>%
      html_nodes(".agency-name") %>%
      str_replace_all("<br>", "\n") %>%
      read_html() %>%
      html_text() %>%
      str_split("\n") %>%
      flatten_chr() %>%
      suppressWarnings()

    df_agency <-
      tibble(agency_info) %>%
      separate(agency_info,
               into = c("item", "value"),
               extra = "merge",
               sep = "\\: ") %>%
      left_join(tibble(
        item = c("Agency", "Office", "Location"),
        nameActual = c("nameAgency", "nameOfficeAgency", "nameLocationAgency")
      )) %>%
      select(nameActual, value) %>%
      spread(nameActual, value) %>%
      suppressMessages()


    data <-
      page %>%
      .parse.contract_general()

    if (name_solicitation %>% str_detect("\\--")) {
      sol_details  <-
        name_solicitation %>% str_split("\\--") %>% flatten_chr()
      idSol <-
        sol_details %>% .[[1]]
      name_sol <-
        sol_details %>% .[[2]]
      df_sol <-
        tibble(
          idSolicitationGroup = idSol,
          descriptionSolicitationGroup = name_sol,
          nameSolicitation = name_solicitation
        )
    } else {
      df_sol <- tibble(nameSolicitation = name_solicitation)
    }

    data <-
      df_agency %>%
      bind_cols(df_sol) %>%
      bind_cols(data) %>%
      mutate(idSolicitation = id_solicitiation) %>%
      select(idSolicitation, nameSolicitation, everything())

    data <-
      data %>%
      munge_data() %>%
      suppressWarnings()

    if (page %>% html_nodes("additional_info_link") %>% length() > 0) {
      names <-
        page %>%
        html_nodes("#dnf_class_values_procurement_notice__additional_info_link__widget a") %>%
        html_text() %>%
        str_trim()

      url <- page %>%
        html_nodes("#dnf_class_values_procurement_notice__additional_info_link__widget a") %>%
        html_attr("href")

      tibble(urlLinkAdditional = url,
             nameLinkAdditional = names)



    }

    notice_history <-
      page %>%
      html_nodes(".notice-hist")

    urls_solicitation <-
      notice_history %>% html_nodes("a") %>% html_attr("href") %>%
      str_c("https://www.fbo.gov", ., sep = "")
    url_length <-
      length(urls_solicitation) + 1

    df_releases <-
      1:url_length  %>%
      map_dfr(function(x) {
        css <-
          glue::glue("li:nth-child({x})") %>% as.character()
        text <-
          notice_history %>% html_nodes(css = css) %>% html_children() %>%
          html_text()
        if (text %>% length() == 0) {
          return(invisible())
        }
        if (text  %>% str_detect("Complete View") %>% sum(na.rm = T)) {
          return(invisible())
        }

        type <-
          text[1]
        start_d <-
          length(text) - 1
        dates <-
          text[start_d:length(text)] %>%
          str_c(collapse = " ") %>%
          lubridate::mdy_hm()


        tibble(
          datetimePosted = dates,
          typePosting = type %>% str_to_upper(),
          numberContractRelease = x
        ) %>%
          select(numberContractRelease, typePosting, everything())

      }) %>%
      mutate(urlSolicitationPrior = urls_solicitation)

    if (df_releases %>% nrow() > 0) {
      data <-
        data %>%
        mutate(dataRevisions = list(df_releases))
    }


    ## files
    file_nodes <-
      page %>% html_nodes(".sb:nth-child(1) .pkg")

    if (file_nodes %>% length() > 0) {
      df_files <-
        seq_along(file_nodes) %>%
        map_dfr(function(x) {
          nameFile <-
            file_nodes[[x]] %>% html_nodes("a") %>% html_text() %>% str_trim() %>% discard( ~ .x == "")
          urlFileZip <-
            file_nodes[[x]] %>% html_nodes("a") %>% html_attr("href") %>%
            keep( ~ .x %>% str_detect("dZip")) %>%
            str_c("https://www.fbo.gov", .)
          dateFile <-
            file_nodes[[x]]  %>% html_text() %>% str_replace_all(nameFile, "") %>% str_trim() %>% lubridate::mdy()
          df <-
            tibble(numberFile = x, nameFile, dateFile, urlFileZip)
          file_nodes[[x]] %>% html_nodes("a") %>% html_attr("target")
          df
        })
      file_n <-
        page %>% html_nodes(".sb:nth-child(1)")


      files <-
        file_n %>%
        html_nodes("a") %>% html_attr('href') %>%
        keep( ~ .x %>% str_detect("/utils/view?")) %>%
        str_c("https://www.fbo.gov", .)


      if (length(files) == nrow(df_files)) {
        df_files <-
          df_files %>%
          left_join(tibble(urlFile = files) %>%
                      mutate(numberFile = 1:n())) %>%
          suppressMessages()
      }

      data <-
        data %>%
        mutate(dataFiles = list(df_files))
    }


    data <-
      data %>%
      mutate(urlSolicitation = url)
    has_logo <-
      page %>%
      html_nodes('.agency-logo img') %>% length() > 0
    if (has_logo) {
      url_logo <-
        page %>%
        html_nodes('.agency-logo img') %>%
        html_attr("src") %>%
        str_c("https://www.fbo.gov/utils/", ., collapse = "")

      data <-
        data %>%
        mutate(urlAgencyLogo = url_logo)
    }

    data
  }

.parse_fbo_solicitation_urls <-
  function(urls = NULL,
           return_message = T,
           sleep_time = NULL) {
    if (urls %>% purrr::is_null()) {
      stop("Please Enter URLS")
    }
    df <-
      tibble()

    success <- function(res) {
      url <-
        res$url

      .parse.contract.url.safe <-
        purrr::possibly(.parse.contract.url, tibble())

      all_data <-
        .parse.contract.url.safe(url = url)

      if (!sleep_time %>% purrr::is_null()) {
        Sys.sleep(time = sleep_time)
      }


      df <<-
        df %>%
        bind_rows(all_data)
    }
    failure <- function(msg) {
      tibble()
    }
    urls %>%
      map(function(x) {
        curl_fetch_multi(url = x, success, failure)
      })
    multi_run()
    df
  }

#' Parse FBO contract url opportunities
#'
#'
#' @param urls a vector of urls
#' @param return_message if \code{TRUE} returns a message
#' @param sleep_time if not \code{NULL} system sleps for specified time
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' parse_fbo_solicitation_urls(urls =c("https://www.fbo.gov/index?s=opportunity&mode=form&id=887c71827400287a1d225950fab458e3&tab=core&_cview=0", "https://www.fbo.gov/index?s=opportunity&mode=form&id=5e77c77f41711c5774dac7e4b40e74be&tab=core&_cview=0"), return_message = T, sleep_time = NULL)

parse_fbo_solicitation_urls  <-
  function(urls = NULL,
           return_message = T,
           sleep_time = NULL) {
    if (length(urls)  == 0) {
      "Enter URLs" %>% message()
      return(invisible())
    }

    .parse_fbo_solicitation_urls_safe <-
      possibly(.parse_fbo_solicitation_urls, tibble())

    data <-
      .parse_fbo_solicitation_urls(urls = urls,
                                   return_message = return_message,
                                   sleep_time = sleep_time)


    char_cols <-
      data %>%
      select_if(is.character) %>%
      select(-matches("url")) %>% names()

    data <-
      data %>%
      mutate_at(char_cols,
                list(function(x) {
                  ifelse(x == "N/A", NA_character_, x) %>% gsub("\\s+", " ", .)
                })) %>%
      .munge_data(clean_address = F)

    if (data %>% tibble::has_name("nameAgency")) {
      dict_logos <- dictionary_agency_logos()

      data <-
        data %>%
        select(-one_of("urlAgencyLogo")) %>%
        left_join(dict_logos, by = "nameAgency")
    }

    d_f_cols <-
      data %>%
      select(matches("data|feature")) %>%
      names()

    if (length(d_f_cols) > 0) {
      df_features <-
        data %>%
        select(one_of("idSolicitation"), matches("data|feature")) %>%
        mutate_at(d_f_cols, list(function(x) {
          x %>% map_dbl(length) > 0
        }))

      new_name <- d_f_cols %>% substr(1, 1) %>% str_to_upper() %>%
        str_c(d_f_cols %>% substr(2, nchar(d_f_cols)), sep = "")
      new_names <- str_c("has", new_name)
      names(df_features)[names(df_features) %in% d_f_cols] <-
        new_names

      data <-
        data %>%
        left_join(df_features, by = "idSolicitation")

    }

    data
  }

#' FBO Active Solicitations
#'
#' Scraped and cached version of all non
#' award solicitations on FBO.gov
#'
#' @return
#' @export
#'
#' @examples
fbo_scraped_active <-
  function() {
    urls <- glue("https://asbcllc.com/r_packages/govtrackR/data/fbo/active_opportunity_cache/all_fbo_active/all_fbo_active_{1:10}.rda")
    .read_rda_safe <- possibly(read_rda, tibble())
    read_rda_m <- memoise::memoise(.read_rda_safe)
    data <-
      urls %>%
      map_dfr(function(url){
        read_rda_m(file = url)
      })

    data
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.