R/navy_sbir.R

Defines functions .parse_navy_url navy_open_topic_sbir_20.4 navy_20.4_selections navy_20.4_qa

Documented in navy_20.4_qa navy_20.4_selections navy_open_topic_sbir_20.4

#' Nay 20.4 QA
#'
#' @return
#' @export
#'
#' @examples
navy_20.4_qa <-
  function() {
	page <- read_html("https://navysbir.com/n20_4/n204-topic-qa.htm")
	tables <- page %>% html_table(fill = T)
	data <- tables[[1]] %>% as_tibble() %>% select(1:4)
	data <- data %>% slice(3:nrow(data)) %>%
	  setNames(c("id_topic", "date", "type", "text_question_answer"))

	data <- data %>%
	  separate(
	    "id_topic",
	    into = c("id_topic", "focus"),
	    sep = "Focus",
	    convert = T,
	    extra = "merge",
	    fill = "right"
	  )

	data <- data %>%
	  filter(!is.na(id_topic), !id_topic %>% str_detect("Topic")) %>%
	  mutate(type = case_when(type == "Q." ~ "question",
	                          TRUE ~ "answer"),
	         date = mdy(date)) %>%
	  separate_rows(text_question_answer, sep = "[1-9].") %>%
	  filter(text_question_answer != "") %>%
	  mutate(text_question_answer = str_squish(text_question_answer))

	data <- data %>%
	  filter(!id_topic %>% str_detect("How to submit a questionThe"))

	data <- data %>%
	  group_by(id_topic, focus, date, type) %>%
	  summarise(text_question_answer = str_c(text_question_answer, collapse = " ")) %>%
	  ungroup() %>%
	  munge_data() %>%
	  mutate(text_question_answer = text_question_answer %>% str_remove_all("\\.|\u001a|\\(|\\)") %>%
	           str_replace_all("\\ , ", "\\ ")) %>%
	  filter(id_topic != "TOP")

	data


}

#' Navy Selections
#'
#' @return
#' @export
#'
#' @examples
navy_20.4_selections <-
  function(join_fpds_data = F) {
    page <- read_html("https://navysbir.info/20_4-selections.html")
    id_topic <- page %>% html_nodes(".normtable td:nth-child(2)") %>% html_text()
    name_awardee <-
      page %>% html_nodes(".normtable td:nth-child(3)") %>% html_text() %>% str_squish()
    location_awardee <-
      page %>% html_nodes(".normtable td:nth-child(4)") %>% html_text() %>% str_squish()

    data <-
      tibble(id_topic, name_awardee, location_awardee) %>%
      filter(id_topic !="") %>%
      munge_data() %>%
      entities::refine_columns(entity_columns = "name_awardee") %>%
      clean_names()

    data <- data %>%
      filter(name_awardee != "TBA")

    if (join_fpds_data) {
      fpds_csv_safe <- possibly(fpds_csv, tibble())
      df_vendor_names <-
        data %>%
        distinct(name_awardee_clean) %>%
        mutate(name_awardee_search = name_awardee_clean %>% str_remove_all("LLC$|INC$|CORPORATION$") %>% str_squish())

      df_vendors <-
        unique(df_vendor_names$name_awardee_search) %>%
        map_dfr(function(x){
          d <- fpds_csv_safe(global_vendor_name = x, snake_names = T)
          if (length(d) == 0) {
            return(tibble())
          }
          d %>%
            mutate(name_awardee_search = x)
        })


      df_vendors <- df_vendors %>%
        group_by(name_awardee_search) %>%
        summarise(
          count_contracts = n_distinct(id_contract_analysis),
          count_actions = n(),
          amount_contracts = sum(amount_obligation, na.rm = T),
          date_first_contract = min(date_obligation),
          date_most_recent_contract = max(date_obligation),
          count_agencies_awards = n_distinct(id_cgac_agency),
          count_offices_awards = n_distinct(id_office_award),
          count_psc_awards = n_distinct(code_product_service)
        ) %>%
        left_join(
          df_vendors %>% count(name_awardee_search, id_duns, wt = amount_obligation, name = "amount") %>%
            group_by(name_awardee_search) %>% filter(amount == max(amount)) %>% slice(1) %>%
            ungroup() %>% select(id_duns, name_awardee_search), by = "name_awardee_search"
        )

      df_vendor_names <- df_vendor_names %>%
        left_join(df_vendors %>% mutate_if(is.numeric,as.numeric),  by = "name_awardee_search") %>%
        munge_data()

      data <-
        data %>% left_join(df_vendor_names, by = "name_awardee_clean") %>%
        select(id_topic, name_awardee, name_awardee_clean, id_duns, everything())

    }


    data
  }

# https://navysbir.com/topics20_4.htm

#' Navy Open Topic SBIRS
#'
#' @return
#' @export
#'
#' @examples
navy_open_topic_sbir_20.4 <- function(include_qa = T,unnest_data = F) {
  if (include_qa) {
    unnest_data <- F
  }
  page <- read_html("https://navysbir.com/topics20_4.htm")
  nodes <- page %>% html_nodes("ul .tnlnk")
  topics <- nodes %>% html_text()
  slugs <- nodes %>% html_attr("href")
  urls <- str_c("https://navysbir.com/",slugs)
  data <-
    tibble(topics, url_topic = urls) %>%
    separate(topics,
             into = c("id_topic", "name_topic"),
             sep = "   ") %>%
    munge_data()



  data <- 1:nrow(data) %>%
    map_dfr(function(x){
      df_row <- data[x,]
      df_text <- .parse_navy_url(url = df_row$url_topic)
      df_row %>%
        mutate(data_text = list(df_text %>% select(-url_topic)))
    })

  if (include_qa) {
    df_qa <- navy_20.4_qa()
    df_qa <- df_qa %>%
      mutate(type = str_c("text_", type, sep = "") %>% str_to_lower()) %>%
      spread(type, text_question_answer) %>%
      group_by(id_topic) %>%
      nest() %>%
      rename(data_question_answer = data)

    data <- data %>%
      left_join(df_qa, by = "id_topic")
  }

  if (unnest_data) {
    data <- data %>%
      unnest()
  }

  data
}

.parse_navy_url <-
  function(url = "https://navysbir.com/n20_4/n204-a01.htm") {
    page <- read_html(url)
    text <- page %>% html_text() %>% str_split("\n") %>% flatten_chr() %>% str_squish() %>%
      discard(function(x) {
        x == ""
      })

    data <- tibble(text) %>%
      mutate(row = 1:n())

    data <- data %>%
      mutate(is_parent = text %>% str_detect("^[A-Z][A-Z]"))

    df_sections <-
      data %>% filter(is_parent) %>%
      separate(text, into = c("section", "text"), sep = "\\:") %>%
      select(section, text_new = text, row)
    remove <- df_sections$section %>% str_c(collapse = "|")

    data <-
      data %>%
      left_join(df_sections, by = "row") %>%
      mutate(text = case_when(is.na(text_new) ~ text,
                              TRUE ~ text_new)) %>%
      select(section, text) %>%
      fill(section) %>%
      filter(!is.na(section)) %>%
      mutate_all(str_squish) %>%
      munge_data() %>%
      group_by(section) %>%
      summarise(text = str_c(text, collapse = " ")) %>%
      filter(!is.na(text))

    links <- page %>% html_nodes("a") %>% html_attr("href")
    links <- links[links %>% str_detect("http")] %>%
      discard(function(x){
        x %>% is.na()
      }) %>% str_c(collapse = " | ")



    if (length(links) >0) {
      data <- data %>%
        bind_rows(tibble(section = "LINKS", text = links))
    }

    data <-
      data %>%
      mutate(url_topic = url)

    data

  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.