R/sam_ocr_tools.R

Defines functions extract_sam_attachment_opportunity_urls .extract_sam_attachment_opportunity

Documented in extract_sam_attachment_opportunity_urls

.extract_sam_attachment_opportunity <-
  function(url = "https://beta.sam.gov/api/prod/opps/v3/opportunities/5148f266294f4bb09d8f50fc4c7c8fa7/resources/download/zip",
           return_message = F) {
    if (return_message) {
      glue("Extracting attachment data from {url}") %>% message()
    }
    outfile <- tempfile("download", fileext = ".zip")

    # file <- curl::curl_download(url, outfile)
    file <- download.file(url, outfile)
    unz_files <- outfile %>% unzip(exdir = "attachments")
    data <-
      unz_files %>%
      map_dfr(function(x) {
        is_pdf <- x %>% str_detect(".pdf")
        if (is_pdf) {
          df_metadata <- pdftools::pdf_info(pdf = x) %>% flatten_df()
          df_metadata <- df_metadata %>%
            mutate_if(is.character,
                      list(function(x){
                        case_when(x == "" ~ NA_character_,
                                  TRUE ~ x)
                      })) %>%
            .remove_na()
          df_metadata <-
            .munge_data_sam_names(df_metadata)


          date_cols <- df_metadata %>% select(matches("date")) %>% names()

          if (length(date_cols) >0) {
            df_metadata <-
              df_metadata %>%
              mutate_at(date_cols,
                        list(function(x) {
                          as.POSIXct(x, origin = "1970-01-01", tz = "UTC") %>% as.Date()
                        }))
          }
          text <- pdftools::pdf_text(x)
          pdfText <- text %>% str_split("\n") %>%
            flatten_chr() %>%
            str_squish() %>%
            discard(function(x) {
              x == ""
            }) %>%
            discard(function(x) {
              x %>% str_detect("^Page")
            }) %>%
            str_to_upper() %>%
            str_c(collapse = " ")
          df_metadata <-
            df_metadata %>%
            mutate(pdfText)
          df_metadata %>%
            .munge_data()
        }
      }) %>%
      mutate_if(is.numeric, as.numeric)

    unz_files %>% unlink()
    file %>% unlink()
    unlink("attachments", recursive = T, force = T)
    gc()

    data <-
      data %>%
      mutate(urlOpportunityAttachmentZIP = url)

    data

  }

#' OCR SAM Opportunities
#'
#' @param urls vector of URLS
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
extract_sam_attachment_opportunity_urls <-
  function(urls = NULL, return_message = F, snake_names = T) {

    if (length(urls) == 0) {
      stop("Need SAM opportunity URLS")
    }
    .extract_sam_attachment_opportunity_safe <-
      possibly(.extract_sam_attachment_opportunity, tibble())

    data <-
      unique(urls) %>%
      future_map_dfr(function(url) {
        .extract_sam_attachment_opportunity_safe(url = url, return_message = return_message)
      })

    if (snake_names) {
      data <- data %>% janitor::clean_names()
    }

    data
  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.