irs990lookup: IRS 990 Lookup

#' Lookup IRS 990 Data Using EINs
#'
#' @param x user-supplied csv file via the file.choose() function
#'
#' @return csv file with data from IRS 990 xml files corresponding with user-supplied EINs
#'
#' @author Michael Pawlus, \email{michael.pawlus@@gmail.com}
#'
#' @examples
#' # simple run the following function call and select a csv file with one column of EINs
#' # please include a header row such as "eins"
#' get_990_data()
#'
#' @export

get_990_data  <- function(x) {

  # prompt user to choose a file
  x <- file.choose()

  # check if the file is a csv file
  if(!stringr::str_detect(x,".csv$")){

    # if not, stop with the following error message
    stop('Cannot read uploaded file. The uploaded file must be in csv format.')

    # if it is a csv file then continue
  } else {

    # possibly include a message here that only the first column will be used

    # read in the file
    eins <- readr::read_csv(x)

    # select just the first column
    eins <- eins %>% dplyr::select(1)

  }

  # if that first column is not all integers...
  if(!rlang::is_integer(eins[[1]])) {

    # Stop with error message below
    stop('Only the first column is used and must contain only EIN numbers and a header row')

  } else {

    # read in the JSON index for all 990 filings for tax year 2017
    index <- jsonlite::fromJSON("https://s3.amazonaws.com/irs-form-990/index_2017.json")[[1]]

    # convert the TaxPeriod column to data type: integer
    index$TaxPeriod <- as.integer(index$TaxPeriod)

    # keep on the most recent filing as there are some duplicates
    index <- index %>%
      dplyr::arrange(desc(TaxPeriod)) %>%
      dplyr::distinct(EIN, .keep_all = TRUE)

    # read in the csv file uploaded by the user
    eins <- readr::read_csv(x)

    # pad all eins with leading zeros so that all have a length of 9
    eins <- purrr::map_df(eins, ~stringr::str_pad(.x, 9, pad = "0"))

    # match up EINs from csv with those from the JSON index
    found_ids <- eins %>%
      dplyr::left_join(index)

    # extract the urls from the index for all matching cases
    urls <- found_ids$URL

    # function to get name and amount given out in contributions for each url
    get_name_and_grants <- function(url){
      # read in the xml file for each url
      xf <- xml2::read_xml( x=url, options=NULL )
      # strip out the name spaces (I'm not sure what this means exactly but these have to be removed)
      xml2::xml_ns_strip( xf )
      # follow the xml path and extract value as text
      name <- xf %>% xml2::xml_find_all("//Return/ReturnHeader/Filer/BusinessName/BusinessNameLine1Txt") %>% xml2::xml_text()
      # follow the xml path and extract the value as integer
      fy_gifts <- xf %>% xml2::xml_find_all(".//CYGrantsAndSimilarPaidAmt") %>% xml2::xml_integer()
      # create a tibble with these two values
      dplyr::tibble(org_name = name,
             grants = fy_gifts)
    }

    # create a list of foundations based on matches as well as an error list using purrr::safely()
    found_list <- urls %>%
      purrr::map(purrr::safely(get_name_and_grants)) %>%
      purrr::transpose()

    # create a logical vector where the error list is null to get only successful matches
    is_ok <- found_list$error %>%
      purrr::map_lgl(rlang::is_null)

    # create the results table by keeping only succesful matches with org names
    res <- found_list$result[is_ok] %>%
      purrr::keep(~ length(.x$org_name) > 0) %>%
      {
        dplyr::tibble(
          `Foundation Name` = purrr::map_chr(., "org_name"),
          `Current FY Giving` = purrr::map_int(., "grants")
        )
      }

    # let the user know that the a portion of their data is printed in the console
    cat("Below are the first ten rows of your output file\n ")

    # print the top ten rows
    print(head(res, n=10))

    # write the file to a csv (possibly make naming the file a user option in the future)
    readr::write_csv(res,"output_990_data.csv")

    # put name of csv in a file to prep for making this a user option
    output_file_name <- "output_990_data.csv"

    # print message the entire file has been saved as: output_file_name
    cat(paste0("Output file saved as: ",output_file_name))

  }

}