R/irs.R

Defines functions irs_990_data .irs_990_file .dl_990_zip .dl_990_dat .dl_990_data_excel .munge_990_names .parse_990_extract_dictionary .parse_irs_url irs_detailed_exempt_entities .parse_irs_charity_detail irs_revoked_entities dictionary_irs_exempt_urls dictionary_irs_data_sets dictionary_irs_names dictionary_irs_deductibility dictionary_irs_affiliations .munge_irs_names .block_text_to_dict .munge_irs

Documented in dictionary_irs_affiliations dictionary_irs_deductibility dictionary_irs_exempt_urls dictionary_irs_names irs_990_data irs_detailed_exempt_entities

.munge_irs <-
  function(data) {
    if (data %>% hasName("code_ein")) {
      data <- data %>%
        mutate(id_ein = as.numeric(code_ein), .before = "code_ein")

      data <- data %>%
        select(-code_ein)
    }

    if (data %>% hasName("code_subsection")) {
      data <- data %>%
        mutate(code_subsection = as.character(code_subsection))
    }

    code_names <- data %>% select(matches("code_")) %>% names()

    if (length(code_names) > 0) {
      data <- data %>%
        mutate_at(code_names, as.character)
    }

    ym_cols <-
      data %>%
      select(matches("year_month")) %>%
      names()

    if (length(ym_cols) > 0) {
      data <-
        data %>%
        mutate_at(ym_cols, list(function(x) {
          case_when(x == 0 ~ NA_real_,
                    TRUE ~ x)
        }))

      data <-
        data %>%
        mutate(
          date_tax_period = glue("{year_month_tax_period}01") %>% ymd() %m+% months(1) - 1,
          .before = "year_month_tax_period"
        )
    }

    logical_cols <-
      data %>% select_if(is.character) %>% select(matches("^is_|^has_")) %>% names()

    if (length(logical_cols) > 0) {
      data <-
        data %>% mutate_at(logical_cols,
                           list(function(x) {
                             case_when(
                               str_to_upper(x) %in% c("Y", "YES") ~ TRUE,
                               str_to_upper(x) %in% c("N", "NO", "F") ~ F,
                               str_to_upper(x) == "TRUE" ~ TRUE,
                               TRUE ~ as.logical(x)
                             )
                           }))
    }


    data
  }

.block_text_to_dict <-
  function(x,
           column_names = c("codesNationalTaxonomyExemptEntities",
                            "namesNationalTaxonomyExemptEntities")) {
    text <-
      x %>% str_split("\n") %>% flatten_chr() %>% str_squish() %>%
      discard(function(x) {
        x == ""
      })
    tibble(text) %>%
      mutate(text = text %>% str_replace("\\ ", "|")) %>%
      separate(
        text,
        into = column_names,
        sep = "\\|",
        extra = 'merge',
        fill = 'right'
      ) %>%
      mutate_if(is.character, list(function(x) {
        x %>% str_to_upper() %>% str_squish()
      }))
  }

.munge_irs_names <-
  function(data) {
    dict_names <- dictionary_irs_names()
    irs_names <-
      names(data)

    actual_names <-
      irs_names %>%
      map_chr(function(x) {
        df_row <-
          dict_names %>%
          filter(nameIRS == str_to_lower(x)) %>%
          distinct() %>%
          slice(1)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {x}") %>% message()
          return(x)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }


#' IRS Affiliation Codes
#'
#' @return
#' @export
#' @family tax-exempt, IRS
#'
#' @examples
dictionary_irs_affiliations <-
  function() {
    tibble(
      idAffiliation = c(1:3, 6:9),
      nameAffiliation = c(
        "CENTRAL",
        "Intermediate",
        "Independent",
        "Central",
        "Intermediate",
        "Central",
        "Subordinate"
      ),
      descriptionAffiliation = c(
        "This code is used if the organization is a central type organization (no group exemption) of a National,
Regional or Geographic grouping of organizations.",
        "This code is used if the organization is an intermediate organization (no group exemption) of a
National, Regional or Geographic grouping of organizations (such as a state headquarters of a national
organization)",
        "This code is used if the organization is an independent organization or an independent auxiliary (i.e., not affiliated with a National, Regional, or Geographic grouping of organizations)",
        "This code is used if the organization is a parent (group ruling) and is not a church or 501(c)(1)
organization." ,
        " This code is used if the organization is a group exemption intermediate organization of a National, Regional or Geographic grouping of organizations",
        "This code is used if the organization is a parent (group ruling) and is a church or 501(c)(1) organization.",
        " This code is used if the organization is a subordinate in a group ruling."
      )
    ) %>% munge_data()
  }

#' IRS Tax Deductiblity Dictionary
#'
#' @return
#' @family tax-deductibility, IRS
#' @export
#'
#' @examples
dictionary_irs_deductibility <-
  function() {
    tibble(
      idDeductibility = c(0, 1:2, 4),
      typeDeductiblity = c(
        "UNKNOWN",
        "DEDUCTIBLE",
        "NON-DEDUCTIBLE",
        "DEDUCTIBLE BY TREATY"
      )
    )
  }

#' IRS Name Dictioanry
#'
#' Dictionary of IRS names against govtrackR
#' schema
#'
#' @return
#' @export
#'
#' @examples
dictionary_irs_names <-
  function() {
    tibble(
      nameIRS = c(
        "ein",
        "name",
        "ico",
        "street",
        "city",
        "state",
        "zip",
        "group",
        "subsection",
        "affiliation",
        "classification",
        "ruling",
        "deductibility",
        "foundation",
        "activity",
        "organization",
        "status",
        "tax_period",
        "asset_cd",
        "income_cd",
        "filing_req_cd",
        "pf_filing_req_cd",
        "acct_pd",
        "asset_amt",
        "income_amt",
        "revenue_amt",
        "ntee_cd",
        "sort_name"
      ),
      nameActual = c(
        "codeEIN",
        "nameOrganization",
        "nameInCareOf",
        "addressStreetOrganization",
        "cityOrganization",
        "stateOrganization",
        "zipcodeOrganization",
        "codeGroupExemption",
        "codeSubsection",
        "idAffiliation",
        "idClassification",
        "yearMonthRuling",
        "idDeductibility",
        "codeFoundation",
        "codesActivity",
        "idOrganization",
        "codeExemptionStatus",
        "yearMonthTaxPeriod",
        "idAssetAmount",
        "idIncomeAmount",
        "codeFilingRequirement",
        "has990PF",
        "codeAccountingPeriod",
        "amountAssets",
        "amountIncome",
        "amountRevenue",
        "codeNationalTaxonomyExemptEntitiesClassification",
        "nameSort"
      )

    )
  }

dictionary_irs_data_sets <-
  function() {
    download_excel_file("https://www.irs.gov/pub/irs-soi/soiprogramdetails.xlsx")
  }

# tax_exempt_entities -----------------------------------------------------

# https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf

#' IRS data dictionary for tax exempt entities
#'
#' \url{https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf}
#'
#' @return \code{tibble()}
#' @export
#' @family tax-exempt, IRS
#'
#' @examples
#' dictionary_irs_exempt_urls()
dictionary_irs_exempt_urls <-
  function() {
    page <-
      read_html(
        "https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf"
      )

    slugs <- page %>% html_nodes(".field--type-text-with-summary a")

    file_names <- html_text(slugs)
    urls <- slugs %>% html_attr('href')
    urls <-
      case_when(!urls %>% str_detect("http") ~ str_c("https://www.irs.gov", urls),
                TRUE ~ urls)

    data <- tibble(nameFile = file_names, urlIRS = urls)

    data <-
      data %>%
      mutate(
        typeFile = case_when(
          urlIRS %>% str_detect("/eo") ~ "Exempt Entity",
          urlIRS %>% str_detect("SIT|sit") ~ "Split Income",
          urlIRS %>% str_detect("soi-tax-stats") ~ "SOI",
          TRUE ~ "Other"
        )
      ) %>%
      mutate(
        regionIRS = case_when(
          urlIRS %>% str_detect("_xx") ~ "international",
          urlIRS %>% str_detect("_pr") ~ "puerto rico",
          urlIRS %>% str_detect("eo1") ~ "northeast",
          urlIRS %>% str_detect("eo2") ~ "midatlantic",
          urlIRS %>% str_detect("eo3") ~ "coasts",
          urlIRS %>% str_detect("eo4") ~ "other"
        ),
        isCSV = urlIRS %>% str_detect(".csv"),
        isPDF = urlIRS %>% str_detect(".pdf")
      ) %>%
      select(typeFile, regionIRS, everything())


    data

  }

#' IRS Tax Exempt entitiy masterfile
#'
#' Returns basic information for IRS tax exempt entities
#'
#' @param clean_entities if \code{TRUE} cleans organization column
#' @param snake_names if \code{TRUE} returns snake case names
#' @family tax-exempt, IRS
#'
#' @return \code{tibble()}
#' @export
#'
#' @examples
#' irs_master_exempt_entities()
irs_master_exempt_entities <-
  memoise::memoise(function(clean_entities = T,
                            snake_names = F) {
    url <-
      "https://apps.irs.gov/pub/epostcard/data-download-pub78.zip"
    outfile <- tempfile("download", fileext = ".zip")
    file <- download.file(url, outfile)
    unz_files <- outfile %>% unzip(exdir = "zip")

    data <-
      unz_files %>% fread(showProgress = FALSE) %>% as_tibble()

    data <-
      data %>%
      setNames(
        c(
          "idEIN",
          "nameOrganization",
          "cityOrganization",
          "stateOrganization",
          "countryOrganization",
          "codeDeductibility"
        )
      )
    data <- data %>%
      mutate(urlIRS = url)
    unz_files %>% unlink()
    file %>% unlink()

    data <-
      data %>%
      mutate(codeDeductibility = codeDeductibility %>%  str_count("\\,") + 1) %>%
      munge_data(parse_dates = F, snake_names = F)

    if (clean_entities) {
      data <-
        data %>%
        entities::refine_columns(entity_columns = "nameOrganization")
    }

    if (snake_names) {
      data <- data %>% clean_names()
    }

    data



  })

irs_revoked_entities <-
  function(clean_entities = T,
           snake_names = F) {
    url <- "https://apps.irs.gov/pub/epostcard/data-download-revocation.zip"

    outfile <- tempfile("download", fileext = ".zip")
    file <- download.file(url, outfile)
    unz_files <- outfile %>% unzip(exdir = "zip")

    data <-
      unz_files %>% fread(showProgress = FALSE) %>% as_tibble()
  }

.parse_irs_charity_detail <-
  function(url = "https://www.irs.gov/pub/irs-soi/eo_xx.csv",
           return_message = T) {
    if (return_message) {
      glue("Parsing {url}") %>% message()
    }
    data <-
      vroom(url)

    names(data) <- names(data) %>% str_to_lower()
    data <-
      data %>%
      .munge_irs_names()

    data <-
      data %>%
      mutate(urlIRS = url)

    data <- data %>%
      mutate(
        classificationNTEE = case_when(
          nchar(codeNationalTaxonomyExemptEntitiesClassification) == 4 ~ substr(codeNationalTaxonomyExemptEntitiesClassification, 4, 4)
        ),
        codeNTEE = case_when(
          !is.na(codeNationalTaxonomyExemptEntitiesClassification) ~ substr(codeNationalTaxonomyExemptEntitiesClassification, 1, 3),
          TRUE ~ NA_character_
        )
      ) %>%
      select(-codeNationalTaxonomyExemptEntitiesClassification)

    ym_cols <-
      data %>%
      select_if(is.character) %>%
      select(matches("yearMonth")) %>%
      names()

    if (length(ym_cols) > 0) {
      data <- data %>%
        mutate_at(ym_cols, readr::parse_number)
    }


    ym_cols <-
      data %>%
      select(matches("yearMonth")) %>%
      names()

    if (length(ym_cols) > 0) {
      data <-
        data %>%
        mutate_at(ym_cols, list(function(x) {
          case_when(x == 0 ~ NA_real_,
                    TRUE ~ x)
        }))

      data <- data %>%
        mutate(
          dateRuling = glue("{yearMonthRuling}01") %>% ymd() %m+% months(1) - 1,
          dateTaxPeriod = glue("{yearMonthTaxPeriod}01") %>% ymd() %m+% months(1) - 1
        )
    }

    data <-
      data %>%
      mutate(has990PF = as.logical(has990PF))



    data <-
      data %>%
      mutate(zipcodeOrganization = case_when(
        zipcodeOrganization == "00000-0000" ~ NA_character_,
        TRUE ~ zipcodeOrganization
      ))

    data


  }


#' Detailed IRS Exempt Entity Data
#'
#' Data on all regsitered non taxable
#' corporate entities
#'
#'
#' @param regions if `NULL` returns data on all regions otherwise \itemize{
#' \item international
#' \item puerto rico
#' \item northeast
#' \item coasts
#' \item other
#' }
#' @param clean_entities if `TRUE` cleans entitiy columns
#' @param snake_names if `TRUE` returns snake case names
#' @param join_locations if `TRUE` returns full location address
#' @param return_message if `TRUE` returns a message
#'
#' @return \code{tibble}
#' \url{https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf}
#' @family tax-exempt
#' @export
#'
#' @examples
irs_detailed_exempt_entities <-
  function(regions = NULL,
           clean_entities = F,
           snake_names = T,
           join_locations = T,
           return_message = T) {

    tbl_regions <-
      dictionary_irs_exempt_urls() %>%
      filter(typeFile %>% str_detect("Exempt Ent")) %>%
      filter(isCSV)

    if (length(regions) != 0) {
      slugs <- regions %>% str_to_lower() %>% str_c(collapse = "|")
      tbl_regions <-
        tbl_regions %>%
        filter(regionIRS %>% str_detect(slugs))
    }
    .parse_irs_charity_detail_safe <-
      possibly(.parse_irs_charity_detail, tibble())

    data <-
      tbl_regions$urlIRS %>%
      map_dfr(function(x) {
        data <-
          .parse_irs_charity_detail_safe(url = x, return_message = return_message)
        if (data %>% hasName("idClassification"))  {
          data <- data %>%
            mutate(idClassification = as.numeric(idClassification))
        }
        data
      })

    data <-
      data %>%
      left_join(tbl_regions %>% select(regionIRS, urlIRS),
                by = "urlIRS")

    data <-
      data %>%
      mutate(
        idEIN = as.numeric(codeEIN),
        hasCareOf = !is.na(nameInCareOf),
        nameInCareOf = nameInCareOf %>% str_remove_all("^% ") %>%
          str_remove_all("C/O") %>% str_squish()
      ) %>%
      separate(
        zipcodeOrganization,
        into = c("zipcodeOrganization", "zip4Organization"),
        sep = "\\-",
        extra = "merge",
        fill = "right"
      ) %>%
      select(idEIN, everything())

    zip_cols <- data %>% select(matches("^zip")) %>% names()
    data <-
      data %>% mutate_at(zip_cols,
                         list(function(x) {
                           case_when(x == "00000" ~ NA_character_,
                                     x == "0000" ~ NA_character_,
                                     TRUE ~ x)
                         }))

    data <-
      data %>%
      left_join(dictionary_irs_ntee_codes, by = "codeNTEE") %>%
      left_join(dictionary_irs_status_exemptions, by = "codeExemptionStatus") %>%
      left_join(dictionary_irs_filing_requirements, by = "codeFilingRequirement") %>%
      left_join(dictionary_irs_asset_codes, by = "idAssetAmount")

    data <-
      data %>%
      left_join(dictionary_irs_affiliations(), by = "idAffiliation")

    data <-
      data %>%
      mutate(id = 1:n()) %>%
      left_join(dictionary_irs_foundation_types, by = "codeFoundation") %>%
      distinct() %>%
      group_by(id) %>%
      slice(1) %>%
      ungroup()



    if (data %>% hasName("codesActivity")) {
      df_codes <-
        data %>%
        select(id, codesActivity) %>%
        gather(typeActivity, codeActivity, -id) %>%
        mutate(
          codeActivity01 = codeActivity %>% substr(1, 3),
          codeActivity02 = codeActivity %>% substr(4, 6),
          codeActivity03 = codeActivity %>% substr(7, 9)
        ) %>%
        select(-c(codeActivity, typeActivity)) %>%
        gather(variable, codeActivity, -id) %>%
        filter(codeActivity != "000") %>%
        mutate(variable = "codeActivity") %>%
        group_by(id) %>%
        mutate(number = 1:n()) %>%
        ungroup() %>%
        mutate(number = number - 1) %>%
        select(-variable) %>%
        left_join(dictionary_irs_activity_codes, by = "codeActivity")

      df_primary_codes <-
        df_codes %>% group_by(id) %>% filter(number == min(number)) %>% ungroup() %>%
        rename(
          codeActivityPrimary = codeActivity,
          nameActivityParentPrimary = nameActivityParent,
          nameActivityPrimary = nameActivity
        ) %>%
        select(-number)

      df_codes <-
        df_codes %>%
        group_by(id) %>%
        nest() %>%
        rename(dataActivities = data) %>%
        ungroup() %>%
        mutate(countActivities = dataActivities %>% map_dbl(nrow)) %>%
        left_join(df_primary_codes, by = "id") %>%
        select(-dataActivities, everything())

      data <-
        data %>%
        select(-codesActivity) %>%
        left_join(df_codes, by = "id")

      rm(df_codes)

    }

    if (join_locations) {
      data <-
        data %>%
        build_address()
    }

    tbl_regions <-
      tbl_regions %>%
      mutate(regionIRS = str_to_upper(regionIRS)) %>%
      select(regionIRS, urlIRS)

    data <-
      data %>%
      select(-one_of("regionIRS")) %>%
      left_join(tbl_regions, by = "urlIRS") %>%
      select(regionIRS, everything())

    data <- data %>%
      mutate(
        hasDBA = nameSort %>% str_detect("DBA "),
        nameSort = nameSort %>% str_remove_all("^DBA ")
      )

    data <-
      data %>%
      select(-codeEIN) %>%
      group_by(idEIN) %>%
      slice(1) %>%
      ungroup() %>%
      select(-id)

    data <- data %>%
      arrange(desc(amountAssets))

    if (snake_names) {
      data <- data %>% clean_names()
    }

    data

  }


# SOI ---------------------------------------------------------------------

## https://www.irs.gov/statistics/soi-tax-stats-integrated-business-data

# https://www.irs.gov/statistics/soi-tax-stats-upcoming-data-releases
# soi 501c3 ---------------------------------------------------------------

# https://www.irs.gov/statistics/soi-tax-stats-annual-extract-of-tax-exempt-organization-financial-data


.parse_irs_url <-
  function(url, url_name = "url_irs_file") {
    parts <- url %>% str_split("/") %>% flatten_chr()
    slug_file <- parts[[length(parts)]]
    year_slug <- slug_file %>% substr(1,2)
    file_parts <- slug_file %>% str_split("\\.") %>% flatten_chr()
    format_file <- file_parts[[2]]
    name_file <- file_parts[[1]]

    slug_file <- name_file %>% substr(3, nchar(name_file))

    year_file <-
      glue("20{year_slug}") %>% as.numeric()

    tibble(UQ(url_name) := url, name_file, year_file, slug_file, format_file)

  }

.parse_990_extract_dictionary <-
  function(url = "https://www.irs.gov/pub/irs-soi/19eofinextractdoc.xlsx") {
    data <- .download_excel_file(url = url, has_col_names = T)

    data <-
      data %>%
      slice(3:nrow(data)) %>%
      select(1:3) %>%
      setNames(c("name_irs", "description", "location"))

    data <-
      data %>%
      mutate(is_logical = description %>% str_detect("\\?"))

    if (data %>% hasName("location")) {
      data <- data %>%
        mutate(is_logical = case_when(location %>% str_detect(" IV") ~ T,
                                      TRUE ~ is_logical))
    }

    data <-
      data %>%
      mutate(
        name_irs = name_irs %>% str_to_lower(locale = "en"),
        description = str_to_lower(description, locale = "en"),
        name_actual = case_when(
          name_irs == "ein" ~ "code_ein",
          name_irs == "elf" ~ "type_filer",
          name_irs == "miscrev11acd" ~ "code_miscrev11a",
          name_irs == "miscrev11bcd" ~ "code_miscrev11bcd",
          name_irs == "nonpfrea" ~ "code_non_profit_type",
          name_irs == "tax_pd" ~ "year_month_tax_period",
          name_irs == "subseccd" ~ "code_subsection",
          name_irs %>% str_detect("cnt$") ~ glue("count_{name_irs}") %>% as.character(),
          is_logical ~ glue("has_{name_irs}") %>% as.character(),
          description %>% str_detect("service revenue code|qualified health plan in multiple|other revenue code 11c") ~ glue("code_{name_irs}") %>% as.character(),
          description %>% str_detect("filed form|qualified health plan in multiple states
|payments for indoor tanning") ~ glue("has_{name_irs}") %>% as.character(),
          description %>% str_detect("service revenue amount") ~ glue("amount_{name_irs}") %>% as.character(),
          TRUE ~ glue("amount_{name_irs}") %>% as.character()
        )
      )


    data <- data %>%
      mutate(url_irs_file  = url)


    data
  }

#' IRS 990 name dictionary
#'
#' @return tibble
#' @export
#'
#' @examples
dictionary_irs_990_extract_names <-
  memoise::memoise(function(filter_years = NULL) {
    dict <- dictionary_irs_990_extract_urls()
    years <- dict %>% distinct(year_file) %>% pull()
    df <- dict %>% filter(type_file == "dictionary")
    data <-
      df$url_irs_file %>%
      map_dfr(function(url){
        url %>% message()
        .parse_990_extract_dictionary(url = url)
      })

    data <-
      data %>%
      left_join(df %>%
      select(url_irs_file, year_file), by = "url_irs_file") %>%
      select(year_file, everything())

    if (length(filter_years) > 0) {
      data <- data %>%
        filter(year_file %in% filter_years)
    }

    data
  })

#' IRS 990 extract data
#'
#' Returns links for IRS 990 and
#' related form summary of selected extracts
#'
#' @return tibble
#' \url{https://www.irs.gov/statistics/soi-tax-stats-annual-extract-of-tax-exempt-organization-financial-data}
#' @export
#' @family tax-exempt, IRS, dictionary
#'
#' @examples
#' dictionary_irs_990_extracts()
dictionary_irs_990_extract_urls <-
  memoise::memoise(function() {
    page <- read_html("https://www.irs.gov/statistics/soi-tax-stats-annual-extract-of-tax-exempt-organization-financial-data")
    nodes <- page %>% html_nodes(".text-align-center a")
    files <- nodes %>% html_text()
    urls <- nodes %>% html_attr("href")
    urls <- urls %>% str_replace_all("\\.dat.dat", "\\.dat")

    data <-
      urls %>%
      map_dfr(function(url){
        .parse_irs_url(url = url)
      }) %>%
      mutate(name_file = files) %>%
      mutate(is_excel = format_file %>% str_detect("excel")) %>%
      select(name_file, everything())

    data <-
      data %>%
      mutate(type_file = case_when(
        slug_file %>% str_detect("eofinextractdoc") ~ "dictionary",
        slug_file %>% str_detect("ez") ~ "990EZ",
        name_file %>% str_detect("-EZ") ~ "990EZ",
        slug_file %>% str_detect("990pf") ~ "990PF",
        TRUE ~ "990"
      )) %>%
      select(year_file, type_file, everything())

    data
  })

.munge_990_names <-
  function(data, file_year = 2019) {
    irs_names <- names(data) %>% str_to_lower()
    tbl_names <- tibble(name_irs = irs_names)
    dict_names <- dictionary_irs_990_extract_names()
    df_names <- dict_names %>% filter(year_file == file_year)

    actual_names <-
      irs_names %>%
      map_chr(function(x){
        x <- str_to_lower(x)
        df_n <-
          df_names %>% filter(name_irs == x)

        if (nrow(df_n) == 0) {
          is_amt <- data %>% select(x) %>% select_if(is.numeric) %>% ncol() == 1
          if (is_amt) {
            x <- glue("{x}") %>% as.character()
          }
          return(as.character(x))
        }
        as.character(df_n$name_actual)
      })

    data %>%
      setNames(actual_names)
  }

.dl_990_data_excel <-
  function(url = "https://www.irs.gov/pub/irs-soi/19eoextract990.xlsx", use_col_names = T) {
    df_metadata  <-
      .parse_irs_url(url = url)

    data <-
      .download_excel_file(url = url, has_col_names = use_col_names)
    names(data) <-
      str_to_lower(names(data))

    data <- data %>%
      .munge_990_names(file_year = df_metadata$year_file)

    data <-
      data %>%
      .munge_irs() %>%
      mutate(url_irs_file = url)

    data <-
      data %>%
      left_join(df_metadata, by = "url_irs_file") %>%
      select(year_file, everything())

    data

  }

.dl_990_dat <-
  function(url = "https://www.irs.gov/pub/irs-soi/15eofinextractEZ.dat") {
    df_metadata  <-
      .parse_irs_url(url = url)

    data <- vroom(url)

    names(data) <-
      str_to_lower(names(data))

    data <-
      data %>%
      .munge_990_names(file_year = df_metadata$year_file)

    data <-
      data %>%
      .munge_irs() %>%
      mutate(url_irs_file = url)

    data <-
      data %>%
      left_join(df_metadata, by = "url_irs_file") %>%
      select(year_file, everything())

    data
  }

.dl_990_zip <-
  function(url) {
    df_metadata  <-
      .parse_irs_url(url = url)
    outfile <- tempfile("download", fileext = ".zip")
    file <- download.file(url, outfile)
    unz_files <- outfile %>% unzip(exdir = "zip")

    data <-
      unz_files %>%
      vroom()

    names(data) <-
      str_to_lower(names(data))

    data <-
      data %>%
      .munge_990_names(file_year = df_metadata$year_file)

    data <-
      data %>%
      .munge_irs() %>%
      mutate(url_irs_file = url)

    data <-
      data %>%
      left_join(df_metadata, by = "url_irs_file") %>%
      select(year_file, everything())
    unz_files %>% unlink()
    file %>% unlink()
    unlink("zip", recursive = T, force = T)

    data
  }

.irs_990_file <-
  function(year = 2018, type = "990") {
    slug_type <- str_to_upper(type)
    if (!slug_type %in% c("990", "990PF", "990EZ")) {
      stop("Type can only be 990, 990PF or 990EZ")
    }
    dict_urls <-
      dictionary_irs_990_extract_urls()

    df_url <-
      dict_urls %>%
      filter(year_file == year) %>%
      filter(type_file == slug_type)
    format_file <- df_url$format_file %>% unique()
    url <- df_url$url_irs_file
    if (format_file %>% str_detect("xls")) {
      data <- .dl_990_data_excel(url = url )
    }

    if (format_file %>% str_detect("zip")) {
      data <- .dl_990_zip(url = url)
    }

    if (format_file %>% str_detect("dat")) {
      data <- .dl_990_dat(url = url)
    }

    data <-
      data %>%
      mutate(type_file = slug_type) %>%
      select(type_file, everything())

    data
  }

#' IRS Tax-Exempt data
#'
#' Acquires and parses information about
#' non-taxable entities
#'
#' @param years vector of years
#' @param types Type of data dump \itemize{
#' \item `990` - Form 990
#' \item `990EZ` - 990 E-Z filing data
#' \item `990PF` - 990 PF data
#' }
#' @param join_ein_data if \code{TRUE} returns EIN data
#'
#' @return `tibble`
#' @export
#'
#' @examples
irs_990_data <-
  function(years = 2019,
           types = "990",
           join_ein_data = F) {
    df_inputs <-
      expand.grid(year = years,
                  type = types,
                  stringsAsFactors = F) %>% as_tibble()
    dict_names <- dictionary_irs_990_extract_names()
    .irs_990_file_safe <- possibly(.irs_990_file, tibble())
    all_data <-
      1:nrow(df_inputs) %>%
      map_dfr(function(x) {
        df_row <- df_inputs[x, ]

        data <-
          .irs_990_file_safe(year = df_row$year, type = df_row$type)
        data
      })

    if (join_ein_data) {
      eins <- all_data %>% distinct(id_ein) %>% pull()
      tbl_entities <-
        irs_detailed_exempt_entities(
          snake_names = T,
          regions = NULL,
          join_locations = T,
          clean_entities = T
        )
      tbl_entities <-
        tbl_entities %>%
        filter(id_ein %in% eins)

      all_data <- all_data %>%
        left_join(tbl_entities %>% select(-matches("data")) %>%
                    select(-one_of(
                      c(
                        "date_tax_period",
                        "year_month_tax_period",
                        "code_subsection"
                      )
                    )), by = "id_ein")
    }

    all_data
  }

# xml ---------------------------------------------------------------------

# https://www.irs.gov/statistics/soi-tax-stats-domestic-private-foundation-and-charitable-trust-statistics#4
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.