R/dod.R

Defines functions .dod_budget_year_urls dictionary_dod_budget_codes .munge_dod_names dictionary_dod_budget_names .add_cgac

Documented in dictionary_dod_budget_codes dictionary_dod_budget_names

.add_cgac <-
  function(data) {
    if (!data %>% hasName("slugTreasuryAgency")) {
      return(data)
    }

    df_cgac <- data %>%
      distinct(slugTreasuryAgency) %>%
      mutate(
        nameAgencyCGAC = case_when(
          slugTreasuryAgency == "A" ~ "DEPARTMENT OF THE ARMY",
          slugTreasuryAgency == "N" ~ "DEPARTMENT OF THE NAVY",
          slugTreasuryAgency == "F" ~ "DEPARTMENT OF THE AIR FORCE",
          slugTreasuryAgency == "D" ~ "DEPARTMENT OF DEFENSE",
        ),
        idCGAC = case_when(
          slugTreasuryAgency == "A" ~ 21,
          slugTreasuryAgency == "N" ~ 17,
          slugTreasuryAgency == "F" ~ 57,
          slugTreasuryAgency == "D" ~ 97,
        )
      )

    data <- data %>%
      left_join(df_cgac, by = "slugTreasuryAgency") %>%
      select(yearBudget, names(df_cgac), everything())

    data
  }



# budget_urls -------------------------------------------------------------

#' DOD Budget name dictionaryd
#'
#' @return
#' @export
#'
#' @examples
dictionary_dod_budget_names <-
  function() {
    tibble(
      nameDOD = c(
        "idRow",
        "Account",
        "Account Title",
        "Organization",
        "Budget Activity",
        "Budget Activity Title",
        "Budget Sub Activity",
        "Budget Sub Activity Title",
        "Add/ Non-Add",
        "Include in TOA",
        "Classification",
        "item",
        "value",
        "AG / BSA",
        "AG / Budget SubActivity (BSA) Title",
        "Line Number",
        "SAG / BLI",
        "SAG / Budget Line Item (BLI) Title",
        "BSA",
        "Budget Sub Activity (BSA) Title",
        "Line Item",
        "Line Item Title",
        "Cost Type",
        "Cost Type Title",
        "PE / BLI",
        "Program Element / Budget Line Item (BLI) Title",
        "Treasury Agency",
        "Account Short Title",
        "Budget Activity Short Title",
        "State Country",
        "State Country Title",
        "Fiscal Year",
        "Facility Category Title",
        "Location Title",
        "Construction Project",
        "Construction Project Title",
        "Appropriation",
        "BSA Title",
        "Mil Dept DW",
        "AG Title",
        "SAG", "SAG Title", "PE", "Program Element Title",
        "PL Title Name",
        "Location",
        "PE Title",
        "Organization Title",
        "Appn",
        "Comp",
        "Appn Name",
        "Org",
        "Org Name",
        "Line Num",
        "BA",
        "BA Name",
        "PE Name",
        "Sec",
        "TreasuryCode",
        "Treasury",
        "TreasuryAccount Title",
        "R1 LineNumber",
        "BudgetActivity",
        "Budget",
        "ProgramElement",
        "Program",
        "SecurityClass."
      ),
      nameActual =
        c(
          "idRow",
          "codeAccountOMB",
          "nameAccountOMB",
          "slugOrganization",
          "slugBudgetParent",
          "nameBudgetParent",
          "slugBudgetActivity",
          "nameBudgetActivity",
          "isAdded",
          "isTOA",
          "slugClassification",
          "item",
          "value",
          "slugBudgetActivity",
          "nameBudgetActivity",
          "idLineNumber",
          "codeProgramElement",
          "nameProgramElement",
          "slugBudgetActivity",
          "nameBudgetActivity",
          "codeProgramElement",
          "nameProgramElement",
          "slugCost",
          "typeCost",
          "codeProgramElement",
          "nameProgramElement",
          "slugTreasuryAgency",
          "nameAccountOMB",
          "nameBudgetParent",
          "slugStateCountry",
          "nameStateCountry",
          "yearBudget",
          "nameBudgetActivity",
          "nameLocation",
          "nameConstructionProject",
          "nameProgramElement",
          "nameAccountOMB",
          "nameBudgetActivity",
          "remove",
          "nameBudgetActivity",
          "codeProgramElement", "nameProgramElement", "codeProgramElement", "nameProgramElement",
          "nameDODBudgetGroup",
          "idLocation",
          "nameProgramElement",
          "nameOrganization",
          "codeAccountOMB",
          "slugTreasuryAgency",
          "nameAccountOMB",
          "slugOrganization",
          "nameOrganization",
          "idLineNumber",
          "slugBudgetParent",
          "nameBudgetParent",
          "nameProgramElement",
          "slugClassification",

          "codeAccountOMB",
          "slugTreasuryAgency",
          "nameAccountOMB",
          "idLineNumber",
          "slugBudgetActivity",
          "nameBudgetActivity",
          "codeProgramElement",
          "nameProgramElement",
          "slugClassification"
        )
    )
  }

.munge_dod_names <-
  function(data) {
    dict_names <- dictionary_dod_budget_names()
    fdps_names <-
      names(data)

    actual_names <-
      fdps_names %>%
      map_chr(function(name) {
        df_row <-
          dict_names %>% filter(nameDOD == name)
        if (nrow(df_row) == 0) {
          glue::glue("Missing {name}") %>% message()
          return(name)
        }

        df_row$nameActual
      })

    data %>%
      set_names(actual_names)
  }

#' DOD Budgets Codes
#'
#' @return
#' @export
#'
#' @examples
dictionary_dod_budget_codes <-
  function() {
    tibble(
      slugDODBudgetGroup = c("m1", "o1", "rf1", "p1", "p1r", "r1", "c1", "OGSI") %>% str_to_upper(),
      nameDODBudgetGroup = c(
        "Military Personnel",
        "Operation and Maintenance",
        "Revolving and Management Fund",
        "Procurement",
        "Procurement Reserve",
        "RDTE",
        "Military Construction",
        "Opportunity, Growth, and Security Initiative"
      )
    )
  }

.dod_budget_year_urls <-
  function() {
    page <-
      "https://comptroller.defense.gov/Budget-Materials/" %>% read_html()
    tibble(
      yearBudget = page %>% html_nodes("#sitetitle p a") %>% html_text() %>% as.numeric(),
      urlBudgetDODYear = page %>% html_nodes("#sitetitle p a") %>% html_attr("href") %>% str_c("https://comptroller.defense.gov", .)
    ) %>%
      mutate(isBudgetCurrentYear = yearBudget == max(yearBudget))
  }

.parse_dod_buget_url <-
  function(url = "https://comptroller.defense.gov/Budget-Materials/Budget2020/",
           include_all_page_files = F,
           return_message = T) {
    if (return_message) {
      glue("Parsing {url} for budget files") %>% message()
    }
    page <- url %>% read_html()
    excel_files <-
      page %>% html_nodes("a") %>% html_attr("href") %>% discard(function(x) {
        is.na(x)
      }) %>%
      keep(function(x) {
        x %>% str_detect("xls")
      }) %>%
      str_c("https://comptroller.defense.gov", .) %>%
      discard(function(x) {
        x %>% str_detect("display")
      })


    parts <-
      excel_files %>%
      str_split("/") %>% flatten_chr() %>% keep(function(x) {
        x %>% str_detect("xls")
      }) %>%
      str_split("\\.")

    slugDODBudgetGroup <-
      seq_along(parts) %>%
      map_chr(function(x) {
        z <- parts[[x]][[1]] %>% str_to_upper()
        z <-
          z %>% str_remove_all("FY_")
        if (z %>% str_count("\\_") == 1) {
          z <- z %>% str_split("\\_") %>% flatten_chr() %>% .[[2]]
        }
        z
      })


    parts <-
      url %>% str_split("/") %>% flatten_chr() %>%
      discard(function(x) {
        x == ""
      })

    yearBudget <-
      parts[length(parts)] %>% str_remove_all("\\.aspx") %>% str_remove_all("budget") %>%
      parse_number()


    data <-
      tibble(
        yearBudget,
        slugDODBudgetGroup,
        urlBudgetDODFile = excel_files,
        urlBudgetDODYear = url
      )

    data <-
      data %>%
      mutate(isAmendedBudget = slugDODBudgetGroup %>% endsWith("A")) %>%
      select(yearBudget, isAmendedBudget, everything()) %>%
      mutate(
        slugDODBudgetGroup = case_when(
          isAmendedBudget ~ slugDODBudgetGroup %>% substr(1, nchar(slugDODBudgetGroup) - 1),
          TRUE ~ slugDODBudgetGroup
        )
      ) %>%
      left_join(dictionary_dod_budget_codes(), by = "slugDODBudgetGroup") %>%
      select(yearBudget, nameDODBudgetGroup, everything())

    if (!include_all_page_files) {
      return(data)
    }

    urls <-
      page %>% html_nodes("strong a") %>% html_attr("href")
    file_names <-
      page %>% html_nodes("strong a") %>% html_text()
    data <-
      tibble(urlBudgetDODFile = urls, nameFile = file_names) %>% distinct() %>%
      mutate(
        nameFile = nameFile %>% str_trim(),
        nameFile = case_when(nameFile == "" ~ NA_character_,
                             TRUE ~ nameFile),
        urlBudgetDODFile = case_when(
          urlBudgetDODFile %>% str_detect("http") ~ urlDODFile,
          TRUE ~ str_c("https://comptroller.defense.gov", urlBudgetDODFile)
        ),
        yearBudget,
        typeFile = case_when(
          urlBudgetDODFile %>% str_detect(".xls") ~ "excel",
          urlBudgetDODFile %>% str_detect(".pdf") ~ "pdf",
          urlBudgetDODFile %>% str_detect(".zip") ~ "zip",
          TRUE ~ "html"
        ),
        urlBudgetDODYear = url
      ) %>%
      fill(nameFile) %>%
      mutate(
        slugDODBudgetGroup = case_when(
          nameFile %>% str_detect("M-1|M1") ~ "M1",
          nameFile %>% str_detect("O1|O-1") ~ "O1",
          nameFile %>% str_detect("P1|P-1") ~ "P1",
          nameFile %>% str_detect("P1-R|P1R") ~ "P1R",
          nameFile %>% str_detect("R-1|R1") ~ "R1",
          nameFile %>% str_detect("RF-1|RF1") ~ "RF1",
          nameFile %>% str_detect("C-1|C1") ~ "C1"
        ),
        isGreenBook = nameFile %>% str_detect("Green Book")
      ) %>%
      group_by(urlBudgetDODFile) %>%
      dplyr::slice(1) %>%
      ungroup() %>%
      filter(!urlBudgetDODFile %>% str_detect(".aspx")) %>%
      select(yearBudget, everything()) %>%
      left_join(dictionary_dod_budget_codes(), by = "slugDODBudgetGroup")

    data


  }

#' Department of Defense Budget Data dictionary
#'
#' @param filter_years if not \code{NULL} filters the years to specification
#' @param include_all_page_files if \code{TRUE} returns all data urls otherwise
#' returns just excel links
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
dictionary_dod_budget_urls <-
  function(filter_years = NULL,
           include_all_page_files = F,
           return_message = T) {
    df_urls <- .dod_budget_year_urls()

    if (length(filter_years) > 0) {
      df_urls <-
        df_urls %>%
        filter(yearBudget %in% filter_years)
    }

    all_data <-
      df_urls$urlBudgetDODYear %>%
      map_dfr(memoise::memoise(function(url) {
        .parse_dod_buget_url(
          url = url,
          include_all_page_files = include_all_page_files,
          return_message = return_message
        )
      }))

    all_data <-
      all_data %>%
      mutate(
        isBudgetGroupNormal = !is.na(nameDODBudgetGroup),
        nameDODBudgetGroup = case_when(
          is.na(nameDODBudgetGroup) ~ slugDODBudgetGroup,
          TRUE ~ nameDODBudgetGroup
        ) %>% str_to_upper(),
        nameDODBudgetGroup = nameDODBudgetGroup %>% str_replace_all("\\_|\\-", " "),
        isAmendedBudget = case_when(
          isAmendedBudget == F ~ nameDODBudgetGroup %>% str_detect("AMENDMENT|SUPPLA"),
          TRUE ~ isAmendedBudget
        )
      )

    all_data
  }

# parser ------------------------------------------------------------------

.parse_normal_excel <-
  function(tmp, sheet_no = 1) {
    data <-
      tmp %>% read_excel(sheet = sheet_no, col_names = F)
    is_na_1 <-
      data[, 1] %>% pull() %>% .[[1]] %>% is.na()
    is_appropriation <-
      data[,1] %>% pull() %>% str_detect("Appropriation") %>% sum(na.rm = T) >= 1
    only_4 <- ncol(data) == 4 &&
      data %>% pull(1) %>% str_to_upper() %>% str_detect("PL TITLE") %>% sum(na.rm = T) > 0

    early_budget <- data %>% pull(1) %>% str_to_upper() %>% str_detect("CODE") %>% sum(na.rm = T) > 0 && ncol(data) == 12

    if (is_na_1) {
      df_cols <-
        data %>% dplyr::slice(1:2) %>% t() %>% as_tibble()

      df_cols <-
        df_cols %>%
        mutate(
          V1 = V1 %>% str_remove_all("[0-9]|Total of Displayed Rows"),
          V1 =  case_when(V1 == "" ~ NA_character_,
                          TRUE ~ V1)
        )

      column_names <-
        df_cols %>%
        mutate(
          V1 = case_when(V1 == "\\-" ~ NA_character_,
                         TRUE ~ V1),
          item = case_when(is.na(V1) ~ V2,
                           TRUE ~ str_c(V1, V2))
        ) %>%
        pull(item) %>%
        str_replace_all("\r\n|\n", " ") %>%
        str_replace_all("\\-FY", "\\FY")

      if (unique(column_names) %>% length() != ncol(data)) {
        df_cols <-
          tibble(col = column_names) %>% mutate(id = 1:n()) %>% group_by(col) %>%
          mutate(count = 1:n()) %>%
          filter(count == 1) %>%
          ungroup()

        data <- data[, df_cols$id]

        column_names <- df_cols$col
      }

      data <-
        data %>% dplyr::slice(3:nrow(data)) %>%
        set_names(column_names)
    } else if (is_appropriation) {
      val_rows <-
        data[, 1] %>%
        setNames("value") %>%
        mutate(idRow = 1:n()) %>%
        filter(value == "Appropriation") %>% pull(idRow)

      data <-
        data %>% dplyr::slice(val_rows + 1:nrow(data)) %>%
        set_names(
          data %>% dplyr::slice(val_rows) %>% t() %>% as.character() %>% str_replace_all("\r\n|\n", " ") %>%
            str_replace_all("\\-FY", "\\FY")
        ) %>%
        filter(!Appropriation %>% str_detect("^Total"))

    } else if (only_4) {
      val_rows <-
        data[, 1] %>%
        setNames("value") %>%
        mutate(idRow = 1:n()) %>%
        filter(value == "PL Title Name") %>% pull(idRow)

      data <-
        data %>% dplyr::slice(val_rows + 1:nrow(data)) %>%
        set_names(
          data %>% dplyr::slice(val_rows) %>% t() %>% as.character() %>% str_replace_all("\r\n|\n", " ") %>%
            str_replace_all("\\-FY", "\\FY")
        ) %>%
        filter(!`PL Title Name` %>% str_detect("^Total"))
    } else if (early_budget){
      df_cols <-
        data %>% slice(4:5)
      df_cols <- df_cols %>% t() %>% as_tibble() %>% fill(V1)
      cols <- df_cols %>%
        mutate_if(is.character,
                  list(function(x) {
                    x %>% coalesce("")
                  })) %>%
        unite(item, V1, V2, sep = "") %>%
        pull()

      data <- data %>% slice(6:nrow(data))

      data <- data %>%
        setNames(cols)
    } else {
      df_cols <-
        data %>% dplyr::slice(1) %>% t() %>% as_tibble()

      column_names <-
        df_cols$V1 %>%
        str_replace_all("\r\n|\n", " ") %>%
        str_replace_all("\\-FY", "\\FY")

      data <-
        data %>% dplyr::slice(2:nrow(data)) %>%
        set_names(column_names)
    }


    data <-
      data %>%
      mutate(idRow = 1:n()) %>%
      select(idRow, everything())

    remove_cols <-
      data %>% select(matches("FY")) %>%
      select(matches("TOTAL|Total|Remaining Req|remove")) %>%
      names()

    if (length(remove_cols) > 0) {
      data <- data %>%
        select(-one_of(remove_cols))
    }

    gather_cols <-
      data %>% select(matches("FY|Emergency|Amount|Amt")) %>% names()

    key_cols <- data %>% select(-one_of(gather_cols)) %>% names()

    data <-
      data %>% gather(item, value, -key_cols) %>%
      filter(!is.na(value))

    data <-
      .munge_dod_names(data = data)

    if (data %>% hasName("isAdded")) {
      data <- data %>%
        mutate(isAdded = isAdded == "Addd")
    }

    if (data %>% hasName("isTOA")) {
      data <-
        data %>%
        mutate(isTOA = case_when(isTOA == "Y" ~ TRUE,
                                 isTOA == "N" ~ F))
    }

    data <-
      data %>%
      mutate(
        value = parse_number(value) * 1000,
        item = item %>% str_replace_all("\\-FY", "FY"),
        yearBudget = item %>% str_remove_all("FY |Emergency Disaster Relief Act of") %>% substr(1, 5) %>% parse_number(),
        item = item %>% str_remove_all("^FY |\\(|\\)|[0-9]") %>% str_trim(),
        isBudgetCurrentYear = yearBudget == max(yearBudget)
      ) %>%
      filter(value != 0) %>%
      select(idRow, yearBudget, everything())

    data <-
      data %>%
      filter(!item %in% c("Total Enacted", "Total OCO", "Total Base + OCO", "Total"))


    data
  }

.parse_fy_excel <-
  function(tmp) {
    sheet_names <-
      tmp %>% excel_sheets() %>% str_to_upper()
    has_cover <-sheet_names %>% str_detect("COVER") %>% sum(na.rm = T) > 0
    data <-
      seq_along(sheet_names) %>%
      map_dfr(function(x) {
        item <- sheet_names[[x]]
        if (item == "COVER") {
          return(invisible())
        }
        data <- tmp %>% read_excel(sheet = x, col_names = F)
        new_item <-
          sheet_names[[x]] %>% str_remove_all("^FY ") %>%
          str_remove_all("[0-9]") %>%
          str_trim()

        if (new_item == "") {
          new_item <- "Base"
        }

        if (has_cover) {
          df_cols <-
            data %>% dplyr::slice(5) %>% t() %>% as_tibble()
          column_names <- df_cols$V1 %>%
            str_replace_all("\r\n|\n", " ") %>%
            str_replace_all("\\-FY", "\\FY")

          data <-
            data %>%
            dplyr::slice(6:nrow(data)) %>%
            set_names(column_names)

          yearBudget <-
            column_names[column_names %>% str_detect("FY")][[1]] %>% parse_number()
        }
        is_na_1 <- data[, 1] %>% pull() %>% .[[1]] %>% is.na()
        if (is_na_1) {
          yearBudget <- item %>% parse_number()
          df_cols <- data %>% dplyr::slice(1:2) %>% t() %>% as_tibble()
          df_cols <- df_cols %>%
            mutate(
              V1 = V1 %>% str_remove_all("[0-9]|Total of Displayed Rows"),
              V1 =  case_when(V1 == "" ~ NA_character_,
                              TRUE ~ V1)
            )

          column_names <-
            df_cols %>%
            mutate(item = case_when(is.na(V1) ~ V2,
                                    TRUE ~ str_c(V1, V2))) %>%
            pull(item) %>%
            str_replace_all("\r\n|\n", " ") %>%
            str_replace_all("\\-FY", "\\FY")

          data <-
            data %>% dplyr::slice(3:nrow(data)) %>%
            set_names(column_names)
        }


        if (!has_cover & !is_na_1) {
          yearBudget <- item %>% parse_number()
          df_cols <-
            data %>% dplyr::slice(1) %>% t() %>% as_tibble()

          column_names <-
            df_cols$V1 %>%
            str_replace_all("\r\n|\n", " ") %>%
            str_replace_all("\\-FY", "\\FY")

          data <-
            data %>% dplyr::slice(2:nrow(data)) %>%
            set_names(column_names)
        }

        data <-
          data %>%
          mutate(idRow = 1:n()) %>%
          select(idRow, everything())

        remove_cols <-
          data %>% select(matches("FY")) %>% select(matches("TOTAL|Total|Remaining Req|^remove")) %>%
          names()

        if (length(remove_cols) > 0) {
          data <- data %>%
            select(-one_of(remove_cols))
        }

        gather_cols <-
          data %>% select(matches("FY|Amount")) %>% names()

        key_cols <-
          data %>% select(-one_of(gather_cols)) %>% names()

        data <-
          data %>% gather(item, value, -key_cols) %>%
          filter(!is.na(value))

        data <-
          data %>%
          mutate(item = item %>% gsub("\\s+", " ", .))

        data <-
          .munge_dod_names(data = data)

        if (data %>% hasName("isAdded")) {
          data <- data %>%
            mutate(isAdded = isAdded == "Addd")
        }

        if (data %>% hasName("isTOA")) {
          data <-
            data %>%
            mutate(isTOA = case_when(isTOA == "Y" ~ TRUE,
                                     isTOA == "N" ~ F))
        }
        if (data %>%
            filter(item %>% str_detect("TOA")) %>% nrow() > 0) {
          data <-
            data %>%
            filter(item %>% str_detect("TOA"))
        }

        if (data %>% hasName("nameAccountOMB")) {
          data <- data %>%
            filter(!nameAccountOMB %>% str_detect("Total")) %>%
            filter(!is.na(nameAccountOMB))
        }

        if (!has_cover) {
          data <- data %>%
            mutate(item = new_item)
        }

          data <-
          data %>%
          mutate(
            value = as.numeric(value) * 1000,
            yearBudget,
            item = item %>% str_remove_all("^FY |\\(|\\)|[0-9]") %>% str_trim(),
            isBudgetCurrentYear = yearBudget == max(yearBudget)
          ) %>%
          filter(value != 0) %>%
          select(idRow, yearBudget, everything())

        data <-
          data %>%
          filter(!item %in% c("Total Enacted", "Total OCO", "Total Base + OCO", "Total"))
        data
      })

    if (data %>% hasName("yearBudget")) {
      data <- data %>%
        mutate(yearBudget = as.numeric(yearBudget))
    }

    data <- data %>%
      mutate(isBudgetCurrentYear = yearBudget == max(yearBudget, na.rm = T))

    data
  }

.dl_dod_excel <-
  function(url = "https://comptroller.defense.gov/Portals/45/Documents/defbudget/fy2020/m1.xlsx",
           only_current_year = T,
           return_message = T) {
    if (return_message) {
      glue("Downloading {url}") %>% message()
    }
    tmp <-
      tempfile()
    curl_download(url, tmp)
    fileparts <- url %>% str_split("/") %>% flatten_chr()
    budget_group <-
      fileparts[length(fileparts)] %>%
      str_remove_all("\\.xlsx") %>%
      str_remove_all("\\.xls") %>%
      str_to_upper()

    if (budget_group %>% endsWith("A")) {
      budget_group <- budget_group %>% str_remove_all("A")
    }

    sheet_names <- tmp %>% excel_sheets() %>% str_to_upper() %>% str_trim()
    sheet_names %>% print()

    starts_fy <-
      sheet_names[[1]] %>% str_trim() %>%
      startsWith("FY")

    is_cover <-
      sheet_names[[1]] %>% str_to_upper() == "COVER"
    is_ogsi <- url == "https://comptroller.defense.gov/portals/45/documents/defbudget/fy2015/ogsi.xlsx"
    use_fy <- starts_fy | is_cover
    if (use_fy) {
      data <-
        .parse_fy_excel(tmp = tmp)
    } else if (is_ogsi) {
      data <-
        .parse_normal_excel(tmp = tmp, sheet_no = 2)
    } else {
      data <-
        .parse_normal_excel(tmp = tmp, sheet_no = 1)
    }

    data <-
      data %>%
      mutate(item = case_when(
        item == "" ~ "Base",
        TRUE ~ item
      ))

    data <-
      data %>%
      mutate(slugDODBudgetGroup = budget_group) %>%
      select(idRow, slugDODBudgetGroup, everything())

    if (only_current_year) {
      data <-
        data %>%
        filter(isBudgetCurrentYear)
    }

    data <-
      data %>%
      mutate(urlBudgetDODFile = url)

    data <- data %>%
      mutate_if(is.character,
                list(function(x) {
                  gsub("\\s+", " ", x) %>% stri_enc_toascii() %>% str_remove_all(" \032 ")
                })) %>%
      dplyr::select(which(colMeans(is.na(.)) < 1))

    tmp %>%
      unlink()
    data

  }

.dl_dod_urls <-
  function(urls = "https://comptroller.defense.gov/Portals/45/Documents/defbudget/fy2020/m1.xlsx",
           only_current_year = T,
           return_message = T) {
    .dl_dod_excel_safe <- possibly(.dl_dod_excel, tibble())

    all_data <-
      urls %>%
      map_dfr(function(url) {
        .dl_dod_excel(
          url = url,
          only_current_year = only_current_year,
          return_message = return_message
        )
      })

    all_data <-
      all_data %>%
      left_join(dictionary_dod_budget_codes(), by = "slugDODBudgetGroup") %>%
      select(one_of(
        c(
          "yearBudget",
          "isBudgetCurrentYear",
          "slugDODBudgetGroup",
          "nameDODBudgetGroup"
        )
      ),
      everything())

    all_data <-
      all_data %>%
      select(one_of(all_data %>% select(-one_of(c(
        "item", "value"
      ))) %>% names()), everything()) %>%
      mutate(item = str_to_upper(item)) %>%
      rename(amountItem = value,
             typeBudget = item)


    if (all_data %>% filter(typeBudget %>% str_detect("QUANTITY")) %>% nrow() > 0) {
      df_counts <-
        all_data %>% filter(typeBudget %>% str_detect("QUANTITY")) %>%
        select(
          yearBudget,
          codeProgramElement,
          isBudgetCurrentYear,
          slugOrganization,
          slugDODBudgetGroup,
          codeAccountOMB,
          matches("nameBudget"),
          typeBudget,
          countItem = amountItem
        ) %>%
        mutate(countItem = countItem / 1000)

      df_counts <-
        df_counts %>%
        mutate(typeBudget = typeBudget %>% str_remove_all(" QUANTITY") %>% str_trim())

      all_data <-
        all_data %>%
        filter(!typeBudget %>% str_detect("QUANTITY"))

      all_data <-
        all_data %>%
        mutate(
          typeBudget = case_when(
            typeBudget %>% endsWith(" AMOUNT") ~ typeBudget %>% str_remove_all(" AMOUNT"),
            TRUE ~ typeBudget
          )
        )

      all_data <-
        all_data %>%
        left_join(df_counts,
                  by = names(df_counts)[names(df_counts) %in% names(all_data)])

      all_data <-
        all_data %>%
        mutate(amountUnitCost = case_when(is.na(countItem) ~ NA_real_,
                                          TRUE ~ amountItem / countItem))
    }

    all_data <-
      all_data %>%
      .munge_data(clean_address = F)

    ignore_types <-
      c(
        "TOTAL BASE + OCO AMOUNT",
        "Total Base + OCO Quantity",
        "TOTAL OCO QUANTITY",
        "TOTAL BASE + OCO AMOUNT",
        "TOTAL OCO",
        "TOTAL BASE + OCO"
      ) %>% str_to_upper() %>% unique()

    all_data <-
      all_data %>%
      filter(!typeBudget %in% ignore_types)


    has_parent_line <-
      all_data %>% hasName("nameBudgetActivity") &
      all_data %>% hasName("nameProgramElement")

    if (has_parent_line) {
      all_data <-
        all_data %>%
        mutate(
          nameBudgetActivity = case_when(
            is.na(nameBudgetActivity) ~ nameProgramElement,
            TRUE ~ nameBudgetActivity
          ),
          nameProgramElement = case_when(
            nameProgramElement == nameBudgetActivity ~ NA_character_,
            TRUE ~ nameProgramElement
          )
        )
    }

    all_data <-
      all_data %>%
      mutate(
        typeBudgetActual = typeBudget,
        typeBudgetActual = case_when(typeBudgetActual %>% str_detect("OCO") ~ "OCO",
                                     TRUE ~ "BASE"),
        typeBudgetSub = typeBudget %>% str_remove_all("\\+| FOR | BASE |OCO") %>% str_trim(),
        typeBudgetSub = case_when(
          typeBudgetSub == typeBudgetActual ~ NA_character_,
          TRUE ~ typeBudgetSub
        )
      ) %>%
      select(-typeBudget) %>%
      rename(typeBudget = typeBudgetActual) %>%
      select(
        yearBudget,
        slugDODBudgetGroup,
        nameDODBudgetGroup,
        typeBudget,
        typeBudgetSub,
        everything()
      )

    if (all_data %>% hasName("nameProgramElement") & all_data %>% hasName("nameBudgetActivity")) {
      all_data <-
        all_data %>%
        mutate(nameProgramElement = case_when(
          is.na(nameProgramElement) ~ nameBudgetActivity,
          TRUE ~ nameProgramElement
        ))
    }



    if (all_data %>% hasName("nameAccountOMB")) {
      all_data <-
        all_data %>%
        separate(
          nameAccountOMB,
          into = c("nameAccountOMB", "slugOrganizationAccount"),
          sep = ",\\s*(?=[^,]+$)",
          fill = "right",
          extra = "merge"
        )

      if (data %>% hasName("slugOrganization")) {


      all_data <-
        all_data %>%
        mutate(
          slugOrganizationAccount = case_when(
            slugOrganizationAccount %in% c("AIR FORCE") ~ "AF",
            slugOrganizationAccount %in% c("ANG") ~ "AFNG",
            slugOrganizationAccount %in% c("AF RES") ~ "AF RESERVE",
            slugOrganizationAccount %in% c("MARINE CORPS") ~ "MC",
            slugOrganizationAccount %in% c("MC RES") ~ "MC RESERVE",
            slugOrganizationAccount %in% c("NAVY & MC") ~ "N/MC",
            slugOrganizationAccount %in% c("N") ~ "NAVY",
            slugOrganizationAccount %in% c("A") ~ "ARMY",
            slugOrganizationAccount %in% c("A RES", "ARMY R", "ARMY RES") ~ "ARMY RESERVE",
            slugOrganizationAccount %in% c("A GUARD") ~ "ARNG",
            slugOrganizationAccount %in% c("N RES", "NAVY RES") ~ "NAVY RESERVE",
            slugOrganizationAccount %in% c("DEFENSE-WIDE", "DEF-WIDE", "DEFENSE", "DEF") ~ "DW",
            slugOrganizationAccount %>% str_detect("DEF COUNTERINTELLIGENCE|AND CIVIC AID") ~ "DW",
            TRUE ~ slugOrganizationAccount
          )
        ) %>%
        mutate(slugOrganization = case_when(
          is.na(slugOrganization) ~ slugOrganizationAccount,
          TRUE ~ slugOrganization
        )) %>%
        mutate(
          slugOrganizationAccount = case_when(
            is.na(slugOrganizationAccount) ~ slugOrganization,
            TRUE ~ slugOrganizationAccount
          )
        )
      }
    }

    if (all_data %>% hasName("nameBudgetActivity")) {
      all_data <-
        all_data %>%
        mutate(nameBudgetActivityActual = nameBudgetActivity)

      activities <-
        all_data %>%
        filter(!is.na(nameBudgetActivityActual)) %>%
        distinct(nameBudgetActivityActual) %>%
        pull()

      df_activities <-
        activities %>%
        map_dfr(function(x) {
          x %>% message()
          parts <-
            x %>%
            str_split('\\(') %>%
            flatten_chr() %>%
            str_split("\\)") %>%
            flatten_chr() %>%
            str_trim() %>%
            discard(function(x) {
              x == ""
            })

          if (length(parts) == 1) {
            d <- tibble(nameBudgetActivityActual = x,
                        nameBudgetActivity = x)

            return(d)
          }

          if (length(parts) == 3) {
            d <- tibble(
              nameBudgetActivityActual = x ,
              nameBudgetActivity = parts[c(1, 3)] %>% str_c(collapse = " "),
              slugBudgetActivity = parts[2]
            )
            return(d)
          }

          d <- tibble(
            nameBudgetActivityActual = x ,
            nameBudgetActivity = parts[1],
            slugBudgetActivity = parts[2]
          )
          d
        })

      all_data <-
        all_data %>%
        select(-one_of("nameBudgetActivityActual")) %>%
        left_join(df_activities, by = "nameBudgetActivity") %>%
        mutate(
          nameBudgetActivity = nameBudgetActivity %>% str_trim(),
          nameBudgetActivity = nameBudgetActivity %>% gsub("\\)|\\(", "" , .) %>% str_trim()
        ) %>%
        separate(
          nameBudgetActivity,
          into = c("nameBudgetActivity", "nameBudgetActivityDetail"),
          sep = "\\ - |\\- ",
          fill = "right",
          extra = "merge"
        )
    }

    if (all_data %>% hasName("nameBudgetParent")) {
      all_data <-
        all_data %>%
        mutate(
          nameBudgetParent = case_when(
            nameBudgetParent %in% c("ADVANCED COMPONENT DEVELOPMENT & PROTOTYPES") ~ "ADVANCED COMPONENT DEVELOPMENT AND PROTOTYPES",
            nameBudgetParent %in% c("OPERATIONAL SYSTEM DEVELOPMENT") ~ "OPERATIONAL SYSTEMS DEVELOPMENT",
            nameBudgetParent %in%  c("SYSTEM DEVELOPMENT & DEMONSTRATION") ~ "SYSTEM DEVELOPMENT AND DEMONSTRATION",
            nameBudgetParent %in% c("RESEARCH, DEVELOPMENT, TEST, AND EVALUATION", "RDT&E") ~ "RDT&E",
            nameBudgetParent %in% c("RDT&E MANAGEMENT SUPPORT") ~ "RDTE SUPPORT",
            nameBudgetParent %in% c(
              "ADMIN & SRVWD ACTIVITIES",
              "ADMIN & SRVWIDE ACTIVITIES",
              "ADMINISTRATION AND SERVICE-WIDE ACTIVITIES",
              "ADMINISTRATION AND SERVICEWIDE ACTIVITIES"
            ) ~ "ADMINISTRATION AND SERVICEWIDE ACTIVITIES",
            nameBudgetParent %in% c("ADMIN EXPNS") ~ "ADMINISTRATIVE EXPENSES",
            nameBudgetParent %in% c(
              "AIRCRAFT SUPPORT EQUIP & FACILITIES",
              "AIRCRAFT SUPT EQUIPMENT & FACILITIES"
            ) ~ "AIRCRAFT SUPPORT EQUIPMENT AND FACILITIES",
            nameBudgetParent %in% c("COMMUNICATIONS & ELECTRONICS EQUIP") ~ "COMMUNICATIONS AND ELECTRONICS EQUIPMENT",
            nameBudgetParent %in% c("MAJOR CONST") ~ "MAJOR CONSTRUCTION",
            nameBudgetParent %in% c("MINOR CONST") ~ "MINOR CONSTRUCTION",
            nameBudgetParent %in% c("OPERATION & MAINTENANCE") ~ "OPERATIONS AND MAINTENANCE",
            nameBudgetParent %in% c("OPERATIONAL SYSTEM DEVELOPMENT") ~ "OPERATIONAL SYSTEMS DEVELOPMENT",
            nameBudgetParent %in% c("SPARES", "SPARE AND REPAIR PARTS") ~ "SPARES AND REPAIR PARTS",
            nameBudgetParent %in% c("SYSTEM DEVELOPMENT & DEMONSTRATION") ~ "SYSTEM DEVELOPMENT AND DEMONSTRATION",
            nameBudgetParent %in% c(
              "PAY AND ALLOWANCES OF ENLISTED",
              "PAY AND ALLOWANCES OF ENLISTED PERSONNEL"
            ) ~
              "PAY AND ALLOWANCES - ENLISTED PERSONNEL",
            nameBudgetParent %in% c("PAY AND ALLOWANCES OF CADETS") ~ "PAY AND ALLOWANCES - CADETS",
            nameBudgetParent %in% c("PAY AND ALLOWANCES OF OFFICERS") ~ "PAY AND ALLOWANCES - OFFICERS",
            TRUE ~ nameBudgetParent
          )
        ) %>%
        mutate(
          nameBudgetParent = nameBudgetParent %>% str_replace_all("SUPPLY CHAIN MANAGEMENT", replacement =  "SUPPLY MANAGEMENT") %>% str_replace_all("\\, MC", replacement = "\\ - MC") %>% str_replace_all("\\, NAVY", "\\ - NAVY")
        ) %>%
        rename(nameBudgetParentActual = nameBudgetParent) %>%
        separate(
          nameBudgetParentActual,
          into = c("nameBudgetParent", "nameBudgetParentDetail"),
          sep = "\\ - ",
          fill = "right",
          remove = F,
          extra = "merge"
        ) %>%
        mutate_if(is.character, str_trim)
    }

    if (all_data %>% hasName("nameProgramElement")) {
      all_data <-
        all_data %>%
        mutate(nameProgramElementActual = nameProgramElement)

      budget_items <-
        all_data %>%
        distinct(nameProgramElement) %>%
        filter(!is.na(nameProgramElement)) %>%
        pull()

      df_items <-
        budget_items %>%
        map_dfr(function(x) {
          x %>% message()
          is_semi <-
            x %>% str_detect("\\:|^EDI-|^SOF |^OCO-")

          if (is_semi) {
            parts <-
              x %>%
              str_replace_all("^SOF ", "SOF:") %>%
              str_split('\\:|\\-') %>%
              flatten_chr() %>%
              str_split("\\)") %>%
              flatten_chr() %>%
              str_trim() %>%
              discard(function(x) {
                x == ""
              })

            d <-
              tibble(
                nameProgramElementActual = x ,
                nameProgramElement = parts[2],
                codeProgramElement = parts[1]
              )

            return(d)
          }

          parts <-
            x %>%
            str_split('\\(') %>%
            flatten_chr() %>%
            str_split("\\)") %>%
            flatten_chr() %>%
            str_trim() %>%
            discard(function(x) {
              x == ""
            })

          if (length(parts) == 1) {
            d <-
              tibble(nameProgramElementActual = x,
                     nameProgramElement = x)
            return(d)
          }

          if (length(parts) == 3) {
            d <- tibble(
              nameProgramElementActual = x ,
              nameProgramElement = parts[c(1, 3)] %>% str_c(collapse = " "),
              codeProgramElement = parts[2]
            )
            return(d)
          }

          d <- tibble(
            nameProgramElementActual = x ,
            nameProgramElement = parts[1],
            codeProgramElement = parts[2]
          )
          d
        }) %>%
        select(-matches("slug"))


      if (df_items %>% hasName("codeProgramElement")) {
        df_items <- df_items %>%
          rename(detailProgramElement = codeProgramElement)
      }


      all_data <-
        all_data %>%
        select(-one_of("nameProgramElementActual")) %>%
        left_join(df_items, by = "nameProgramElement") %>%
        mutate(
          nameProgramElement = nameProgramElement %>% str_trim(),
          nameProgramElement = nameProgramElement %>% gsub("\\)|\\(", "" , .) %>% str_trim()
        )
    }


    if (all_data %>% hasName("slugOrganization")) {
      all_data <-
        all_data %>%
        mutate(
          slugOrganization = case_when(
            slugOrganization %in% c("AIR FORCE", "F") ~ "AF",
            slugOrganization %in% c("A") ~ "ARMY",
            slugOrganization %in% c("N") ~ "NAVY",
            slugOrganization %in% c("MARINE CORPS") ~ "MC",
            slugOrganization %in% c("DEFENSE-WIDE", "DEF-WIDE", "DEFENSE",
                                    "DEF", "D") ~ "DW",

            TRUE ~ slugOrganization
          )
        )

    }

      all_data <-
        all_data %>%
        select(one_of(all_data %>%
                        select(-matches("amount|count")) %>% names()),
               everything()) %>%
        select(yearBudget:typeBudget, matches("name"), everything()) %>%
        arrange(slugDODBudgetGroup, idRow) %>%
        select(-idRow)

      all_data
  }



# download ----------------------------------------------------------------


#' Department of Defense Budgets
#'
#' Acquires DOD budgets for specified
#' years
#'
#' @param budget_years vector of budgegt years
#' @param use_ammendments if \code{TRUE} uses
#' ammended budgets
#' @param only_current_year if \code{TRUE} includes only
#' current year budget
#' @param return_message if \code{TRUE} returns message
#'
#' @return
#' @export
#'
#' @examples
#' \dontrun{
#' df_2020 <-
#' dod_years_budgets(budget_years = 2020)
#' }
dod_years_budgets <-
  function(budget_years = 2020,
           use_ammendments = F,
           only_current_year = T,
           only_normal_budgets = F,
           return_message = T) {

    if (length(budget_years) == 0) {
      stop("Enter a budget year")
    }
    df_urls <-
      dictionary_dod_budget_urls(
        include_all_page_files = F,
        filter_years = budget_years,
        return_message = return_message
      ) %>%
      rename(yearBudgetData = yearBudget)

    if (df_urls$isAmendedBudget %>% sum(na.rm = T) > 0 &
        use_ammendments) {
      df_urls <- df_urls %>% filter(isAmendedBudget)
    } else {
      df_urls <- df_urls %>% filter(!isAmendedBudget)
    }

    if (only_normal_budgets) {
      df_urls <- df_urls %>% filter(isBudgetGroupNormal)
    }

    df_urls <- df_urls %>%
      group_by(yearBudgetData, nameDODBudgetGroup, isAmendedBudget) %>%
      dplyr::slice(1) %>%
      ungroup()

    urls <- df_urls$urlBudgetDODFile

    all_data <-
      .dl_dod_urls(
        urls = urls,
        only_current_year = only_current_year,
        return_message = return_message
      ) %>%
      select(-matches("remove"))

    if (all_data %>% hasName("slugTreasuryAgency")) {
      all_data <- all_data %>%
        mutate(slugTreasuryAgency = case_when(
          is.na(slugTreasuryAgency) ~ codeAccountOMB %>% str_remove_all("[0-9]"),
          TRUE ~ slugTreasuryAgency
        ),
        codeAccountOMB = codeAccountOMB %>% str_remove_all("[A-Z]")
        )
    }

    column_order <-
      c(
        "yearBudget",
        "isBudgetCurrentYear",
        "slugDODBudgetGroup",
        "nameDODBudgetGroup",
        "slugClassification",
        "typeBudget",
        "typeBudgetSub",
        "slugBudgetParent",
        "codeAccountOMB",
        "nameAccountOMB",
        "slugOrganization",
        "slugOrganizationAccount",
        "slugBudgetParent",
        "nameBudgetParentActual",
        "nameBudgetParent",
        "nameBudgetParentDetail",
        "slugBudgetActivity",
        "nameBudgetActivityActual",
        "nameBudgetActivity",
        "nameBudgetActivityDetail",
        "codeProgramElement",
        "nameProgramElementActual",
        "nameProgramElement",
        "nameProgramElementDetail",
        "nameLocation",
        "nameConstructionProject",
        "nameStateCountry",
        "isAdded",
        "isTOA",
        "urlBudgetDODFile",
        "idLineNumber",
        "slugCost",
        "typeCost",
        "slugTreasuryAgency",
        "slugStateCountry",
        "amountItem",
        "countItem",
        "amountUnitCost"

      )

    all_data <-
      all_data %>%
      mutate(
        nameProgramElement = case_when(
          is.na(nameProgramElement) ~ nameProgramElementActual,
          TRUE ~ nameProgramElement
        ),
        nameProgramElementActual = case_when(
          is.na(nameProgramElementActual)  ~ nameProgramElement,
          TRUE ~ nameProgramElementActual
        )
      )

    all_data <- all_data %>%
      mutate(codeProgramElement = case_when(
        is.na(codeProgramElement) ~ nameConstructionProject,
        TRUE ~ codeProgramElement
      ))

    all_data <- all_data %>%
      select(one_of(column_order), everything())

    all_data <- all_data %>%
      fill(slugClassification)

    all_data <-
      all_data %>%
      .add_cgac() %>%
      .generate_federal_account_ids(cgac_column = "idCGAC", account_column = "codeAccountOMB")

    all_data

  }
abresler/govtrackR documentation built on July 11, 2020, 12:30 a.m.