# data-raw/processData2017recast.R

################################################################################
#
# Load required libraries
#
################################################################################

library(pdftools)
library(tm)
library(tabulizer)
library(stringr)
library(tidyverse)
library(tidytext)


################################################################################
#
# Function to process tables
#
################################################################################

# Build a labelled numeric table from a list of tokenised table rows.
#
# Arguments:
#   tab   List with one element per table row. Each element is treated as a
#         vector of tokens (assumed to be character strings, one per cell or
#         word -- TODO confirm against the callers of this function).
#   nrow  Number of rows to allocate. Defaults to `length(tab)`; must not be
#         smaller than `length(tab)`.
#   ncol  Number of numeric columns to allocate. Defaults to the largest
#         number of numeric tokens found in any row; must not be smaller
#         than that.
#
# Value:
#   A data frame with `nrow` rows. The first column, `labs`, is the row label
#   made of every purely alphabetic token of the row joined by single spaces
#   ("" when the row has none). The remaining `ncol` columns hold, in order,
#   the tokens of the row that consist of exactly 6, 7 or 8 digits, converted
#   to numeric. Rows with fewer numeric tokens than `ncol` are padded with NA.
#
# Base regular expressions are used so that the function does not depend on
# any attached package.
get_table <- function(tab, nrow = NULL, ncol = NULL) {
  rows <- lapply(tab, as.character)

  # Label: alphabetic tokens only, joined with a single space.
  labs <- vapply(
    rows,
    function(tokens) {
      paste(tokens[grepl("^[a-zA-Z]+$", tokens)], collapse = " ")
    },
    character(1),
    USE.NAMES = FALSE
  )

  # Figures: tokens that are exactly 6 to 8 digits long.
  numbers <- lapply(
    rows,
    function(tokens) as.numeric(tokens[grepl("^[0-9]{6,8}$", tokens)])
  )
  n_found <- lengths(numbers)

  if (is.null(nrow)) {
    nrow <- length(rows)
  }
  if (is.null(ncol)) {
    ncol <- max(0L, n_found)
  }

  if (nrow < length(rows)) {
    stop("`nrow` (", nrow, ") is smaller than the number of rows in `tab` (",
         length(rows), ")", call. = FALSE)
  }
  if (any(n_found > ncol)) {
    stop("`ncol` (", ncol, ") is smaller than the number of numeric values ",
         "found in row ", which(n_found > ncol)[1], call. = FALSE)
  }

  df <- matrix(data = NA_real_, nrow = nrow, ncol = ncol)

  for (i in seq_along(numbers)) {
    # Fill from the left and leave NA where a row has fewer values than
    # `ncol`, instead of silently recycling them across the row.
    if (n_found[i] > 0) {
      df[i, seq_len(n_found[i])] <- numbers[[i]]
    }
  }

  # Keep the labels aligned with the matrix when `nrow` exceeds length(tab).
  labs <- c(labs, rep(NA_character_, nrow - length(rows)))

  data.frame(labs = labs, df, stringsAsFactors = FALSE)
}



################################################################################
#
# Create list for information and tables
#
################################################################################

## Extract tables
## Pull the tables found on pages 15 to 21 of the recast budget PDF; the
## result is indexed below as a list with one extracted table per element.
health1 <- extract_tables(
  file = "data-raw/budget/2018 Ministry of Finance and Development Planning Recast.pdf",
  pages = 15:21,
  method = "decide"
)

## Table 1: keep rows 8 to 36 of the first extracted table
tab1 <- health1[[1]][8:36, ]

## Rows 3 and 9 each have a label that continues two rows further down
## (rows 5 and 11), while their figures sit in the row directly below
## (rows 4 and 10). Fold both pieces back into rows 3 and 9 in one step:
## column 1 receives the joined label, columns 2 to 8 receive the figures.
tab1[c(3, 9), 1:8] <- cbind(
  paste(tab1[c(3, 9), 1], tab1[c(5, 11), 1], sep = " "),
  tab1[c(4, 10), 2:8]
)


## Table 2 is kept as extracted
tab2 <- health1[[2]]

################################################################################

## Table 3: keep rows 24 to 43 of the third extracted table
tab3 <- health1[[3]][24:43, ]

## Six entries are spread over three consecutive rows each: the label starts
## on the first row and continues on the third, and the figures sit on the
## second. Rebuild each entry on its first row (rows 2, 5, 8, 11, 15, 18):
## column 1 receives the joined label, columns 2 to 8 receive the figures.
tab3[c(2, 5, 8, 11, 15, 18), 1:8] <- cbind(
  paste(tab3[c(2, 5, 8, 11, 15, 18), 1],
        tab3[c(4, 7, 10, 13, 17, 20), 1],
        sep = " "),
  tab3[c(3, 6, 9, 12, 16, 19), 2:8]
)

## Keep the rebuilt rows plus rows 1 and 14, which are retained unchanged
tab3 <- tab3[c(1, 2, 5, 8, 11, 14, 15, 18), ]

## Strip commas from every one of the eight columns
tab3[, 1:8] <- apply(
  X = tab3[, 1:8],
  MARGIN = 2,
  FUN = str_replace_all,
  pattern = ",",
  replacement = ""
)

## Split column 1 at the first " - " into two columns and append columns
## 2 to 8, giving nine columns in total
tab3 <- data.frame(
  str_split_fixed(string = tab3[, 1], pattern = " - ", n = 2),
  tab3[, 2:8]
)

## Remove hyphens and parentheses from column 7 so it can be read as a number.
## NOTE(review): if parentheses or a leading hyphen mark negative amounts in
## the source document, the sign is dropped here -- confirm this is intended.
tab3[, 7] <- tab3[, 7] %>%
  str_replace_all(pattern = "-", replacement = "") %>%
  str_remove_all(pattern = "\\(") %>%
  str_remove_all(pattern = "\\)")

names(tab3) <- c("code", "item", "original", "revised", "allot", "balance",
                 "netAdjust", "recast", "adjRecast")

## Column 2 (item) stays text; every other column is converted to numeric.
## Going through as.character() first keeps the conversion correct when a
## column is stored as a factor.
tab3[[2]] <- as.character(tab3[[2]])
tab3[-2] <- lapply(
  tab3[-2],
  function(column) as.numeric(as.character(column))
)

################################################################################


## Tables 4 to 7 are stored exactly as extracted; no clean-up is applied to
## them in this section.
tab4 <- health1[[4]]

tab5 <- health1[[5]]

tab6 <- health1[[6]]

tab7 <- health1[[7]]
# validmeasures/liberiaNutriBudget documentation built on June 4, 2019, 5:45 p.m.