#' Read a Nordea bank statement pdf-file.
#'
#' \code{read_statement} reads and cleans up a Finnish Nordea bank statement
#' pdf-file and returns a tidy dataset (of class data.frame, tibble).
#'
#' The function assumes a fairly strict format in the pdf-file, i.e., small
#' changes in the formatting can cause problems.
#'
#' @param file A full path to a Nordea bank statement pdf-file that is typically
#' named "attachment.pdf" when downloaded from the online bank.
#'
#' @return A tibble (of balances and transactions).
#'
#' @importFrom dplyr "%>%"
#'
#' @export
read_statement <- function(file) {
output <- pdftools::pdf_text(file) %>%
stringr::str_split("\n") %>%
unlist() %>%
stringr::str_squish() %>%
dplyr::tibble(line = 1:length(.), text = .) %>%
dplyr::filter(
stringr::str_detect(text, ",[0-9]+\\+") |
stringr::str_detect(text, ",[0-9]+-")
) %>%
tidyr::extract(text, "amount", "([0-9\\.]+,[0-9\\.\\+-]+)", remove = FALSE) %>%
tidyr::extract(text, "counterparty", "([a-zA-Z]+.+[a-zA-Z]+)", remove = FALSE) %>%
tidyr::extract(text, "record_date", "(^[0-9][0-9\\.]+[0-9]+)") %>%
dplyr::filter(!is.na(record_date)) %>%
dplyr::mutate(type = dplyr::case_when(
is.na(record_date) ~ "balance",
stringr::str_detect(record_date, "\\.[0-9]+\\.") ~ "balance",
TRUE ~ "transaction"
)) %>%
dplyr::filter(!(dplyr::row_number() == nrow(.) & type == "transaction")) %>%
tidyr::separate(record_date, c("day", "month", "year"), fill = "right") %>%
dplyr::mutate(year = dplyr::if_else(
condition = is.na(year),
true = year %>%
na.omit() %>%
unique() %>%
as.numeric() %>%
max() %>%
as.character(),
false = year
)) %>%
tidyr::unite(col = record_date, day, month, year, sep = "-") %>%
dplyr::mutate(
record_date = lubridate::dmy(record_date),
amount = amount %>%
stringr::str_remove_all("\\.") %>%
stringr::str_replace(",", ".") %>%
stringr::str_remove("\\+")
) %>%
dplyr::mutate(amount = dplyr::if_else(
stringr::str_detect(amount, "-"),
amount %>% stringr::str_remove("-") %>% stringr::str_c("-", .),
amount
)) %>%
dplyr::mutate(
amount = as.numeric(amount),
counterparty = dplyr::if_else(counterparty %in% c("Saldo", "Loppusaldo"), "N/A", counterparty)
) %>%
dplyr::select(
type,
record_date,
counterparty,
amount,
-line
) %>%
dplyr::arrange(record_date)
derived_balances <- output %>%
dplyr::filter(dplyr::row_number() == 1 | type == "transaction") %>%
dplyr::mutate(cumulative_amount = cumsum(amount)) %>%
dplyr::filter(type == "transaction") %>%
dplyr::mutate(type = "balance", counterparty = "N/A") %>%
dplyr::mutate(amount = cumulative_amount) %>%
dplyr::select(-cumulative_amount)
output %>%
dplyr::bind_rows(derived_balances) %>%
dplyr::arrange(type, record_date)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.