R/read_statement.R

#' Read a Nordea bank statement pdf-file.
#'
#' \code{read_statement} reads and cleans up a Finnish Nordea bank statement
#' pdf-file and returns a tidy dataset (of class data.frame, tibble).
#'
#' The function assumes a fairly strict format in the pdf-file, i.e., small
#' changes in the formatting can cause problems.
#'
#' @param file A full path to a Nordea bank statement pdf-file that is typically
#' named "attachment.pdf" when downloaded from the online bank.
#'
#' @return A tibble (of balances and transactions).
#'
#' @importFrom dplyr "%>%"
#'
#' @export

read_statement <- function(file) {

  output <- pdftools::pdf_text(file) %>%
    stringr::str_split("\n") %>%
    unlist() %>%
    stringr::str_squish() %>%
    dplyr::tibble(line = 1:length(.), text = .) %>%
    dplyr::filter(
      stringr::str_detect(text, ",[0-9]+\\+") |
        stringr::str_detect(text, ",[0-9]+-")
    ) %>%
    tidyr::extract(text, "amount", "([0-9\\.]+,[0-9\\.\\+-]+)", remove = FALSE) %>%
    tidyr::extract(text, "counterparty", "([a-zA-Z]+.+[a-zA-Z]+)", remove = FALSE) %>%
    tidyr::extract(text, "record_date", "(^[0-9][0-9\\.]+[0-9]+)") %>%
    dplyr::filter(!is.na(record_date)) %>%
    dplyr::mutate(type = dplyr::case_when(
      is.na(record_date) ~ "balance",
      stringr::str_detect(record_date, "\\.[0-9]+\\.") ~ "balance",
      TRUE ~ "transaction"
    )) %>%
    dplyr::filter(!(dplyr::row_number() == nrow(.) & type == "transaction")) %>%
    tidyr::separate(record_date, c("day", "month", "year"), fill = "right") %>%
    dplyr::mutate(year = dplyr::if_else(
      condition = is.na(year),
      true = year %>%
        na.omit() %>%
        unique() %>%
        as.numeric() %>%
        max() %>%
        as.character(),
      false = year
    )) %>%
    tidyr::unite(col = record_date, day, month, year, sep = "-") %>%
    dplyr::mutate(
      record_date = lubridate::dmy(record_date),
      amount = amount %>%
        stringr::str_remove_all("\\.") %>%
        stringr::str_replace(",", ".") %>%
        stringr::str_remove("\\+")
    ) %>%
    dplyr::mutate(amount = dplyr::if_else(
      stringr::str_detect(amount, "-"),
      amount %>% stringr::str_remove("-") %>% stringr::str_c("-", .),
      amount
    )) %>%
    dplyr::mutate(
      amount = as.numeric(amount),
      counterparty = dplyr::if_else(counterparty %in% c("Saldo", "Loppusaldo"), "N/A", counterparty)
    ) %>%
    dplyr::select(
      type,
      record_date,
      counterparty,
      amount,
      -line
    ) %>%
    dplyr::arrange(record_date)

  derived_balances <- output %>%
    dplyr::filter(dplyr::row_number() == 1 | type == "transaction") %>%
    dplyr::mutate(cumulative_amount = cumsum(amount)) %>%
    dplyr::filter(type == "transaction") %>%
    dplyr::mutate(type = "balance", counterparty = "N/A") %>%
    dplyr::mutate(amount = cumulative_amount) %>%
    dplyr::select(-cumulative_amount)

  output %>%
    dplyr::bind_rows(derived_balances) %>%
    dplyr::arrange(type, record_date)

}
eteppo/r-nordea-analyysi documentation built on June 14, 2019, 3:48 a.m.