R/utility.R

Defines functions day_holdings cost_basis share_count clean_raw_export

Documented in clean_raw_export cost_basis day_holdings share_count

# This script stores utility files used in processing the raw data from Vanguard.
# Running this script puts helper functions in the global namespace.

library(tidyverse)
library(janitor)
library(BatchGetSymbols)
library(scales)

#' Cleans the raw Vanguard transaction history.
#'
#' @description
#' Vanguard permits a user to download the transaction history for one or more
#' of the accounts that user owns. This function removes the cash
#' holdings and transactions between banks and the settlement fund.
#' Only mutual funds (excluding money market accounts), ETFs, stocks, and bonds
#' are tracked. Dividends and capital gains that get reinvested are also
#' retained. But the transaction history records both the accumulation of the
#' dividend and its reinvestment, so we retain only the reinvestment
#' (recorded with a minus sign, like is done when buying a share of a stock).
#'
#' @param filename A string corresponding to the filepath of the
#'                exported Vanguard data (.xlsx format).
#' @return The cleaned transaction history tibble
#' @export
clean_raw_export <- function(filename) {
  # Read in the data to identify extraneous lines
  raw = readxl::read_excel(filename)
  header_skip = which(raw['Account Number'] == 'Account Number')

  raw_df =
    readxl::read_excel(filename, skip = header_skip) %>%
    janitor::clean_names() %>%
    select(account_number:transaction_type,
           symbol:share_price,
           net_amount) %>%
    mutate(
      account_number = as.factor(account_number),
      transaction_type = as.factor(transaction_type),
      symbol = as.factor(symbol)
    ) %>%
    # Generate arbitrary integer numberings to replace the account numbers
    mutate(account_number = fct_anon(account_number, prefix = "account")) %>%
    # Remove the bank transfers, which have no symbol
    filter(!is.na(symbol)) %>%
    # Omit duplicate or irrelevant transaction types
    filter(!(
      transaction_type %in% c(
        "Sweep in",
        "Sweep out",
        "Funds Received",
        "Dividend",
        "Contribution",
        "Capital gain (LT)",
        "Capital gain (ST)"
      )
    )) %>%
    mutate(
      trade_date = lubridate::date(trade_date),
      settlement_date = lubridate::date(settlement_date)
    ) %>%
    # Now view investments as positive quantities (assets)
    mutate(net_amount = -net_amount) %>%
    group_by(symbol) %>%
    arrange(trade_date) %>%
    mutate(
      # Total number of shares at this date
      share_sum = cumsum(shares),

      # Total cost basis at this date
      share_cumulative_cost_basis = cumsum(net_amount)
    ) %>%
    ungroup()

  return(raw_df)
}

#' Compute the number of shares of a particular stock held at a date
#' for a particular account.
#'
#' @param ticker The stock of interest
#' @param account The account holding the shares
#' @param date The date at which to calculate the holding
#' @param cleaned_df The cleaned tibble generated by clean_transaction_export().
#' @return The number of shares, rounded to the nearest ten-thousandth share.
#' @export
share_count <- function(ticker, account, date, cleaned_df) {
  # Single row of interest: Most recent trade date/transaction for this stock
  last_trade = cleaned_df %>% filter(symbol == ticker,
                                     trade_date <= date,
                                     account_number == account) %>%
    slice_max(trade_date) %>% # Gets most recent trade date
    slice_tail() # Last transaction on this trade date

  cumulative_shares = last_trade$share_sum

  if (is_empty(cumulative_shares)) {
    return(0)
  }
  else {
    return(round(cumulative_shares, digits = 5))
  }
}

#' Compute the total cost basis of a particular stock held at a date.
#'
#' @param ticker The stock of interest.
#' @param account The account holding the shares.
#' @param date The date at which to calculate the holding.
#' @param cleaned_df The cleaned dataframe generated using clean_transaction_export().
#' @export
#' @return The cost basis, rounded to the nearest ten-thousandth dollar
cost_basis <- function(ticker, account, date, cleaned_df) {
  # Single row of interest: Most recent trade date/transaction for this stock
  last_trade = cleaned_df %>% filter(symbol == ticker,
                                     trade_date <= date,
                                     account_number == account) %>%
    slice_max(trade_date) %>% # Gets most recent trade date
    slice_tail() # Last transaction on this trade date

  # Cumulative cost basis at this date
  cost_basis = last_trade$share_cumulative_cost_basis

  if (is_empty(cost_basis)) {
    return(0)
  }
  else {
    return(round(cost_basis, digits = 5))
  }
}

#' Obtain a tibble of the quantity of shares held on a single day.
#'
#' @param date The data to find holdings for.
#' @param cleaned_df A cleaned transaction history dataframe
#' @param account_list A list of accounts in the dataset.
#' @param stock_list All stocks of interest.
#' @return A tibble of data for all stocks on one date
#' @export
day_holdings <-
  function(date,
           cleaned_df,
           stock_list,
           account_list) {
    # We need a row for each stock, date, and account.
    holdings_df = crossing(stock = stock_list,
                           date = date,
                           account = account_list)

    # Store cost basis and share count at each stock/date/account combination.
    holdings_df = holdings_df %>% rowwise() %>%
      mutate(
        count = share_count(stock, account, date, cleaned_df),
        cost_basis = cost_basis(stock, account, date, cleaned_df)
      ) %>%
      ungroup()

    return(holdings_df)
  }
akprasadan/scrape_vanguard_transactions documentation built on Feb. 21, 2022, 12:04 a.m.