# This script stores utility files used in processing the raw data from Vanguard.
# Running this script puts helper functions in the global namespace.
library(tidyverse)
library(janitor)
library(BatchGetSymbols)
library(scales)
#' Cleans the raw Vanguard transaction history.
#'
#' @description
#' Vanguard permits a user to download the transaction history for one or more
#' of the accounts that user owns. This function removes the cash
#' holdings and transactions between banks and the settlement fund.
#' Only mutual funds (excluding money market accounts), ETFs, stocks, and bonds
#' are tracked. Dividends and capital gains that get reinvested are also
#' retained. But the transaction history records both the accumulation of the
#' dividend and its reinvestment, so we retain only the reinvestment
#' (recorded with a minus sign, like is done when buying a share of a stock).
#'
#' @param filename A string corresponding to the filepath of the
#' exported Vanguard data (.xlsx format).
#' @return The cleaned transaction history tibble
#' @export
clean_raw_export <- function(filename) {
# Read in the data to identify extraneous lines
raw = readxl::read_excel(filename)
header_skip = which(raw['Account Number'] == 'Account Number')
raw_df =
readxl::read_excel(filename, skip = header_skip) %>%
janitor::clean_names() %>%
select(account_number:transaction_type,
symbol:share_price,
net_amount) %>%
mutate(
account_number = as.factor(account_number),
transaction_type = as.factor(transaction_type),
symbol = as.factor(symbol)
) %>%
# Generate arbitrary integer numberings to replace the account numbers
mutate(account_number = fct_anon(account_number, prefix = "account")) %>%
# Remove the bank transfers, which have no symbol
filter(!is.na(symbol)) %>%
# Omit duplicate or irrelevant transaction types
filter(!(
transaction_type %in% c(
"Sweep in",
"Sweep out",
"Funds Received",
"Dividend",
"Contribution",
"Capital gain (LT)",
"Capital gain (ST)"
)
)) %>%
mutate(
trade_date = lubridate::date(trade_date),
settlement_date = lubridate::date(settlement_date)
) %>%
# Now view investments as positive quantities (assets)
mutate(net_amount = -net_amount) %>%
group_by(symbol) %>%
arrange(trade_date) %>%
mutate(
# Total number of shares at this date
share_sum = cumsum(shares),
# Total cost basis at this date
share_cumulative_cost_basis = cumsum(net_amount)
) %>%
ungroup()
return(raw_df)
}
#' Compute the number of shares of a particular stock held at a date
#' for a particular account.
#'
#' @param ticker The stock of interest
#' @param account The account holding the shares
#' @param date The date at which to calculate the holding
#' @param cleaned_df The cleaned tibble generated by clean_transaction_export().
#' @return The number of shares, rounded to the nearest ten-thousandth share.
#' @export
share_count <- function(ticker, account, date, cleaned_df) {
# Single row of interest: Most recent trade date/transaction for this stock
last_trade = cleaned_df %>% filter(symbol == ticker,
trade_date <= date,
account_number == account) %>%
slice_max(trade_date) %>% # Gets most recent trade date
slice_tail() # Last transaction on this trade date
cumulative_shares = last_trade$share_sum
if (is_empty(cumulative_shares)) {
return(0)
}
else {
return(round(cumulative_shares, digits = 5))
}
}
#' Compute the total cost basis of a particular stock held at a date.
#'
#' @param ticker The stock of interest.
#' @param account The account holding the shares.
#' @param date The date at which to calculate the holding.
#' @param cleaned_df The cleaned dataframe generated using clean_transaction_export().
#' @export
#' @return The cost basis, rounded to the nearest ten-thousandth dollar
cost_basis <- function(ticker, account, date, cleaned_df) {
# Single row of interest: Most recent trade date/transaction for this stock
last_trade = cleaned_df %>% filter(symbol == ticker,
trade_date <= date,
account_number == account) %>%
slice_max(trade_date) %>% # Gets most recent trade date
slice_tail() # Last transaction on this trade date
# Cumulative cost basis at this date
cost_basis = last_trade$share_cumulative_cost_basis
if (is_empty(cost_basis)) {
return(0)
}
else {
return(round(cost_basis, digits = 5))
}
}
#' Obtain a tibble of the quantity of shares held on a single day.
#'
#' @param date The data to find holdings for.
#' @param cleaned_df A cleaned transaction history dataframe
#' @param account_list A list of accounts in the dataset.
#' @param stock_list All stocks of interest.
#' @return A tibble of data for all stocks on one date
#' @export
day_holdings <-
function(date,
cleaned_df,
stock_list,
account_list) {
# We need a row for each stock, date, and account.
holdings_df = crossing(stock = stock_list,
date = date,
account = account_list)
# Store cost basis and share count at each stock/date/account combination.
holdings_df = holdings_df %>% rowwise() %>%
mutate(
count = share_count(stock, account, date, cleaned_df),
cost_basis = cost_basis(stock, account, date, cleaned_df)
) %>%
ungroup()
return(holdings_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.