#' clean price data from Yahoo
#'
#' @param scraped_price_data tbl_df of scraped data
#' @param frequency string indicating the frequency to return price data, e.g. 'monthly'
#'
#' @return object of class \code{tbl_df} with date, variable and value
#'
clean_yahoo_price <- function(scraped_price_data, frequency) {
assertR::assert_present(c("daily", "weekly", "monthly", "quarterly", "annual"), frequency)
reqd_cols <- c("Date", "Open", "High", "Low", "Close", "Adj.Close", "Volume")
assertR::assert_present(names(scraped_price_data), reqd_cols)
if(all(is.na(scraped_price_data))) {
price_data_tbl <- tibble::tibble(date = as.Date(NA), variable = as.character(NA), value = as.numeric(NA))
} else {
price_df_long <- scraped_price_data %>%
dplyr::mutate(Date = as.Date(Date)) %>%
dplyr::rename(date = Date) %>%
dplyr::rename(AdjClose = Adj.Close) %>%
tidyr::gather(variable, value, -date) %>%
dplyr::mutate(value = as.numeric(value)) %>%
dplyr::mutate(value = ifelse(value >= 10, value / 100, value)) %>% # some values are quoted in cents while others in the currency unit
dplyr::mutate(value = ifelse(value < 10, value * 100, value))
price_data_tbl <- price_df_long %>%
dplyr::group_by(variable) %>%
dateR::to_period(., frequency) %>%
dplyr::ungroup() %>%
tibble::tibble()
}
price_data_tbl
}
#' clean financial statement tables from Yahoo
#'
#' @param scraped_fs_data tbl_df of scraped financial statement data
#' @param type string indicating the data type, e.g. 'IS'
#'
#' @return object of class \code{tbl_df} with date, variable and value
#'
clean_yahoo_fs <- function(scraped_fs_data, type) {
if (all(is.na(scraped_fs_data$raw)) | length(scraped_fs_data$raw) < 4) {
fs_data_tbl <- tibble::tibble(date = as.Date(NA), variable = as.character(NA), value = as.numeric(NA))
} else {
rep_u <- scraped_fs_data[1,]
dt_tbl <- scraped_fs_data %>%
dplyr::filter(grepl("\\/", raw_text)) %>%
dplyr::mutate(split_1 = strsplit(raw_text, "\\/")) %>%
tidyr::unnest(split_1) %>%
dplyr::mutate(year = substring(split_1, 1,4)) %>%
dplyr::mutate(year = ifelse(grepl("TTM", split_1), format(Sys.Date(), "%Y"), year)) %>%
dplyr::filter(nchar(year) == 4) %>%
dplyr::mutate(raw2 = gsub("TTM", "", raw_text)) %>%
dplyr::mutate(month = sub("\\/.*", "", raw2)) %>%
dplyr::mutate(month = ifelse(grepl("TTM", split_1), as.numeric(format(Sys.Date(), "%m"))-1, month)) %>%
dplyr::mutate(year = ifelse(month == 0, as.numeric(format(Sys.Date(), "%Y"))-1, year)) %>% # if the current date is Jan, the prev line sets month to 0 not 12
dplyr::mutate(month = ifelse(month == 0, 12, month)) %>% # we need to go back to the last month of the previous year not month 0 or the last month of the current year
dplyr::mutate(day = 1) %>%
tidyr::unite(date_str, year,month,day, sep = "-") %>%
dplyr::mutate(date = dateR::get_eom_dates(as.Date(date_str))) %>%
dplyr::mutate(col = dplyr::row_number()) %>%
dplyr::select(raw_text, raw2, date, col)
cnt_dt <- length(dt_tbl$date)
# scraped data will always begin with the reporting units which we retrieve for reference but don't use in the output,'Breakdown'
# (the variable column header) and the dates
reqd_raw <- scraped_fs_data %>%
dplyr::filter(dplyr::row_number() != 1,
raw_text != "Breakdown",
raw_text != dt_tbl$raw_text[1])
val_str <- reqd_raw %>%
dplyr::mutate(value = stringr::str_split(raw_text, "(?<!-)\\s")) %>% # split string either on space or a hyphen followed by a space (negative numbers)
tidyr::unnest(value) %>%
dplyr::mutate(value = gsub("\\s", "", value)) %>% # remove any spaces that may be left as a result of negative numbers
dplyr::mutate(value = gsub(",", "", value)) %>%
dplyr::filter(stringr::str_detect(value, "\\d+")) %>% # hyphens may be left behind. just keep the numbers before converting
dplyr::mutate(dbl_hyph = substr(value, 1, 2)) %>% # sometimes a blank cell (denoted with a hyphen) exists before a negative value and so a double hyphen prefix occurs
dplyr::mutate(value = ifelse(dbl_hyph == "--", substring(value, 2), value)) %>%
dplyr::select(-dbl_hyph)
val_tbl <- val_str %>%
dplyr::mutate(value = as.numeric(value)) %>%
dplyr::select(raw_text, value) %>%
dplyr::group_by(raw_text) %>%
dplyr::mutate(row = dplyr::row_number()) %>%
dplyr::ungroup() %>%
tidyr::spread(row, value) %>% # annoyingly, the values get reordered after the spread so rejoin reqd raw by raw to get the order back
dplyr::select(1:(cnt_dt + 1))
fs_data <- dplyr::left_join(reqd_raw, val_tbl, by = "raw_text") %>%
tidyr::fill(paste(dt_tbl$col), .direction = "up") %>%
dplyr::filter(!(raw_text %in% val_tbl$raw_text)) %>%
dplyr::rename(variable = raw_text)
fs_data_tbl <- fs_data %>%
tidyr::gather(row, value, -variable) %>%
dplyr::mutate(col = as.numeric(row)) %>%
dplyr::left_join(dt_tbl, by = "col") %>%
dplyr::select(date, variable, value)
}
fs_data_tbl
}
#' clean Yahoo data
#'
#' @param scraped_data tbl_df of scraped data
#' @param type string indicating the data type, e.g. 'IS'
#' @param frequency string
#'
#' @return tbl_df
#'
clean_yahoo_data <- function(scraped_data, type, frequency = NULL) {
assertR::assert_true(length(type) == 1, "logic error - only one type at a time")
if("price" %in% type) {
assertR::assert_present(c("daily", "weekly", "monthly", "quarterly", "annual"), frequency)
}
clean_df <- switch(type,
price = clean_yahoo_price(scraped_data, frequency),
IS = clean_yahoo_fs(scraped_data, type),
BS = clean_yahoo_fs(scraped_data, type),
CFS = clean_yahoo_fs(scraped_data, type))
clean_df
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.