#' Fetch representative tickers of a specific industry
#' @description
#' Calls the SEC API mapping endpoint (through the Python `sec_api` module)
#' and collects the `ticker` field of every company returned for the industry.
#' The Python module is installed with pip only when it is not yet available.
#' @param api_key **Character** SEC API key
#' @param industry_name **Character** Industry name
#' @return **Character** Vector of tickers (`NULL` when nothing is returned)
#' @examples
#' # Industry name
#' # industry_name <- "Consumer Electronics"
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom reticulate import r_to_py py_install
#' @importFrom purrr map
#' @export
get_industry_tickers <- function(api_key, industry_name) {
  # Installing on every call is slow and touches the Python environment each
  # time; only install when the module cannot be found.
  if (!reticulate::py_module_available("sec_api")) {
    py_install("sec_api", pip = TRUE)
  }
  sec <- import("sec_api")
  query_api <- sec$MappingApi(api_key = api_key)
  by_industry <- query_api$resolve("industry", industry_name)
  # Iterate by index so the result never inherits names from the API payload.
  tickers <- map(seq_along(by_industry), function(idx) {
    by_industry[[idx]]$ticker
  })
  unlist(tickers)
}
#' Fetch the filing links for a set of tickers
#' @description
#' For every ticker, prints it, waits half a second, and calls
#' `get_all_10k_filings()`. Empty results are dropped and the remainder is
#' flattened into a single vector.
#' @param ticker_list **Character** Tickers (e.g. `get_industry_tickers()` output)
#' @param api_key **Character** SEC API key
#' @return Flattened vector of whatever `get_all_10k_filings()` returns
#' @examples
#' # Tickers
#' # ticker_list <- get_industry_tickers output
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom purrr map compact
#' @export
get_industry_links <- function(ticker_list, api_key) {
  fetch_links <- function(ticker) {
    print(ticker)
    # Pause between requests before hitting the API again.
    Sys.sleep(0.5)
    get_all_10k_filings(ticker, api_key)
  }
  links_per_ticker <- map(ticker_list, fetch_links)
  unlist(compact(links_per_ticker))
}
#' Fetch the company filings that represent an industry
#' @description
#' For every filing link, prints it, waits one second, and calls
#' `get_filing()`. Empty results are dropped.
#' @param filing_links **Character** Filing links (e.g. `get_industry_links()` output)
#' @param api_key **Character** SEC API key
#' @return **List** One element per non-empty `get_filing()` result
#' @examples
#' # Filing links
#' # filing_links <- "get_industry_links output"
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom purrr map compact
#' @export
get_industry_filings <- function(filing_links, api_key) {
  download_filing <- function(link) {
    print(link)
    # Pause between requests before hitting the API again.
    Sys.sleep(1)
    get_filing(link, api_key)
  }
  filing_links %>%
    map(download_filing) %>%
    compact()
}
#' Get the parameter values
#' @description
#' Reduces every element of `filings` to its `value` column. When a `segment`
#' column is also present the rows are ordered by `segment` and only the last
#' two are kept. Elements without a `value` column are skipped (a
#' "Passing..." line is printed for each).
#' @param filings **List** List of filtered filings (data-frame-like elements)
#' @return **List** Unnamed list of one-column (`value`) data frames
#' @importFrom stringr str_detect
#' @importFrom dplyr arrange slice_tail select
get_bulk_values <- function(filings) {
  extract_values <- function(idx) {
    entry <- filings[[idx]]
    columns <- colnames(entry)
    has_segment <- any(str_detect(columns, "segment"))
    has_value <- any(str_detect(columns, "value"))
    if (!has_value) {
      print("Passing...")
      return(NULL)
    }
    frame <- data.frame(entry)
    if (has_segment && has_value) {
      frame <- slice_tail(arrange(frame, segment), n = 2)
    }
    select(frame, value)
  }
  # Index-based iteration keeps the output unnamed; skipped entries are NULL.
  extracted <- lapply(seq_along(filings), extract_values)
  Filter(Negate(is.null), extracted)
}
#' Format the parameter values and calculate median
#' @description
#' Flattens `bulk_values`, converts the values to numbers, divides them by
#' 1,000,000, and builds a one-row tibble holding the median under the column
#' name given by `type_of`.
#' @param bulk_values **List** List of filtered data frames
#' @param industry_name **Character** Industry name
#' @param type_of **Character** Parameter name (used as the median column name)
#' @return **List** with `summarized_df` (one-row tibble) and `values`
#'   (numeric vector of scaled values); `NULL` when `bulk_values` is empty
#' @importFrom tidyr tibble
#' @importFrom readr parse_number
get_formatted_value <- function(bulk_values, industry_name, type_of) {
  values_list <- unlist(bulk_values)
  if (length(values_list) == 0) {
    return(NULL)
  }
  # parse_number() only accepts character input, so values that are already
  # numeric are used as they are (names from unlist() are dropped).
  parsed <- if (is.numeric(values_list)) {
    unname(values_list)
  } else {
    parse_number(values_list)
  }
  # Parse once and reuse for both the median and the returned values.
  scaled <- parsed / 1000000
  summarized_df <- tibble(
    industry = industry_name,
    median_value = median(scaled, na.rm = TRUE)
  )
  names(summarized_df)[names(summarized_df) == "median_value"] <- type_of
  list(summarized_df = summarized_df, values = scaled)
}
#' Get the filings parameters names (Statements of Income)
#' @description
#' Collects the element names found under `StatementsOfIncome` in every filing.
#' @param filings **List** List of filings
#' @return **Character** Vector with all names, in filing order (`NULL` when
#'   no filing has any)
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_statement_names <- function(filings) {
  names_per_filing <- map(seq_along(filings), function(idx) {
    income <- filings[[idx]]$StatementsOfIncome
    names(income)
  })
  unlist(names_per_filing)
}
#' Get the filings parameters names (Balance Sheet)
#' @description
#' Collects the element names found under `BalanceSheets` in every filing.
#' @param filings **List** List of filings
#' @return **Character** Vector with all names, in filing order (`NULL` when
#'   no filing has any)
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_balance_names <- function(filings) {
  read_names <- function(idx) {
    names(filings[[idx]]$BalanceSheets)
  }
  seq_along(filings) %>%
    map(read_names) %>%
    unlist()
}
#' Get the filings parameters names (StatementsOfCashFlows)
#' @description
#' Collects the element names found under `StatementsOfCashFlows` in every
#' filing.
#' @param filings **List** List of filings
#' @return **Character** Vector with all names, in filing order (`NULL` when
#'   no filing has any)
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_cash_flow_names <- function(filings) {
  positions <- seq_along(filings)
  collected <- map(positions, function(pos) {
    cash_flows <- filings[[pos]]$StatementsOfCashFlows
    names(cash_flows)
  })
  unlist(collected)
}
#' Get the filings parameters names (BalanceSheets)
#' @description
#' Collects the element names found under `BalanceSheets` in every filing.
#' NOTE(review): a function with this same name is also defined earlier in
#' this file; being later, this definition is the one that takes effect.
#' @param filings **List** List of filings
#' @return **Character** Vector with all names, in filing order (`NULL` when
#'   no filing has any)
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_balance_names <- function(filings) {
  sheet_names <- map(
    seq_along(filings),
    function(k) names(filings[[k]]$BalanceSheets)
  )
  flattened <- unlist(sheet_names)
  flattened
}
#' Get the filings parameters names (StatementsOfCashFlows)
#' @description
#' Collects the element names found under `StatementsOfCashFlows` in every
#' filing.
#' @param filings **List** List of filings
#' @return **Character** Vector with all names, in filing order (`NULL` when
#'   no filing has any)
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_cash_names <- function(filings) {
  cash_flow_names_at <- function(n) {
    statement <- filings[[n]]$StatementsOfCashFlows
    names(statement)
  }
  unlist(map(seq_along(filings), cash_flow_names_at))
}
#' Get equity data frames
#' @description
#' Collects the `StockholdersEquity` entry of every filing's balance sheet.
#' Filings whose `BalanceSheets` is `NULL` or has at most one element are
#' skipped, and so are filings where the entry cannot be extracted; a line is
#' printed for each skipped filing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ equity data frames
#' @importFrom stringr str_detect
get_bulk_equity <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    sheets <- list_filings[[i]]$BalanceSheets
    if (is.null(sheets) || length(sheets) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }
    # Decide on the extracted value itself rather than on a substring search:
    # a name such as "LiabilitiesAndStockholdersEquity" contains the tag but
    # `$` (exact, then unique-prefix matching) still yields NULL, which used
    # to drop the filing without any message.
    equity <- sheets$StockholdersEquity
    if (is.null(equity)) {
      print("Equity data is missing")
    } else {
      result[[length(result) + 1]] <- equity
    }
  }
  result
}
#' Get long debt data frames
#' @description
#' Collects long-term debt from every filing's balance sheet. The
#' `LongTermDebt` entry is used when it can be extracted; otherwise, when both
#' `LongTermDebtNoncurrent` and `LongTermDebtCurrent` are present, their
#' `value` columns are added together. Filings whose `BalanceSheets` is `NULL`
#' or has at most one element, or that provide none of the above, are skipped
#' and a line is printed.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ long debt data frames (summed entries are
#'   one-column `value` data frames holding character values)
#' @importFrom stringr str_detect
#' @importFrom dplyr `%>%` inner_join select tibble
#' @importFrom tidyr unnest
get_bulk_long_debt <- function(list_filings) {
  components <- c("LongTermDebtNoncurrent", "LongTermDebtCurrent")
  result <- list()
  for (i in seq_along(list_filings)) {
    sheets <- list_filings[[i]]$BalanceSheets
    if (is.null(sheets) || length(sheets) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }
    # `$` matches exactly first and then by unique prefix, so it yields NULL
    # when only the two component tags exist (ambiguous prefix). The previous
    # substring test sent that case to this extraction anyway, so the
    # component branch could never run and the filing was silently dropped.
    debt <- sheets$LongTermDebt
    if (is.null(debt) && all(components %in% names(sheets))) {
      non_current <- as.numeric(
        as.data.frame(sheets$LongTermDebtNoncurrent)$value
      )
      current <- as.numeric(as.data.frame(sheets$LongTermDebtCurrent)$value)
      # Keep the sum as plain-digit text, like the string values of the other
      # entries, so downstream number parsing treats every entry the same way.
      debt <- data.frame(
        value = format(
          non_current + current,
          scientific = FALSE, trim = TRUE, digits = 15
        )
      )
    }
    if (is.null(debt)) {
      print("Long term debt data is missing")
    } else {
      result[[length(result) + 1]] <- debt
    }
  }
  result
}
#' Get current liabilities data frames
#' @description
#' Collects the `LiabilitiesCurrent` entry of every filing's balance sheet.
#' Filings whose `BalanceSheets` is `NULL` or has at most one element are
#' skipped, and so are filings where the entry cannot be extracted; a line is
#' printed for each skipped filing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ current liabilities data frames
#' @importFrom stringr str_detect
get_bulk_short_debt <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    sheets <- list_filings[[i]]$BalanceSheets
    if (is.null(sheets) || length(sheets) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }
    # Test the extracted value, not a substring hit: a name such as
    # "OtherLiabilitiesCurrent" contains the tag but `$` still returns NULL,
    # which previously dropped the filing without reporting it.
    liabilities <- sheets$LiabilitiesCurrent
    if (is.null(liabilities)) {
      print("LiabilitiesCurrent data is missing")
    } else {
      result[[length(result) + 1]] <- liabilities
    }
  }
  result
}
#' Get cash data frames
#' @description
#' Collects the `CashAndCashEquivalentsAtCarryingValue` entry of every
#' filing's balance sheet. Filings whose `BalanceSheets` is `NULL` or has at
#' most one element are skipped, and so are filings where the entry cannot be
#' extracted; a line is printed for each skipped filing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ cash data frames
#' @importFrom stringr str_detect
get_bulk_cash <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    sheets <- list_filings[[i]]$BalanceSheets
    if (is.null(sheets) || length(sheets) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }
    # Test the extracted value, not a substring hit: a longer name that merely
    # contains the tag makes `$` return NULL, which previously dropped the
    # filing without reporting it.
    cash <- sheets$CashAndCashEquivalentsAtCarryingValue
    if (is.null(cash)) {
      print("CashAndCashEquivalentsAtCarryingValue data is missing")
    } else {
      result[[length(result) + 1]] <- cash
    }
  }
  result
}
#' Get EBIT data frames
#' @description
#' Collects the `OperatingIncomeLoss` entry of every filing's income
#' statement. Filings whose `StatementsOfIncome` is `NULL` or has at most one
#' element are skipped, and so are filings where the entry cannot be
#' extracted; a line is printed for each skipped filing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ EBIT data frames
#' @importFrom stringr str_detect
get_bulk_ebit <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }
    # Test the extracted value, not a substring hit: a longer name that merely
    # contains the tag makes `$` return NULL, which previously dropped the
    # filing without reporting it.
    ebit <- income$OperatingIncomeLoss
    if (is.null(ebit)) {
      print("OperatingIncomeLoss data is missing")
    } else {
      result[[length(result) + 1]] <- ebit
    }
  }
  result
}
#' Get R&D data frames
#' @description
#' Collects R&D expense from every filing's income statement. The
#' `ResearchAndDevelopmentExpense` entry is used when it can be extracted,
#' otherwise `ResearchAndDevelopmentExpensesNet`. Filings whose
#' `StatementsOfIncome` is `NULL` or has at most one element, or that provide
#' neither entry, are skipped and a line is printed.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ R&D data frames
#' @importFrom stringr str_detect
get_bulk_rnd <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }
    # The first tag is a prefix of the second, so the old substring test made
    # the fallback unreachable. Falling back on a NULL extraction instead also
    # covers the case where `$` finds several prefix candidates.
    rnd <- income$ResearchAndDevelopmentExpense
    if (is.null(rnd)) {
      rnd <- income$ResearchAndDevelopmentExpensesNet
    }
    if (is.null(rnd)) {
      print("ResearchAndDevelopmentExpense data is missing")
    } else {
      result[[length(result) + 1]] <- rnd
    }
  }
  result
}
#' Get Revenues data frames
#' @description
#' Collects revenues from every filing's income statement. The `Revenues`
#' entry is used when it can be extracted, otherwise `SalesRevenueNet`.
#' Filings whose `StatementsOfIncome` is `NULL` or has at most one element, or
#' that provide neither entry, are skipped and a line is printed.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ revenues data frames
#' @importFrom stringr str_detect
get_bulk_revenues <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }
    # Fall back on a NULL extraction rather than on a failed substring search:
    # a name such as "CostOfRevenues" contains the tag, which used to block
    # the SalesRevenueNet fallback and drop the filing without a message.
    revenues <- income$Revenues
    if (is.null(revenues)) {
      revenues <- income$SalesRevenueNet
    }
    if (is.null(revenues)) {
      print("Revenues data is missing")
    } else {
      result[[length(result) + 1]] <- revenues
    }
  }
  result
}
#' Get CapEx data frames
#' @description
#' Collects the `PaymentsToAcquirePropertyPlantAndEquipment` entry of every
#' filing's cash flow statement. Filings whose `StatementsOfCashFlows` is
#' `NULL` or has at most one element are skipped, and so are filings where
#' the entry cannot be extracted; a line is printed for each skipped filing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ CapEx data frames
#' @importFrom stringr str_detect
get_bulk_capex <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    cash_flows <- list_filings[[i]]$StatementsOfCashFlows
    if (is.null(cash_flows) || length(cash_flows) <= 1) {
      print("Missing Cash Flow Statement (NULL or length <=1")
      next
    }
    # Test the extracted value, not a substring hit, so a filing is never
    # dropped without being reported.
    capex <- cash_flows$PaymentsToAcquirePropertyPlantAndEquipment
    if (is.null(capex)) {
      # The message used to say "Revenues" (copy-paste from the revenues
      # collector); it now names the entry that is actually missing.
      print("PaymentsToAcquirePropertyPlantAndEquipment data is missing")
    } else {
      result[[length(result) + 1]] <- capex
    }
  }
  result
}
#' Get summarized industry data
#' @description
#' Runs every metric collector (equity, long debt, short debt, EBIT, revenue,
#' R&D, CapEx) over the filings, formats the collected values, and combines
#' the per-metric medians into one long data frame.
#' @param filings **List** List of filings (industry level)
#' @param industry *Character* Industry naming
#' @return **List** with `summarized` (long data frame with columns
#'   `industry`, `param`, `value`) and `all_values` (named list with the
#'   collected values of each metric)
#' @importFrom dplyr bind_rows
#' @importFrom purrr reduce
#' @importFrom tidyr pivot_longer
get_data_clean <- function(filings, industry) {
  # Collectors in the order they are run; the names double as metric labels.
  collectors <- list(
    equity = get_bulk_equity,
    long_debt = get_bulk_long_debt,
    short_debt = get_bulk_short_debt,
    ebit = get_bulk_ebit,
    revenue = get_bulk_revenues,
    rnd = get_bulk_rnd,
    capex = get_bulk_capex
  )
  totals <- lapply(names(collectors), function(metric) {
    collected <- collectors[[metric]](filings)
    get_formatted_value(
      get_bulk_values(collected),
      industry_name = industry,
      type_of = metric
    )
  })
  names(totals) <- names(collectors)

  # A metric without data has a NULL total, hence NULL values and summary.
  value_list <- lapply(totals, function(total) total$values)

  # The summary rows are stacked income-statement metrics first.
  summary_order <- c(
    "ebit", "revenue", "rnd", "capex", "equity", "long_debt", "short_debt"
  )
  data_list <- lapply(summary_order, function(metric) {
    totals[[metric]]$summarized_df
  })

  cons <- reduce(data_list, bind_rows)
  df_final <- na.omit(
    pivot_longer(
      cons,
      cols = -c("industry"), names_to = "param", values_to = "value"
    )
  )
  list(summarized = df_final, all_values = value_list)
}
#' Get summarized industry data (complete pipeline)
#' @description
#' Get summarized industry data (StatementsofIncome, CashFlow, BalanceSheets).
#' This method includes the whole pipeline: for each industry it fetches the
#' tickers, samples them, fetches and samples the filing links, downloads the
#' filings, and summarizes them. Per-industry raw values are saved under
#' `raw_data/` and the aggregated table is rewritten to
#' `raw_data/aggregated_industry_data.csv` after every industry.
#' @param api_key **Character** API key
#' @param industry_list **List** List of industries of interest
#' @param p_tickers **Number** Share of the tickers to sample
#'   (sample size is `length(tickers) * p_tickers`)
#' @param p_links **Number** Share of the filing links to sample
#'   (sample size is `length(links) * p_links`)
#' @return **List** with `total` (per-industry summaries) and `df` (wide data
#'   frame with one row per industry; `NULL` when `industry_list` is empty)
#' @importFrom tidyr pivot_wider
#' @importFrom purrr reduce
#' @importFrom dplyr bind_rows
#' @importFrom readr write_csv
get_all_industries_summary <- function(api_key, industry_list, p_tickers, p_links) {
  summary_total <- list()
  # Defined up front so an empty industry_list returns cleanly.
  result <- NULL
  if (!dir.exists("raw_data")) {
    dir.create("raw_data")
  } else {
    print("Directory: raw_data already exists!")
  }
  for (i in seq_along(industry_list)) {
    industry <- industry_list[i]
    # Get tickers
    tickers <- get_industry_tickers(api_key = api_key, industry_name = industry)
    # Get links
    links <- get_industry_links(
      api_key = api_key,
      ticker_list = sample(tickers, size = length(tickers) * p_tickers)
    )
    # Get filings
    filings <- get_industry_filings(
      api_key = api_key,
      filing_links = sample(links, size = length(links) * p_links)
    )
    # Summarize once and reuse the result for both the saved raw values and
    # the running summary (it used to be computed twice per industry).
    cleaned <- get_data_clean(filings, industry = industry)
    saveRDS(cleaned[2], paste0("raw_data/data_", industry))
    # Each entry stays wrapped in a one-element list, as callers expect.
    summary_total[[length(summary_total) + 1]] <- list(cleaned)
    # Rebuild the aggregated table from everything collected so far and
    # rewrite the CSV after every industry.
    summarized <- lapply(summary_total, function(entry) entry[[1]]$summarized)
    result <- pivot_wider(
      reduce(summarized, bind_rows),
      names_from = param, values_from = value
    )
    write_csv(result, "raw_data/aggregated_industry_data.csv")
  }
  list(total = summary_total, df = result)
}
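# NOTE: the two lines below are leftover web-page text (embed instructions),
# not R code; they are kept as comments so the file still parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.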