R/sec_api_bulk_methods.R

Defines functions get_all_industries_summary get_data_clean get_bulk_capex get_bulk_revenues get_bulk_rnd get_bulk_ebit get_bulk_cash get_bulk_short_debt get_bulk_long_debt get_bulk_equity get_cash_names get_balance_names get_cash_flow_names get_balance_names get_statement_names get_formatted_value get_bulk_values get_industry_filings get_industry_links get_industry_tickers

Documented in get_all_industries_summary get_balance_names get_bulk_capex get_bulk_cash get_bulk_ebit get_bulk_equity get_bulk_long_debt get_bulk_revenues get_bulk_rnd get_bulk_short_debt get_bulk_values get_cash_flow_names get_cash_names get_data_clean get_formatted_value get_industry_filings get_industry_links get_industry_tickers get_statement_names

#' Fetch representative tickers of a specific industry
#' @description
#' Call the SEC API (through the Python `sec_api` module) and collect the
#' `ticker` field of every entry returned for the industry. The Python module
#' is installed with pip only when it cannot be imported yet.
#' @param api_key **Character** SEC API key
#' @param industry_name **Character** Industry name
#' @return Flattened (`unlist()`-ed) tickers; `NULL` when nothing is returned
#' @examples
#' # Industry name
#' # industry_name <- "Consumer Electronics"
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom reticulate import r_to_py py_install py_module_available
#' @importFrom purrr map
#' @export
get_industry_tickers <- function(api_key, industry_name) {
  # Only shell out to pip when the module is missing; the unconditional
  # install used to run again on every single call.
  if (!py_module_available("sec_api")) {
    py_install("sec_api", pip = TRUE)
  }
  sec <- import("sec_api")

  query_api <- sec$MappingApi(api_key = api_key)
  by_industry <- query_api$resolve("industry", industry_name)

  # One ticker per returned entry; entries without a ticker vanish in unlist()
  tickers <- map(
    seq_along(by_industry),
    function(idx) by_industry[[idx]]$ticker
  )

  unlist(tickers)
}

#' Fetch the filing links for a set of tickers
#' @description
#' For every ticker: print it, pause for half a second and call
#' `get_all_10k_filings()`. Empty results are dropped with `compact()` and
#' the remaining links are flattened with `unlist()`.
#' @param ticker_list **Character** List of tickers
#' @param api_key **Character** SEC API key
#' @return Flattened links returned by `get_all_10k_filings()`
#' @examples
#' # Tickers
#' # ticker_list <- get_industry_tickers output
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom purrr map compact
#' @export
get_industry_links <- function(ticker_list, api_key) {
  # Progress output and the pause happen before each request
  fetch_links <- function(ticker) {
    print(ticker)
    Sys.sleep(0.5)
    get_all_10k_filings(ticker, api_key)
  }

  links_per_ticker <- map(ticker_list, fetch_links)

  unlist(compact(links_per_ticker))
}

#' Fetch the company filings that represent an industry
#' @description
#' For every filing link: print it, pause for one second and call
#' `get_filing()`. Empty results are dropped with `compact()`.
#' @param filing_links **Character** List of industry links
#' @param api_key **Character** SEC API key
#' @return **List** List of filings (whole industry)
#' @examples
#' # Filing links
#' # filing_links <- "get_industry_links output"
#' # API key
#' # api_key <- "XXXXXXXXX"
#' @importFrom dplyr `%>%`
#' @importFrom purrr map compact
#' @export
get_industry_filings <- function(filing_links, api_key) {
  # Progress output and the pause happen before each request
  fetch_filing <- function(link) {
    print(link)
    Sys.sleep(1)
    get_filing(link, api_key)
  }

  filing_links %>%
    map(fetch_filing) %>%
    compact()
}

#' Get the parameter values
#' @description
#' Reduce every filing item to a one-column (`value`) data frame.
#' Items whose column names mention "segment" are sorted by `segment` and only
#' their last two rows are kept; items without a "value" column are skipped
#' with a console note.
#' @param filings **List** List of filtered filing items (data-frame like)
#' @return **List** List of data frames holding only the `value` column
#' @importFrom stringr str_detect
#' @importFrom dplyr arrange slice_tail select
get_bulk_values <- function(filings) {
  collected <- list()

  for (idx in seq_along(filings)) {
    item <- filings[[idx]]
    col_names <- colnames(item)
    has_segment <- any(str_detect(col_names, "segment"))
    has_value <- any(str_detect(col_names, "value"))

    if (has_segment && has_value) {
      # Segmented item: keep the two rows that sort last by segment
      sorted <- arrange(data.frame(item), segment)
      collected[[length(collected) + 1]] <- select(slice_tail(sorted, n = 2), value)
    } else if (has_value) {
      collected[[length(collected) + 1]] <- select(data.frame(item), value)
    } else {
      print("Passing...")
    }
  }

  collected
}

#' Format the parameter values and calculate median
#' @description
#' Flatten the collected values, convert them to numbers, scale them by
#' 1/1,000,000 and summarise them with the median (missing values ignored).
#' @param bulk_values **List** List of filtered data frames
#' @param industry_name **Character** Industry name
#' @param type_of **Character** Parameter name; used as the name of the
#'   median column
#' @return **List** with `summarized_df` (one-row tibble: `industry` plus a
#'   column named after `type_of`) and `values` (all scaled values).
#'   Returns `NULL` invisibly when `bulk_values` holds no values.
#' @importFrom tidyr tibble
#' @importFrom readr parse_number
get_formatted_value <- function(bulk_values, industry_name, type_of) {
  values_list <- unlist(bulk_values)
  if (length(values_list) == 0) {
    return(invisible(NULL))
  }

  if (is.numeric(values_list)) {
    # parse_number() only accepts character input; numbers need no parsing
    parsed <- as.numeric(values_list)
  } else {
    if (!is.character(values_list)) {
      values_list <- as.character(values_list)
    }
    parsed <- parse_number(values_list)
  }

  # Parse once and reuse for both the median and the returned values
  scaled <- parsed / 1000000

  summarized_df <- tibble(
    industry = industry_name,
    median_value = stats::median(scaled, na.rm = TRUE)
  )
  names(summarized_df)[names(summarized_df) == "median_value"] <- type_of

  list(summarized_df = summarized_df, values = scaled)
}

#' Get the filings parameters names (Statements of Income)
#' @description
#' Collect the element names found under `StatementsOfIncome` in every filing.
#' @param filings **List** List of filings
#' @return **Character** All names found, in filing order (duplicates kept);
#'   `NULL` when no filing carries the statement
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_statement_names <- function(filings) {
  per_filing <- lapply(
    seq_along(filings),
    function(idx) names(filings[[idx]]$StatementsOfIncome)
  )
  unlist(per_filing)
}

#' Get the filings parameters names (Balance Sheet)
#' @description
#' Collect the element names found under `BalanceSheets` in every filing.
#' @param filings **List** List of filings
#' @return **Character** All names found, in filing order (duplicates kept);
#'   `NULL` when no filing carries a balance sheet
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_balance_names <- function(filings) {
  per_filing <- lapply(
    seq_along(filings),
    function(idx) names(filings[[idx]]$BalanceSheets)
  )
  unlist(per_filing)
}



#' Get the filings parameters names (StatementsOfCashFlows)
#' @description
#' Collect the element names found under `StatementsOfCashFlows` in every
#' filing.
#' @param filings **List** List of filings
#' @return **Character** All names found, in filing order (duplicates kept);
#'   `NULL` when no filing carries a cash flow statement
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_cash_flow_names <- function(filings) {
  per_filing <- lapply(
    seq_along(filings),
    function(idx) names(filings[[idx]]$StatementsOfCashFlows)
  )
  unlist(per_filing)
}

#' Get the filings parameters names (BalanceSheets)
#' @description
#' Collect the element names found under `BalanceSheets` in every filing.
#' @param filings **List** List of filings
#' @return **Character** All names found, in filing order (duplicates kept);
#'   `NULL` when no filing carries a balance sheet
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
# NOTE(review): `get_balance_names` is also defined earlier in this file;
# when the file is sourced this later definition is the one that remains.
get_balance_names <- function(filings) {
  per_filing <- lapply(
    seq_along(filings),
    function(idx) names(filings[[idx]]$BalanceSheets)
  )
  unlist(per_filing)
}

#' Get the filings parameters names (StatementsOfCashFlows)
#' @description
#' Collect the element names found under `StatementsOfCashFlows` in every
#' filing.
#' @param filings **List** List of filings
#' @return **Character** All names found, in filing order (duplicates kept);
#'   `NULL` when no filing carries a cash flow statement
#' @importFrom purrr map
#' @importFrom dplyr `%>%`
get_cash_names <- function(filings) {
  per_filing <- lapply(
    seq_along(filings),
    function(idx) names(filings[[idx]]$StatementsOfCashFlows)
  )
  unlist(per_filing)
}

#' Get equity data frames
#' @description
#' Get equity data frames from the whole industry level filings.
#' For every filing with a usable balance sheet (non-`NULL`, more than one
#' element) the `StockholdersEquity` element is collected. When no element
#' carries exactly that name, a single element whose name starts with it is
#' used instead (the prefix matching the earlier `$` access performed).
#' A console note is printed for every filing that contributes nothing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ equity data frames
#' @importFrom stringr str_detect
get_bulk_equity <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    sheet <- list_filings[[i]]$BalanceSheets
    if (is.null(sheet) || length(sheet) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }

    # Look the element up directly instead of substring-testing the names:
    # names such as "LiabilitiesAndStockholdersEquity" used to pass the test
    # and the filing was then skipped without any message.
    equity <- sheet[["StockholdersEquity", exact = FALSE]]
    if (is.null(equity)) {
      print("Equity data is missing")
    } else {
      result[[length(result) + 1]] <- equity
    }
  }
  result
}

#' Get long debt data frames
#' @description
#' Get long debt data frames from the whole industry level filings.
#' For every filing with a usable balance sheet (non-`NULL`, more than one
#' element) the `LongTermDebt` element is collected (exact name first, else a
#' single element whose name starts with it, as the earlier `$` access did).
#' When that lookup finds nothing, `LongTermDebtNoncurrent` and
#' `LongTermDebtCurrent` are added up element-wise; if only one of the two is
#' present its values are used alone.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ long debt data frames. Frames built from the
#'   non-current/current parts hold a single character column `value`.
#' @importFrom stringr str_detect
#' @importFrom dplyr `%>%` inner_join select tibble
#' @importFrom tidyr unnest
get_bulk_long_debt <- function(list_filings) {
  # Numeric `value` column of a balance-sheet item; numeric(0) when absent
  item_values <- function(item) {
    as.numeric(as.data.frame(item)$value)
  }

  result <- list()
  for (i in seq_along(list_filings)) {
    sheet <- list_filings[[i]]$BalanceSheets
    if (is.null(sheet) || length(sheet) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }

    debt <- sheet[["LongTermDebt", exact = FALSE]]

    if (is.null(debt)) {
      # This fallback used to be unreachable: the substring test for
      # "LongTermDebt" also matched both component names.
      non_current <- item_values(sheet[["LongTermDebtNoncurrent"]])
      current <- item_values(sheet[["LongTermDebtCurrent"]])

      if (length(non_current) > 0 && length(current) > 0) {
        total <- non_current + current
      } else {
        total <- c(non_current, current)
      }

      if (length(total) > 0) {
        # Kept as text, presumably like the API-provided frames, because the
        # values are later run through parse_number() -- TODO confirm
        debt <- data.frame(
          value = format(total, scientific = FALSE, trim = TRUE),
          stringsAsFactors = FALSE
        )
      }
    }

    if (is.null(debt)) {
      print("Long term debt data is missing")
    } else {
      result[[length(result) + 1]] <- debt
    }
  }
  result
}

#' Get current liabilities data frames
#' @description
#' Get current liabilities data frames from the whole industry level filings.
#' For every filing with a usable balance sheet (non-`NULL`, more than one
#' element) the `LiabilitiesCurrent` element is collected (exact name first,
#' else a single element whose name starts with it, as the earlier `$` access
#' did). A console note is printed for every filing that contributes nothing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ current liabilities data frames
#' @importFrom stringr str_detect
get_bulk_short_debt <- function(list_filings) {
  result <- list()
  for (filing in list_filings) {
    sheet <- filing$BalanceSheets
    if (is.null(sheet) || length(sheet) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      next
    }

    # Direct lookup: names like "OtherLiabilitiesCurrent" used to satisfy the
    # substring test and the filing was then dropped without a message.
    liabilities <- sheet[["LiabilitiesCurrent", exact = FALSE]]
    if (is.null(liabilities)) {
      print("LiabilitiesCurrent data is missing")
    } else {
      result[[length(result) + 1]] <- liabilities
    }
  }
  result
}

#' Get cash data frames
#' @description
#' Get cash data frames from the whole industry level filings.
#' For every filing with a usable balance sheet (non-`NULL`, more than one
#' element) the `CashAndCashEquivalentsAtCarryingValue` element is collected
#' (exact name first, else a single element whose name starts with it, as the
#' earlier `$` access did). A console note is printed for every filing that
#' contributes nothing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ cash data frames
#' @importFrom stringr str_detect
get_bulk_cash <- function(list_filings) {
  cash_key <- "CashAndCashEquivalentsAtCarryingValue"

  per_filing <- lapply(seq_along(list_filings), function(i) {
    sheet <- list_filings[[i]]$BalanceSheets
    if (is.null(sheet) || length(sheet) <= 1) {
      print("Missing Balance Sheet (NULL or length <=1")
      return(NULL)
    }

    # Direct lookup so that a filing without the element always gets a note
    cash <- sheet[[cash_key, exact = FALSE]]
    if (is.null(cash)) {
      print("CashAndCashEquivalentsAtCarryingValue data is missing")
    }
    cash
  })

  # Keep only the filings that actually delivered a cash element
  Filter(Negate(is.null), per_filing)
}

#' Get EBIT data frames
#' @description
#' Get EBIT data frames from the whole industry level filings.
#' For every filing with a usable income statement (non-`NULL`, more than one
#' element) the `OperatingIncomeLoss` element is collected (exact name first,
#' else a single element whose name starts with it, as the earlier `$` access
#' did). A console note is printed for every filing that contributes nothing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ EBIT data frames
#' @importFrom stringr str_detect
get_bulk_ebit <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }

    # Direct lookup so that a filing without the element always gets a note
    ebit <- income[["OperatingIncomeLoss", exact = FALSE]]
    if (is.null(ebit)) {
      print("OperatingIncomeLoss data is missing")
    } else {
      result[[length(result) + 1]] <- ebit
    }
  }
  result
}

#' Get R&D data frames
#' @description
#' Get R&D data frames from the whole industry level filings.
#' For every filing with a usable income statement (non-`NULL`, more than one
#' element) `ResearchAndDevelopmentExpense` is collected (exact name first,
#' else a single element whose name starts with it, as the earlier `$` access
#' did); when that finds nothing, `ResearchAndDevelopmentExpensesNet` is tried.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ R&D data frames
#' @importFrom stringr str_detect
get_bulk_rnd <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }

    rnd <- income[["ResearchAndDevelopmentExpense", exact = FALSE]]
    if (is.null(rnd)) {
      # This fallback used to be unreachable: the first substring test also
      # matched "ResearchAndDevelopmentExpensesNet".
      rnd <- income[["ResearchAndDevelopmentExpensesNet", exact = FALSE]]
    }

    if (is.null(rnd)) {
      print("ResearchAndDevelopmentExpense data is missing")
    } else {
      result[[length(result) + 1]] <- rnd
    }
  }
  result
}

#' Get Revenues data frames
#' @description
#' Get revenues data frames from the whole industry level filings.
#' For every filing with a usable income statement (non-`NULL`, more than one
#' element) `Revenues` is collected (exact name first, else a single element
#' whose name starts with it, as the earlier `$` access did); when that finds
#' nothing, `SalesRevenueNet` is tried.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ revenues data frames
#' @importFrom stringr str_detect
get_bulk_revenues <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    income <- list_filings[[i]]$StatementsOfIncome
    if (is.null(income) || length(income) <= 1) {
      print("Missing Income Statement (NULL or length <=1")
      next
    }

    revenues <- income[["Revenues", exact = FALSE]]
    if (is.null(revenues)) {
      # Previously skipped whenever any name merely contained "Revenues",
      # even though no such element could be extracted.
      revenues <- income[["SalesRevenueNet", exact = FALSE]]
    }

    if (is.null(revenues)) {
      print("Revenues data is missing")
    } else {
      result[[length(result) + 1]] <- revenues
    }
  }
  result
}

#' Get CapEx data frames
#' @description
#' Get CapEx data frames from the whole industry level filings.
#' For every filing with a usable cash flow statement (non-`NULL`, more than
#' one element) the `PaymentsToAcquirePropertyPlantAndEquipment` element is
#' collected (exact name first, else a single element whose name starts with
#' it, as the earlier `$` access did). A console note is printed for every
#' filing that contributes nothing.
#' @param list_filings **List** List of filings (industry level)
#' @return **List** List w/ CapEx data frames
#' @importFrom stringr str_detect
get_bulk_capex <- function(list_filings) {
  result <- list()
  for (i in seq_along(list_filings)) {
    cash_flows <- list_filings[[i]]$StatementsOfCashFlows
    if (is.null(cash_flows) || length(cash_flows) <= 1) {
      print("Missing Cash Flow Statement (NULL or length <=1")
      next
    }

    capex <- cash_flows[["PaymentsToAcquirePropertyPlantAndEquipment", exact = FALSE]]
    if (is.null(capex)) {
      # The note used to read "Revenues data is missing" (copy-paste slip)
      print("PaymentsToAcquirePropertyPlantAndEquipment data is missing")
    } else {
      result[[length(result) + 1]] <- capex
    }
  }
  result
}

#' Get summarized industry data
#' @description
#' Get summarized industry data (StatementsofIncome, CashFlow, BalanceSheets).
#' Each metric (equity, long debt, short debt, EBIT, revenue, R&D, CapEx) is
#' extracted with its `get_bulk_*()` function, reduced with
#' `get_bulk_values()` and summarised with `get_formatted_value()`.
#' @param filings **List** List of filings (industry level)
#' @param industry **Character** Industry naming
#' @return **List** with `summarized` (long data frame with the columns
#'   `industry`, `param`, `value`; rows with missing values removed) and
#'   `all_values` (named list of the collected values per metric; `NULL` for
#'   a metric without data). When no metric has data, `summarized` has zero
#'   rows and a warning is raised.
#' @importFrom dplyr `%>%` bind_rows
#' @importFrom purrr reduce
#' @importFrom tidyr pivot_longer tibble
get_data_clean <- function(filings, industry) {
  # Metric name -> extractor, in the order the metrics are collected.
  # This order also fixes the order of `all_values` and of the console notes.
  extractors <- list(
    equity = get_bulk_equity,
    long_debt = get_bulk_long_debt,
    short_debt = get_bulk_short_debt,
    ebit = get_bulk_ebit,
    revenue = get_bulk_revenues,
    rnd = get_bulk_rnd,
    capex = get_bulk_capex
  )

  totals <- list()
  for (metric in names(extractors)) {
    frames <- extractors[[metric]](filings)
    total <- get_formatted_value(
      get_bulk_values(frames),
      industry_name = industry,
      type_of = metric
    )
    # Assigning list(total) keeps the slot even when `total` is NULL
    totals[metric] <- list(total)
  }

  value_list <- lapply(totals, function(total) total$values)

  # Row order of the summary: income/cash-flow metrics first, then balance sheet
  summary_order <- c(
    "ebit", "revenue", "rnd", "capex", "equity", "long_debt", "short_debt"
  )
  data_list <- unname(
    lapply(totals[summary_order], function(total) total$summarized_df)
  )

  cons <- reduce(data_list, bind_rows)

  if (!("industry" %in% names(cons))) {
    # No metric produced a summary; pivot_longer() would fail on the missing
    # `industry` column, so hand back an empty summary instead.
    warning("No financial values found for industry: ", industry,
            call. = FALSE)
    df_final <- tibble(
      industry = character(),
      param = character(),
      value = numeric()
    )
  } else {
    df_final <- pivot_longer(cons, cols = -c("industry"), names_to = "param",
                             values_to = "value") %>%
      stats::na.omit()
  }

  list(summarized = df_final, all_values = value_list)
}

#' Get summarized industry data (complete pipeline)
#' @description
#' Get summarized industry data (StatementsofIncome, CashFlow, BalanceSheets).
#' This method includes the whole pipeline (API filing collecting and
#' summarizing). For every industry the collected raw values are saved to
#' `raw_data/data_<industry>` and the aggregated table is rewritten to
#' `raw_data/aggregated_industry_data.csv`.
#' @param api_key **Character** API key
#' @param industry_list **List** List of industries of interest
#' @param p_tickers **Number** Share of the industry tickers to sample
#'   (multiplied with the number of tickers)
#' @param p_links **Number** Share of the filing links to sample
#'   (multiplied with the number of links)
#' @return **List** with `total` (per-industry results of `get_data_clean()`,
#'   each wrapped in a one-element list) and `df` (wide data frame, one column
#'   per parameter; an empty data frame when `industry_list` is empty)
#' @importFrom dplyr `%>%` bind_rows
#' @importFrom tidyr pivot_wider
#' @importFrom purrr reduce
#' @importFrom readr write_csv
get_all_industries_summary <- function(api_key, industry_list, p_tickers, p_links) {
  summary_total <- list()
  # Defined up front so that an empty `industry_list` still returns cleanly
  result <- data.frame()

  if (!dir.exists("raw_data")) {
    dir.create("raw_data")
  } else {
    print("Directory: raw_data already exists!")
  }

  for (i in seq_along(industry_list)) {
    # `[[` yields the bare industry name for vectors and lists alike
    industry <- industry_list[[i]]

    # Get tickers
    tickers <- get_industry_tickers(api_key = api_key, industry_name = industry)
    # Get links
    links <- get_industry_links(
      api_key = api_key,
      ticker_list = sample(tickers, size = length(tickers) * p_tickers)
    )
    # Get filings
    filings <- get_industry_filings(
      api_key = api_key,
      filing_links = sample(links, size = length(links) * p_links)
    )

    # Summarise once and reuse the result for both the raw dump and the summary
    clean <- get_data_clean(filings, industry = industry)

    # Save the raw values of this industry
    saveRDS(clean["all_values"], paste0("raw_data/data_", industry))

    # Append to list of summaries (each entry stays wrapped in a list)
    summary_total[[length(summary_total) + 1]] <- list(clean)

    # Rebuild and rewrite the aggregate after every industry, so the file on
    # disk always reflects the industries processed so far.
    summaries <- lapply(summary_total, function(entry) entry[[1]]$summarized)
    result <- reduce(summaries, bind_rows) %>%
      pivot_wider(names_from = param, values_from = value)

    write_csv(result, "raw_data/aggregated_industry_data.csv")
  }

  list(total = summary_total, df = result)
}
TracyRage/fun_valuation documentation built on Jan. 29, 2023, 8:41 a.m.