Nothing
#' Submission URL Tools
#'
#' EDGAR submissions are organized fairly regularly. These functions help to
#' find the URL to submission components.
#' @param cik Company code
#' @param accession accession number for a filing
#' @param filename filename provided in a submission
#' @return A string with URL requested
#' @examples
#' submission_index_href("0000712515", "0000712515-17-000090")
#' submission_href("0000712515", "0000712515-17-000090")
#' submission_file_href("0000712515", "0000712515-17-000090",
#'                      "pressrelease-ueberroth.htm")
#' @export
submission_index_href <- function(cik, accession) {
  # The index page is the file named "<accession>-index.htm" inside the
  # submission's folder.
  index_file <- paste0(accession, "-index.htm")
  submission_file_href(cik, accession, index_file)
}
#' @describeIn submission_index_href Creates a link to the master SGML
#'   submission file, which is named after the accession number with a
#'   \code{.txt} extension
#' @export
submission_href <- function(cik, accession) {
  # The complete submission is the file named "<accession>.txt" inside the
  # submission's folder.
  master_file <- paste0(accession, ".txt")
  submission_file_href(cik, accession, master_file)
}
#' @describeIn submission_index_href provides the link to a given file within a
#'   particular submission.
#' @export
submission_file_href <- function(cik, accession, filename) {
  # The archive path uses the CIK without its leading zeros and the accession
  # number without its dashes.
  cik_path <- sub("^0+", "", cik)
  accession_path <- gsub("-", "", accession, fixed = TRUE)
  paste("https://www.sec.gov/Archives/edgar/data",
        cik_path, accession_path, filename,
        sep = "/")
}
#' Company URL for a CIK
#'
#' Given a CIK, provide a link to the company information page.
#'
#' @param cik Company code
#' @param ownership (default: FALSE) boolean for inclusion of company change
#'   filings. \code{TRUE} is sent as \code{"include"}; \code{FALSE} and the
#'   empty string are sent as \code{"exclude"}; any other value is placed in
#'   the \code{owner} query parameter unchanged.
#' @param atom (default: FALSE) if the link should be to the atom XML feed
#' @return A string with URL requested
#' @examples
#' company_href("0000037912")
#' @export
company_href <- function(cik, ownership = FALSE, atom = FALSE) {
  # Translate the flag into the keyword used by the `owner` query parameter.
  if (ownership == TRUE) {
    owner <- "include"
  } else if (ownership == FALSE || ownership == "") {
    owner <- "exclude"
  } else {
    owner <- ownership
  }

  # Only the atom feed needs an extra parameter; the HTML page is the default.
  feed_suffix <- ifelse(atom, "&output=atom", "")

  paste0(
    "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany",
    "&CIK=", cik,
    "&owner=", owner,
    "&hidefilings=0",
    feed_suffix
  )
}
# Tests, element by element, whether `x` starts with an http://, https://,
# ftp:// or ftps:// scheme. The match is case-insensitive.
#
# @param x character vector to test
# @return logical vector the same length as `x`
is_url <- function(x) {
  scheme_prefix <- "^(http|ftp)s?://"
  grepl(scheme_prefix, x, ignore.case = TRUE)
}
# Turns `x` into a parsed HTML document.
#
# @param x a URL (fetched with edgar_GET()), a string of markup, or anything
#   that is not of type character, which is returned without parsing.
# @param clean when TRUE the markup is passed through clean_html() before
#   parsing and the resulting document through clean_doc() afterwards.
# @return the document, cleaned if requested
get_doc <- function(x, clean = FALSE) {
  # Parse with xml2's defaults and retry with the 'HUGE' option only if that
  # fails. The 'HUGE' option can lead to some negative consequences with
  # particularly large documents, but given how poorly formed a lot of SEC
  # filings are, it is needed to ensure some parse at all.
  # NOTE(review): `options = "HUGE"` replaces xml2's default option set rather
  # than adding to it - confirm that is what the retry is meant to do.
  parse_with_fallback <- function(markup, ...) {
    tryCatch(
      xml2::read_html(markup, ...),
      error = function(e) xml2::read_html(markup, ..., options = "HUGE")
    )
  }

  if (typeof(x) != "character") {
    doc <- x
  } else if (is_url(x)) {
    res <- edgar_GET(x)
    content <- httr::content(res, encoding = "UTF-8", as = "text")
    if (clean) {
      content <- clean_html(content)
    }
    doc <- parse_with_fallback(content, base_url = x)
  } else {
    content <- if (clean) clean_html(x) else x
    doc <- parse_with_fallback(content)
  }

  if (clean) clean_doc(doc) else doc
}
# Parses a URL or a string of markup into an HTML document.
#
# @param x a URL, which is fetched with edgar_GET(), or markup to parse as is
# @return the result of xml2::read_html() called with the "HUGE" option
# Stops with an error when a fetched URL does not answer with HTTP status 200.
charToDoc <- function(x) {
  if (!is_url(x)) {
    return(xml2::read_html(x, options = "HUGE"))
  }

  res <- edgar_GET(x)
  # Read the status through the httr accessor instead of `res$status`, which
  # depends on `$` partial matching and on comparing a number with a string.
  # NOTE(review): assumes edgar_GET() returns an httr response, as its use
  # with httr::content() elsewhere in this file suggests - confirm.
  if (httr::status_code(res) != 200L) {
    stop("Unable to reach the SEC endpoint (", x, ")", call. = FALSE)
  }
  xml2::read_html(res, base_url = x, options = "HUGE")
}
# Returns the content behind a URL, or `x` itself when it is not a URL.
#
# @param x a URL, which is fetched with edgar_GET(), or any other string
# @return the response content for a URL, otherwise `x` unchanged
# Stops with an error when a fetched URL does not answer with HTTP status 200.
charToText <- function(x) {
  if (!is_url(x)) {
    return(x)
  }

  res <- edgar_GET(x)
  # Read the status through the httr accessor instead of `res$status`, which
  # depends on `$` partial matching and on comparing a number with a string.
  # NOTE(review): assumes edgar_GET() returns an httr response - confirm.
  if (httr::status_code(res) != 200L) {
    stop("Unable to reach the SEC endpoint (", x, ")", call. = FALSE)
  }
  # NOTE(review): without `as = "text"` httr::content() chooses a parser from
  # the response's content type, so the result may not always be a string -
  # confirm that callers expect this.
  httr::content(res, encoding = "UTF-8")
}
# Character code / replacement pairs, one pair per row. Column 1 holds the
# decimal character code as a string and column 2 the plain text that stands
# in for it.
unicode_map <- rbind(
  ## Spaces
  c("160", " "),
  c("32", " "),
  c("8194", " "),   # En Space
  c("8195", " "),   # Em Space
  c("8203", ""),    # Zero-width space
  ## Hyphens
  c("151", " - "),
  c("8208", " - "), # Hyphen
  c("8209", " - "), # Non-breaking Hyphen
  c("8210", " - "), # Figure Dash
  c("8211", " - "), # En dash
  c("8212", " - "), # Em dash
  c("8213", " - "), # Horizontal Bar
  c("8722", " - "), # Minus Sign
  ## Quotes
  c("145", "'"),    # Private use One
  c("146", "'"),    # Possessive Quote
  c("8216", "'"),   # Left Single Quote
  c("8217", "'"),   # Right Single Quote
  c("147", "\""),   # Set Transmit State (rendered as double quote)
  c("148", "\""),   # Cancel Character (rendered as double quote)
  c("8220", "\""),  # Left Double Quote
  c("8221", "\""),  # Right Double Quote
  ## Other
  c("8232", "\n"),  # Line Separator
  c("8260", "/"),   # Fraction Slash
  c("038", "&")
)
## Maps the text to look for in a filing to the plain text that replaces it.
## This is a highly curated list based on what is actually seen in filings
## rather than an exhaustive code mapping.
##
## Every key is generated from a character code or spelled as an ASCII entity
## name, so this file holds no raw typographic, invisible or line-breaking
## characters. For each code the map carries the decimal reference
## ("&#8211;"), both hex spellings ("&#x2013;", "&#X..." is not generated,
## only the case of the digits varies) and, for codes of 160 and above, the
## character itself. Codes below 160 are matched in reference form only: a
## literal space or "&" would replace itself, and 145-151 are the codes the
## notes on unicode_map describe as rendered quotes and dashes.
##
## The "&" entries come last so that text such as "&amp;#8211;" is unescaped
## once rather than twice.
##
## NOTE(review): the named entities below (&ndash;, &mdash;, &horbar;,
## &lsquo;, &rsquo;, &ldquo;, &rdquo;, &nbsp;, &amp;) are inferred from the
## characters this list replaced - confirm them against real filings.
## NOTE(review): the zero-width space (8203) is replaced by a space here, as
## in the list this replaces; unicode_map pairs it with "" instead.
html_escape_map <- local({
  entry <- function(to, codes, named = character(0)) {
    list(to = to, codes = as.integer(codes), named = named)
  }

  groups <- list(
    entry("\n", 8232),
    entry(" - ", c(151, 8208:8213, 8722),
          c("&ndash;", "&mdash;", "&horbar;")),
    entry("'", c(145, 146, 8216, 8217), c("&lsquo;", "&rsquo;")),
    entry("\"", c(147, 148, 8220, 8221), c("&ldquo;", "&rdquo;")),
    entry(" ", c(160, 32, 8194, 8195, 8203), "&nbsp;"),
    entry("/", 8260),
    entry("&", 38, c("&#038;", "&amp;"))
  )

  # All the spellings of one group's characters that should be replaced.
  keys_for <- function(group) {
    codes <- group[["codes"]]
    literal_codes <- codes[codes >= 160L]
    unique(c(
      sprintf("&#%d;", codes),
      sprintf("&#x%x;", codes),
      sprintf("&#x%X;", codes),
      group[["named"]],
      if (length(literal_codes) > 0) intToUtf8(literal_codes, multiple = TRUE)
    ))
  }

  pieces <- lapply(groups, function(group) {
    keys <- keys_for(group)
    out <- as.list(rep(group[["to"]], length(keys)))
    names(out) <- keys
    out
  })
  do.call(c, pieces)
})
# Strips difficult to handle html bits we don't really care about.
#
# @param x text of an html document
# @return `x` with every key of html_escape_map replaced by its plain-text
#   value, a space added after each closing div tag, and br and page tags
#   replaced by a space
clean_html <- function(x) {
  # Characters that are NOT cleaned, with the number of times each was counted
  # in SP500 filings where a count was recorded. They are listed by code point
  # so that no raw special character has to live in this file.
  #   code point   count  name
  #   U+00FE              thorn
  #   U+00A7              sect
  #   U+00AE              reg
  #   U+05B9           1  Hebrew Point Holam
  #   U+200E           8  Left to Right mark
  #   U+2020        8415  Dagger
  #   U+2021        1331  Double Dagger
  #   U+2022      178544  Bullet
  #   U+2026         655  Ellipsis
  #   U+2028           2  Line Separator
  #   U+20A4          81  Lira Sign
  #   U+20A8           2  Rupee Sign
  #   U+20A9          10  Won Sign
  #   U+20AC        3717  Euro
  #   U+20B1           6  Peso
  #   U+2120           2  Service Mark
  #   U+2122        3914  Trademark
  #   U+215B          50  Vulgar Fraction 1/8
  #   U+215C          44  Vulgar Fraction 3/8
  #   U+215D          41  Vulgar Fraction 5/8
  #   U+215E          67  Vulgar Fraction 7/8
  #   U+220E          23  End of Proof
  #   U+2219          26  Bullet Operator
  #   U+221A          47  Square Root
  #   U+2260           2  Not Equal To
  #   U+2264         110  Less-than or equal to
  #   U+2265         273  Greater-than or equal to
  #   U+22C5           5  Dot operator
  #   U+25A0          37  Black Square
  #   U+25A1          15  White Square
  #   U+25AA        4846  Black Square Small
  #   U+25CB          76  White Circle
  #   U+25CF        4552  Black Circle
  #   U+25E6        2029  White Bullet
  #   U+2610        1191  Ballot Box
  #   U+2611         397  Ballot box w/ Check
  #   U+2612         552  Ballot Box w/ X
  #   U+2666         130  Black Diamond Suit
  #   U+2500          36  box-drawing line (noted as "Circled Digit 0")

  # Plain substring replacement of every key in the map; `[[` hands gsub() the
  # replacement string itself rather than a one-element list.
  for (escape in names(html_escape_map)) {
    x <- gsub(escape, html_escape_map[[escape]], x, fixed = TRUE)
  }

  # xml_text doesn't break words on div closes, which we typically want. The
  # tag is matched in any case, like the tags below, and is kept as written.
  x <- gsub("(</div>)", "\\1 ", x, ignore.case = TRUE)
  # Covers <br>, <br/> and <br /> in any case.
  x <- gsub("<br\\s*/?>", " ", x, ignore.case = TRUE)
  x <- gsub("<page>", " ", x, ignore.case = TRUE)
  x
}
# Prepares a parsed document for text extraction. We don't really care about
# display, so br elements become spaces and divs that carry no text are
# dropped. The xml2 calls are made for their effect on `doc`; their return
# values are not used.
#
# @param doc a parsed document
# @return `doc`
clean_doc <- function(doc) {
  # A text node holding a single space, used in place of every br.
  space_node <- xml2::xml_find_first(xml2::read_xml("<p> </p>"), "/p/text()")
  line_breaks <- xml2::xml_find_all(doc, "//br")
  xml2::xml_replace(line_breaks, space_node)

  # remove hidden divs
  hidden_divs <- xml2::xml_find_all(doc, "//div[@style = 'display:none']")
  xml2::xml_remove(hidden_divs, free = TRUE)

  # Don't care about non-text divs: no child elements, or nothing but hr
  # children, and no text once whitespace is normalised.
  empty_div_xpath <- paste0(
    "//div[(count(*) = 0 or count(hr) =\n",
    "count(*)) and normalize-space() = '']"
  )
  empty_divs <- xml2::xml_find_all(doc, empty_div_xpath)
  xml2::xml_remove(empty_divs, free = TRUE)

  # strip messy inlineXBRL
  if (length(xml2::xml_ns(doc)) > 1) {
    xbrl_headers <- xml2::xml_find_all(doc, "//header")
    xml2::xml_remove(xbrl_headers, free = TRUE)
  }

  doc
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.