# parsing -----------------------------------------------------------------
.get_html_page <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK=160080') {
page <-
url %>% curl::curl() %>% xml2::read_html()
page
}
.check_html_node <-
function(page, node_css = '#ctl00_cphMain_landing_p2BrochureLink') {
if (page %>% html_nodes(css = node_css) %>% length() > 0) {
node_exists <-
T
} else {
node_exists <-
F
}
node_exists
}
.get_html_node_text <-
function(page, node_css = '#ctl00_cphMain_landing_lblActiveOrgName') {
parse_html_for_text <-
function(page, node_css = '#ctl00_cphMain_landing_lblActiveOrgName') {
node_text <-
page %>%
html_nodes(css = node_css) %>%
html_text()
return(node_text)
}
node_exists <-
page %>%
.check_html_node(node_css = node_css)
if (node_exists) {
node_text <-
page %>% parse_html_for_text(node_css = node_css)
} else {
node_text <-
NA
}
node_text
}
.get_entity_manager_name <-
function(page) {
manager_name <-
page %>%
.get_html_node_text(node_css = '#ctl00_ctl00_cphMainContent_ucADVHeader_lblPrimaryBusinessName')
if (manager_name %>% is.na()) {
manager_name <-
page %>%
html_nodes('.summary-displayname') %>%
html_text()
}
return(manager_name)
}
.get_html_node_attributes <-
function(page,
node_css = '#ctl00_cphMain_landing_p2BrochureLink',
html_attribute = 'href',
base_url = NULL) {
parse_for_html_attribute <-
function(page,
node_css = '#ctl00_cphMain_landing_p2BrochureLink',
html_attribute = 'href',
base_url = NULL) {
node_attribute <-
page %>%
html_nodes(css = node_css) %>%
html_attr(name = html_attribute)
if (!base_url %>% purrr::is_null()) {
node_attribute <-
base_url %>%
paste0(node_attribute)
}
return(node_attribute)
}
node_exists <-
page %>%
.check_html_node(node_css = node_css)
if (node_exists) {
node_attribute <-
page %>%
parse_for_html_attribute(
node_css = node_css,
html_attribute = html_attribute,
base_url = base_url
)
} else {
node_attribute <-
NA
}
return(node_attribute)
}
.parse_node_table_to_text <-
function(page, css_node = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ClientCompensation_ctl00_trIAPDHeader + tr + tr') {
node_text <-
page %>%
.get_html_node_text(node_css = css_node) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split(" +") %>%
flatten_chr()
node_text <-
node_text[!node_text == '']
return(node_text)
}
.parse_finra_c_url <-
function(finra_c_url = "curl 'http://brokercheck.finra.org/Search/GenericSearch' -H 'Pragma: no-cache' -H 'Origin: http://brokercheck.finra.org' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.8' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.80 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Cache-Control: no-cache' -H 'Referer: http://brokercheck.finra.org/Search/GenericSearch' -H 'Cookie: __RequestVerificationToken=rSB02up7WU1YiLtAmLvoqbrC4heA7LXKidX0GkSiEjprktiE0qFDfkGKbQkTQVd94ShQynDI5BsmpIJeB4idW39G2QE6hRtgA9M_ejnQoHc1; ASP.NET_SessionId=kjzpjdpm1p0ltmm1xcuafmoz' -H 'Connection: keep-alive' -H 'DNT: 1' --data '__RequestVerificationToken=1jsNCQ-1KyYFIfKZuaJc_-CFqVaBiSi1MQ_WyvUrMKk9bEh1ux4s_te11rbeAEHH23ncmfbN8Ndf9gIQvYb0gGEuSabJ1PKIhKi0_AJac9Q1&GenericSearch.StartRow=1&GenericSearch.PageSize=15&GenericSearch.System=BC&GenericSearch.SearchType=2&GenericSearch.IndividualSearchText=&GenericSearch.EmploymingFirmSearchText=&GenericSearch.FirmSearchText=EJF&GenericSearch.Within=25&GenericSearch.ZipCode=' --compressed") {
clean_url <-
finra_c_url %>%
curlconverter::straighten() %>%
suppressMessages()
res <-
clean_url %>%
curlconverter::make_req()
html_page <-
res[[1]]() %>%
content(as = "parsed")
return(html_page)
}
.find_text_node <-
function(node_text,
hit_words = "Total Number of Clients",
off_set = 4,
is_numeric_node = T) {
if (hit_words %>% length() > 1) {
hit_words <-
hit_words %>% str_c(collapse = '|')
}
node_exists <-
(node_text %>% grep(hit_words, .)) %>% length() > 0
if (!node_exists) {
stop("Sorry " %>%
paste0(hit_words, ' is not found in the text nodes'))
}
node_location <-
(node_text %>% grep(hit_words, .)) %>% +off_set
text_node <-
node_text[node_location] %>%
str_trim()
if (is_numeric_node) {
text_node <-
text_node %>%
str_replace_all('\\$', '') %>% str_trim() %>%
as.character() %>%
readr::parse_number()
text_node <-
text_node[!text_node %>% is.na]
}
return(text_node)
}
# munging -----------------------------------------------------------------
.munge_fund_names <-
function(data) {
options(scipen = 9999)
funds <-
data %>%
select(nameFund) %>%
mutate_if(is.character,
funs(. %>% str_trim() %>% stringr::str_to_upper())) %>%
suppressWarnings()
funds <-
funds %>%
mutate(nameFundClean = nameFund) %>%
separate(nameFundClean,
sep = "\\(",
into = c("nameFundClean", "detailFund")) %>%
mutate(
detailFund = detailFund %>% str_replace_all("\\)", ""),
nameFundClean = nameFundClean %>% str_replace_all("\\ L.P.|\\ L.P", " LP") %>% str_replace_all("\\--", " ") %>%
str_replace_all("\\G.P.|G.P", "GP") %>% str_replace_all("\\L.L.C.|L.L.C", "LLC"),
typeFundEntity = case_when(
nameFundClean %>% str_detect("LLC") ~ "LLC",
nameFundClean %>% str_detect("GP") ~ "GP",
nameFundClean %>% str_detect("\\LP") ~ "LP",
TRUE ~ NA_character_
),
isCoInvestmentVehicle = nameFundClean %>% str_detect("COINVESTMENT|CO-INVESTMENT|SIDECAR|SIDE-CAR|\\COINVEST"),
isParalellFund = nameFundClean %>% str_detect("PARALLEL"),
nameFundClean =
nameFundClean %>% str_replace_all(
"\\ LLP|\\ LLC|\\ LP|\\ GP|\\,|CO-INVESTMENT|COINVESTMENT|\\SIDECAR|\\SIDE-CAR|PARALLEL",
""
)
) %>%
mutate(nameFundClean = nameFundClean %>% str_trim() %>% str_replace_all(" ", " ")) %>%
select(
typeFundEntity,
nameFund,
nameFundClean,
detailFund,
isCoInvestmentVehicle,
isParalellFund
) %>%
suppressWarnings()
has_dash <-
funds$nameFundClean %>% str_detect('\\-[A-H]') %>% sum(na.rm = TRUE) >
0
if (has_dash) {
funds$nameFundClean[funds$nameFundClean %>% str_detect('\\-[A-H]')] <-
funds$nameFundClean[funds$nameFundClean %>% str_detect('\\-[A-H]')] %>% str_replace_all('\\-', " SERIES ")
}
count_series <-
funds$nameFundClean %>% str_count('\\SERIES') %>% sum(na.rm = T)
if (count_series > 0) {
funds <-
funds %>%
separate(nameFundClean,
into = c("nameFundClean", "idSeries"),
sep = "\\ SERIES ") %>%
suppressWarnings()
}
new_names <-
names(funds)[!names(funds) %>% str_detect("nameFund")]
data %>%
left_join(funds) %>%
select(idCRD:nameFund,
nameFundClean,
one_of(new_names),
everything()) %>%
suppressMessages()
}
.select_start_vars <-
function(data) {
data <-
data %>%
dplyr::select(idCRD, nameEntityManager, everything())
return(data)
}
.mutate_adv_data <-
function(data) {
has_dates <-
data %>% dplyr::select(dplyr::matches("^date")) %>% names() %>% length() > 0
if (has_dates) {
data <-
data %>%
mutate_at(.vars = data %>% dplyr::select(dplyr::matches("^date")) %>% names() ,
funs(. %>% lubridate::ymd()))
}
has_counts <-
data %>% dplyr::select(dplyr::matches("^count[A-Z]|^amount[A-Z]|idCRD")) %>% dplyr::select(-dplyr::matches("country")) %>%
names %>% length() > 0
if (has_counts) {
data <-
data %>%
mutate_at(
.vars = data %>% dplyr::select(dplyr::matches("^count[A-Z]|^amount[A-Z]|idCRD")) %>%
dplyr::select(-dplyr::matches("country")) %>%
names(),
list(function(x){
x %>% as.character() %>% parse_number() %>% formattable::comma(digits = 0)
})
)
}
has_logical <-
data %>% dplyr::select(dplyr::matches("^is[A-Z]|^has[A-Z]")) %>% names() %>% length() > 0
if (has_logical) {
data <-
data %>%
mutate_at(.vars = data %>% dplyr::select(dplyr::matches("^is[A-Z]|^has[A-Z]")) %>% names(),
funs(. %>% as.logical()))
}
has_amounts <-
data %>% dplyr::select(dplyr::matches("^amount[A-Z]")) %>%
names %>% length() > 0
if (has_amounts) {
data <-
data %>%
mutate_at(.vars = data %>% dplyr::select(dplyr::matches("^amount")) %>%
names(),
list(function(x) {
x %>% as.character() %>% parse_number() %>% formattable::currency(digits = 0)
}))
}
has_pct <-
data %>% dplyr::select(dplyr::matches("^pct[A-Z]")) %>%
names %>% length() > 0
if (has_pct) {
data <-
data %>%
mutate_at(
.vars = data %>% dplyr::select(dplyr::matches("^pct")) %>%
names(),
funs(. %>% as.numeric() %>% formattable::percent(digits = 3))
)
}
return(data)
}
.widen_adv_data <-
function(data) {
data <-
data %>%
mutate_at(.vars = data %>% dplyr::select(dplyr::matches("^date")) %>% names(),
funs(. %>% as.character())) %>%
gather(nameItem, value, -c(countItem, nameEntityManager)) %>%
suppressWarnings()
data <-
data %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, nameItem, countItem, sep = '') %>%
distinct() %>%
suppressWarnings()
col_order <-
c('nameEntityManager', data$item)
data <-
data %>%
spread(item, value) %>%
dplyr::select(one_of(col_order))
data <-
data %>%
.mutate_adv_data() %>%
suppressWarnings()
return(data)
}
.get_item_name_yes_no_df <-
function(item_name = 'hasCustodyClientCash') {
item_name_df <-
seq_along(item_name) %>%
future_map_dfr(function(x) {
tibble(nameItem = rep(item_name[x], 2),
valueItem = c(T, F)) %>%
unite(fullnameItem,
nameItem,
valueItem,
sep = '.',
remove = F)
})
return(item_name_df)
}
.has_item_check_name <-
function(item_name = 'hasQuarterlyStatemnt') {
item_name_df <-
seq_along(item_name) %>%
future_map_dfr(function(x) {
tibble(nameItem = rep(item_name[x], 1),
valueItem = T) %>%
mutate(fullnameItem = nameItem)
})
return(item_name_df)
}
# sec_adv_data ------------------------------------------------------------
.parse_sec_manager_pdf_url <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/Part2Brochures.aspx?ORG_PK=135952') {
page <-
url %>%
.get_html_page
idCRD <-
url %>%
.get_pk_url_crd()
nameEntityBrochure <-
page %>%
.get_html_node_text(node_css = 'td:nth-child(1) a') %>%
str_trim()
dateBrochureSubmitted <-
page %>%
.get_html_node_text(node_css = 'td:nth-child(2)') %>%
str_trim() %>%
lubridate::mdy() %>%
suppressWarnings()
dateLastConfirmed <-
page %>%
.get_html_node_text(node_css = 'td:nth-child(3)') %>%
str_trim() %>%
lubridate::mdy() %>%
suppressWarnings()
urlPDFManagerADVBrochure <-
page %>%
.get_html_node_attributes(node_css = 'td:nth-child(1) a',
html_attribute = 'href',
base_url = 'http://www.adviserinfo.sec.gov')
pdf_data <-
tibble(
idCRD,
nameEntityBrochure,
urlPDFManagerADVBrochure,
dateBrochureSubmitted,
dateLastConfirmed
)
if (pdf_data %>% nrow > 1) {
pdf_data <-
pdf_data %>%
gather(item,
value,
-c(idCRD, dateBrochureSubmitted, dateLastConfirmed)) %>%
group_by(item) %>%
mutate(idItem = (1:n() - 1),
nameItem = if_else(idItem > 0, item %>% paste0(idItem), item)) %>%
ungroup() %>%
dplyr::select(-c(item, idItem)) %>%
spread(nameItem, value)
}
return(pdf_data)
}
.get_url_crd <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK=135952') {
idCRD <-
url %>%
str_split('\\?ORG_PK=') %>%
flatten_chr() %>%
.[2] %>%
as.numeric()
return(idCRD)
}
.get_manager_sec_page <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK=156663') {
httr::set_config(httr::config(ssl_verifypeer = 0L))
page_status <-
url %>%
GET()
if (page_status$url == 'http://www.adviserinfo.sec.gov/IAPD/SearchNoResult.aspx') {
idCRD <-
url %>%
.get_url_crd()
manager_df <-
tibble(idCRD,
urlManagerSummaryADV = NA)
"No data" %>% message
}
else {
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_html_node_text(node_css = '#ctl00_cphMain_landing_lblActiveOrgName')
idCRDSEC <-
page %>%
.get_html_node_text(node_css = '#ctl00_cphMain_landing_lblActiveOrgCrd') %>%
str_replace_all('\\(|\\)|CRD# |SEC#', '')
get_all_status_data <-
function(page) {
parse_era_table <-
function(page) {
nodes_exist <-
page %>%
html_nodes('#tbERAStatus td') %>% length() > 0
if (nodes_exist) {
statusJurisdictionERA <-
page %>%
html_nodes('#tbERAStatus td:nth-child(1)') %>%
html_text()
statusReportingERA <-
page %>%
html_nodes('#tbERAStatus td:nth-child(2)') %>%
html_text()
dateEffectiveERA <-
page %>%
html_nodes('#tbERAStatus td:nth-child(3)') %>%
html_text() %>%
lubridate::mdy() %>%
as.character()
table_df <-
tibble(statusJurisdictionERA,
statusReportingERA,
dateEffectiveERA) %>%
mutate(nameEntityManager = name_entity_manager,
countItem = 1:n()) %>%
dplyr::select(nameEntityManager, everything()) %>%
.widen_adv_data() %>%
.mutate_adv_data()
} else {
table_df <-
tibble(nameEntityManager = name_entity_manager, isExempt = F)
}
return(table_df)
}
parse_state_table <-
function(page) {
nodes_exist <-
page %>%
html_nodes('#tbNtcStatus td') %>% length() > 0
if (nodes_exist) {
stateRegistration <-
page %>%
html_nodes('#tbNtcStatus td:nth-child(1)') %>%
html_text()
dateEffectiveState <-
page %>%
html_nodes('#tbNtcStatus td:nth-child(2)') %>%
html_text() %>%
lubridate::mdy() %>%
as.character()
table_df <-
tibble(stateRegistration, dateEffectiveState) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
mutate(countItem = 1:n()) %>%
arrange(countItem) %>%
.widen_adv_data() %>%
.mutate_adv_data()
} else {
table_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(table_df)
}
parse_reg_status <-
function(page) {
nodes_exist <-
page %>%
html_nodes('#tbRegStatus td') %>% length() > 0
if (nodes_exist) {
statusJurisdiction <-
page %>%
html_nodes('#tbRegStatus td:nth-child(1)') %>%
html_text()
statusReporting <-
page %>%
html_nodes('#tbRegStatus td:nth-child(2)') %>%
html_text()
dateEffective <-
page %>%
html_nodes('#tbRegStatus td:nth-child(3)') %>%
html_text() %>%
lubridate::mdy() %>%
as.character()
table_df <-
tibble(statusJurisdiction, statusReporting, dateEffective) %>%
mutate(nameEntityManager = name_entity_manager,
countItem = 1:n()) %>%
dplyr::select(nameEntityManager, everything()) %>%
.widen_adv_data() %>%
.mutate_adv_data()
} else {
table_df <-
tibble(nameEntityManager = name_entity_manager, isExempt = F)
}
return(table_df)
}
era_df <-
page %>%
parse_era_table()
reg_df <-
parse_reg_status(page)
state_df <-
parse_state_table(page)
all_df <-
era_df %>%
left_join(reg_df) %>%
left_join(state_df) %>%
suppressMessages()
return(all_df)
}
manager_df <-
page %>%
get_all_status_data()
urlManagerADV <-
page %>%
.get_html_node_attributes(
node_css = '#aspnetForm > div.container-fluid > div > nav > ul > li:nth-child(7) > a',
html_attribute = 'href',
base_url = 'http://www.adviserinfo.sec.gov'
)
manager_df <-
manager_df %>%
mutate(
nameEntityManager = name_entity_manager,
idCRDSEC,
urlManagerSummaryADV = url,
urlManagerADV
) %>%
separate(idCRDSEC,
sep = ' / ',
into = c('idCRD', 'idSEC')) %>%
mutate(idCRD = idCRD %>% as.numeric()) %>%
dplyr::select(nameEntityManager, idCRD, idSEC, everything())
brochure_exists <-
page %>%
.check_html_node(node_css = '#ctl00_cphMain_landing_p2BrochureLink')
if (brochure_exists) {
urlPDFManager <-
page %>%
.get_html_node_attributes(
node_css = '#aspnetForm > div.container-fluid > div > nav > ul > li:nth-child(8) > a',
html_attribute = 'href',
base_url = 'http://www.adviserinfo.sec.gov'
)
if (!'nameEntityManager' %in% names(manager_df)) {
nameEntityManager <-
page %>%
.get_entity_manager_name()
manager_df <-
manager_df %>%
mutate(nameEntityManager)
}
manager_df <-
manager_df %>%
left_join(urlPDFManager %>%
.parse_sec_manager_pdf_url()) %>%
suppressMessages()
}
manager_df <-
manager_df %>%
dplyr::select(idCRD, nameEntityManager, everything())
}
return(manager_df)
}
#' IAPD registered meta data
#'
#' This function returns meta data
#' on any Investment Adviser Public Disclosure [IAPD]
#' filing manager. This function can be used to discover
#' managers to power the
#' \code{\link{adv_managers_filing()}} function.
#'
#' @param entity_names vector of entities to search
#' @param score_threshold matching score threshold for the search name
#' if \code{NULL} there is no threshold
#' @param crd_ids numric vector of crds to search
#' @param return_message return a message
#' @import dplyr purrr curl jsonlite lubridate tidyr rvest httr
#' @return a data frame
#' @export
#' @family IAPD
#' @family FINRA
#' @family fund data
#' @family entity search
#' @examples
#' adv_managers_metadata(entity_names = c('Divco', 'EJF'), score_threshold = .2)
#' adv_managers_metadata(entity_names = "Blackstone", score_threshold = NULL)
#' adv_managers_metadata(crd_ids = 173787)
adv_managers_metadata <-
function(entity_names = NULL,
crd_ids = NULL,
score_threshold = .2,
return_message = T) {
if (entity_names %>% purrr::is_null() & crd_ids %>% purrr::is_null()) {
stop("Please enter a name or CRD to search")
}
if (!crd_ids %>% purrr::is_null()) {
crd_urls <-
'https://www.adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK=' %>%
paste0(crd_ids)
}
if (!entity_names %>% purrr::is_null()) {
finra_data <-
entity_names %>%
finra_entities(
ocr_pdf = FALSE,
score_threshold = score_threshold,
return_message = return_message
) %>%
suppressMessages() %>%
suppressWarnings()
if (finra_data %>% nrow == 0) {
return(invisible())
}
adv_urls <-
finra_data$urlManagerSummaryADV
}
if ((!crd_ids %>% purrr::is_null()) & (!entity_names %>% purrr::is_null())) {
adv_urls <-
c(adv_urls, crd_urls)
}
if ((!crd_ids %>% purrr::is_null()) & (entity_names %>% purrr::is_null())) {
adv_urls <-
c(crd_urls)
}
.get_manager_sec_page_safe <-
possibly(.get_manager_sec_page, NULL)
sec_summary_data <-
adv_urls %>%
map_df(function(url){
.get_manager_sec_page(url = url)
}) %>%
suppressWarnings() %>%
filter(!idCRD %>% is.na()) %>%
mutate_if(is.character,
funs(. %>% str_trim()))
return(sec_summary_data)
}
.get_manager_sec_adv_actual_url <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/crd_iapd_AdvVersionSelector.aspx?ORG_PK=146629') {
page <-
url %>%
GET()
actual_url <-
page$url
return(actual_url)
}
.get_url_primary_key <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/sections/iapd_AdvIdentifyingInfoSection.aspx?ORG_PK=146629&FLNG_PK=01285F220008018901086F5100319405056C8CC0') {
url_primary_key <-
url %>%
str_split(pattern = 'FLNG_PK=') %>%
flatten_chr() %>%
.[2]
return(url_primary_key)
}
.get_sec_sitemap_df <-
function() {
tibble(
nameSection = c(
'Registration',
"Part 1A Item 11 Disclosure Information",
"DRPs",
"Item 1 Identifying Information",
"Item 2 SEC Registration/Reporting",
"Item 3 Form Of Organization",
"Item 4 Successions",
"Item 5 Information About Your Advisory Business",
"Item 6 Other Business Activities",
"Item 7.A Financial Industry Affiliations",
"Item 7.B Private Fund Reporting",
"Item 8 Participation or Interest in Client Transactions",
"Item 9 Custody",
"Item 10 Control Persons",
"Item 11 Disclosure Information",
"Item 12 Small Businesses",
"Schedule A",
"Schedule B",
"Schedule D",
"Signature Page"
),
idSection =
c(
'sectionRegistration',
"sectionDisclosures",
"sectionDRP",
"section1IdentifyingInfo",
"section2SECRegistration",
"section3OrganizationForm",
"section4Successions",
"section5AdvisoryBusinessInformation",
"section6OtherBusinessInformation",
"section7AFinanceAffiliations",
"section7BPrivateFundReporting",
"section8ClientConflicts",
"section9Custody",
"section10ControlPersons",
"section11Disclosures",
"section12SmallBusiness",
"sectionScheduleAOwners",
"sectionScheduleBIndirectOwners",
"sectionScheduleD",
"sectionSignaturePage"
),
nameData =
c(
"data_registration",
"data_disclosures",
"data_drp",
"data_1identifyinginfo",
"data_2secregistration",
"data_3organizationform",
"data_4successions",
"data_5advisorybusinessinformation",
"data_6otherbusinessinformation",
"data_7afinanceaffiliations",
"data_7bprivatefundreporting",
"data_8clientconflicts",
"data_9custody",
"data_10controlpersons",
"data_11disclosures",
"data_12smallbusiness",
"data_schedulea",
"data_scheduleb",
"data_scheduled",
"data_signaturepage"
),
nameFunction = c(
".get_manager_sec_page_safe",
NA,
".get_section_drp_safe",
".get_section_1_data_safe",
".get_section_2_data_safe",
".get_section_3_data_safe",
".get_section_4_data_safe",
".get_section_5_data_safe",
".get_section_6_data_safe",
".get_section_7a_data_safe",
".get_section_7b_data_safe",
".get_section_8_data_safe",
".get_section_9_data_safe",
".get_section_10_data_safe",
".get_section_11_data_safe",
".get_section_12_data_safe",
".get_schedule_a_data_safe",
".get_schedule_b_data_safe",
".get_schedule_d_data_safe",
".get_manager_signatory_data_safe"
),
nameSectionActual = c(
"Registration",
"Part 1A Item 11 Disclosure Information",
"DRPs",
"Identifying Information",
"SEC Reporting",
"Organization",
"Successions",
"Advisory Business Information",
"Other Business Activities",
"Financial Industry Affiliations",
"Private Fund Reporting",
"Participation or Interest in Client Transactions",
"Custody",
"Control Persons",
"Disclosure Information",
"Small Businesses",
"Direct Manager Owners",
"Indirect Manager Owners",
"Other Manager Information",
"Manager Signatories"
)
)
}
#' Returns data frame of SEC ADV sitemap
#'
#' @return
#' @export
#' @import dplyr
#' @examples
sec_adv_manager_sitemap <-
function() {
data <-
tibble(
idSection =
c(
'sectionRegistration',
"sectionDisclosures",
"sectionDRP",
"section1IdentifyingInfo",
"section2SECRegistration",
"section3OrganizationForm",
"section4Successions",
"section5AdvisoryBusinessInformation",
"section6OtherBusinessInformation",
"section7AFinanceAffiliations",
"section7BPrivateFundReporting",
"section8ClientConflicts",
"section9Custody",
"section10ControlPersons",
"section11Disclosures",
"section12SmallBusiness",
"sectionScheduleA",
"sectionScheduleB",
"sectionScheduleD",
"sectionSignaturePage"
),
nameSection = c(
'Registration',
"Part 1A Item 11 Disclosure Information",
"DRPs",
"Item 1 Identifying Information",
"Item 2 SEC Registration/Reporting",
"Item 3 Form Of Organization",
"Item 4 Successions",
"Item 5 Information About Your Advisory Business",
"Item 6 Other Business Activities",
"Item 7.A Financial Industry Affiliations",
"Item 7.B Private Fund Reporting",
"Item 8 Participation or Interest in Client Transactions",
"Item 9 Custody",
"Item 10 Control Persons",
"Item 11 Disclosure Information",
"Item 12 Small Businesses",
"Schedule A",
"Schedule B",
"Schedule D",
"Signature Page"
),
nameSectionActual = c(
"Registration",
"Part 1A Item 11 Disclosure Information",
"DRPs",
"Identifying Information",
"SEC Reporting",
"Organization",
"Successions",
"Advisory Business Information",
"Other Business Activities",
"Financial Industry Affiliations",
"Private Fund Reporting",
"Participation or Interest in Client Transactions",
"Custody",
"Control Persons",
"Disclosure Information",
"Small Businesses",
"Direct Manager Owners",
"Indirect Manager Owners",
"Other Manager Information",
"Manager Signatories"
)
)
return(data)
}
.get_location_name_df <-
function() {
location_name_df <-
tibble(
itemNode = c(
"Number and Street 1",
'Number and Street 2',
"City",
"State",
"Country",
"ZIP+4/Postal Code"
),
nameNode = c(
"addressStreet1",
'addressStreet2',
"city",
"state",
"country",
"zip"
)
)
return(location_name_df)
}
.get_sitemap_urls <-
function(data, section_name = 'section7BPrivateFundReporting') {
urls_exist <-
data %>%
dplyr::filter(idSection %in% section_name) %>%
.$urlADVSection %>% length() > 0
if (urls_exist) {
urls <-
data %>%
dplyr::filter(idSection %in% section_name) %>%
.$urlADVSection
return(urls)
}
}
.parse_adv_manager_sitemap_df <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/crd_iapd_AdvVersionSelector.aspx?ORG_PK=135952',
manager = NULL,
return_wide = F) {
idCRD <-
url %>%
.get_url_crd()
if (idCRD %>% is.na()) {
idCRD <-
url %>%
str_split('\\?ORG_PK=') %>%
flatten_chr() %>%
.[2] %>%
str_split('\\&') %>%
flatten_chr() %>%
.[[1]] %>%
as.numeric() %>%
suppressWarnings()
}
actual_url <-
url %>%
.get_manager_sec_adv_actual_url()
url_primary_key <-
actual_url %>%
.get_url_primary_key()
page <-
actual_url %>%
.get_html_page()
base_url <-
actual_url %>%
str_split('sections|Sections') %>%
flatten_chr() %>%
.[[1]]
name_entity_manager <-
page %>%
.get_entity_manager_name()
name_entity_manager <-
page %>%
html_nodes('.summary-displayname') %>%
html_text()
if (length(manager) > 0) {
name_entity_manager <- manager
}
items <-
page %>%
.get_html_node_text('.sidebar a[href^=".."]') %>%
str_trim()
values <-
page %>%
.get_html_node_attributes(node_css = '.sidebar a[href^=".."]',
'href') %>%
str_trim() %>%
str_replace('../', '') %>%
paste0(base_url,
.) %>%
unique()
adv_data <-
tibble(
idCRD,
nameSection = 'Registration',
urlADVSection = glue::glue('http://www.adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK={idCRD}') %>% as.character()
) %>%
bind_rows(tibble(idCRD,
nameSection = items,
urlADVSection = values)) %>%
left_join(.get_sec_sitemap_df()) %>%
distinct() %>%
dplyr::filter(!nameFunction %>% is.na()) %>%
dplyr::select(-nameSection) %>%
mutate(nameEntityManager = name_entity_manager,
idRow = 1:n()) %>%
group_by(idSection) %>%
dplyr::filter(idRow == min(idRow)) %>%
ungroup() %>%
dplyr::select(-idRow) %>%
dplyr::select(idCRD, nameEntityManager, everything()) %>%
suppressMessages()
if (return_wide) {
adv_data <-
adv_data %>%
spread(idSection, urlADVSection)
}
adv_data
}
.get_managers_adv_sitemap_adv <-
function(idCRDs = c(162351),
score_threshold = .2,
entity_names = NULL) {
adv_managers_metadata_safe <-
possibly(adv_managers_metadata, NULL)
.parse_adv_manager_sitemap_df_safe <-
possibly(.parse_adv_manager_sitemap_df, NULL)
if (!purrr::is_null(idCRDs)) {
urls <-
glue::glue("https://www.adviserinfo.sec.gov/IAPD/crd_iapd_AdvVersionSelector.aspx?ORG_PK={idCRDs}") %>% as.character()
}
if (!purrr::is_null(entity_names)) {
manager_data <-
entity_names %>%
future_map_dfr(function(x) {
adv_managers_metadata(
crd_ids = idCRDs,
score_threshold = score_threshold,
entity_names = x,
return_message = T
)
})
manager_data <-
manager_data %>%
dplyr::filter(!urlManagerADV %>% is.na())
}
if ('urls' %>% exists() & 'manager_data' %>% exists()) {
urls <-
c(urls, manager_data$urlManagerADV %>% unique()) %>% unique()
}
if (idCRDs %>% purrr::is_null() & 'manager_data' %>% exists()) {
urls <-
manager_data$urlManagerADV %>% unique()
}
if ('urls' %>% exists() & entity_names %>% purrr::is_null()) {
manager_data <-
adv_managers_metadata(crd_ids = idCRDs, entity_names = NULL)
name_entity_manager <-
manager_data$nameEntityManager
}
.parse_adv_manager_sitemap_df_safe <-
purrr::possibly(.parse_adv_manager_sitemap_df, tibble())
data <-
urls %>%
unique() %>%
future_map_dfr(function(x) {
.parse_adv_manager_sitemap_df_safe(url = x, return_wide = F)
})
if (manager_data %>% nrow() > 0) {
data <-
data %>%
dplyr::filter(!nameEntityManager %>% is.na()) %>%
left_join(manager_data %>% dplyr::select(idCRD, nameEntityManager)) %>%
dplyr::select(
idCRD,
nameSectionActual,
nameEntityManager,
idSection,
nameData,
nameFunction,
urlADVSection
) %>%
suppressMessages()
}
return(data)
}
.get_type_manager_entity_owner_df <-
function() {
type_df <-
tibble(
idTypeEntityManagerOwner = c("I", "DE", "FE"),
typeEntityManagerOwner = c('Individual', 'Domestic Entity', 'Foreign Entity'),
isEntityOwnerManagerEntity = c(F, T, T)
)
return(type_df)
}
.get_range_entity_owner_df <-
function() {
range_df <-
tibble(
idRangeManagerEntityOwnership = c(NA, LETTERS[1:6]),
rangeManagerEntityOwnership = c(
"< 5%",
">= 5%, < 10%",
">= 10%, < 25%",
">= 25%, < 50%",
">= 50%, < 75%",
">= 75%",
"Other (general partner, trustee, or elected manager)"
)
)
return(range_df)
}
.parse_manager_owner_name <-
function(x = "GRAY, ROBERT, LAWRENCE") {
count_commas <-
x %>% str_count('\\, ')
full_name <-
x %>% str_split('\\, ') %>% flatten_chr()
if (count_commas == 1) {
name_first <-
full_name[2]
name_middle <-
NA
name_last <-
full_name[1]
name_full <-
name_first %>% paste(name_last)
name_common <-
name_full
}
if (count_commas == 2) {
name_first <-
full_name[2]
name_middle <-
full_name[3]
name_last <-
full_name[1]
name_full <-
name_first %>% paste(name_middle, name_last)
name_common <-
name_first %>% paste(name_last)
}
if (count_commas == 3) {
name_first <-
full_name[3]
name_middle <-
full_name[4]
name_last <-
full_name[1] %>%
paste(full_name[2], collapse = '')
name_full <-
name_first %>% paste(name_middle, name_last)
name_common <-
name_first %>% paste(name_last)
}
name_df <-
tibble(
nameEntityManagerOwner = x,
nameCommonEntityOwnerManager = name_common,
nameFullEntityOwnerManager = name_full,
nameFirstEntityManagerOwner = name_first,
nameMiddleEntityManagerOwner = name_middle,
nameLastEntityManagerOwner = name_last
) %>%
mutate_all(str_to_upper)
return(name_df)
}
.get_pk_url_crd <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvPrivateFundReportingSection.aspx?ORG_PK=162351&FLNG_PK=052DAAB400080184043CE66005E35E29056C8CC0') {
idCRD <-
url %>%
str_split('=') %>%
flatten_chr() %>%
.[2] %>% str_replace_all('&FLNG_PK', '') %>%
as.numeric()
return(idCRD)
}
.get_sitemap_filter_url <-
function(data, filter_name = 'sectionScheduleA') {
urls <-
data %>%
dplyr::filter(idSection == filter_name) %>%
.$urlADVSection
return(urls)
}
# private fund data -------------------------------------------------------
.get_check_box_value_df <-
function() {
node_check_df <-
tibble(
nodeName = c(
"Checkbox checked, changed",
"Checkbox not checked",
"Radio button not selected",
"Radio button selected, changed"
),
isNodeChecked = c(TRUE, FALSE, FALSE,
TRUE)
)
return(node_check_df)
}
.get_form_check_box_df <-
function(modify = T) {
form_value_df <-
tibble(
idRow = 1:52,
nameItem = c(
'hasICA1940S31Exclusion.TRUE',
'hasICA1940S37Exclusion.TRUE',
'isMasterFund.TRUE',
'isMasterFund.FALSE',
'isFeederFund.TRUE',
'isFeederFund.FALSE',
'isFundOfFund.TRUE',
'isFundOfFund.FALSE',
'isFundOfFundRelatedInvestment.TRUE',
'isFundOfFundRelatedInvestment.FALSE',
'isICA1940PrivateSecuritiesInvestor.TRUE',
'isICA1940PrivateSecuritiesInvestor.FALSE',
'isHedgeFund.TRUE',
'isLiquidityFund.TRUE',
'isPrivateEquityFund.TRUE',
'isRealEstateFund.TRUE',
'isSecuritizedAssetFund.TRUE',
'isVentureFund.TRUE',
'isOtherFund.TRUE',
'hasPrivateFundSubAdviser.TRUE',
'hasPrivateFundSubAdviser.FALSE',
'hasOtherPrivateFundAdviser.TRUE',
'hasOtherPrivateFundAdviser.FALSE',
'hasSolicitedClientsForFund.TRUE',
'hasSolicitedClientsForFund.FALSE',
'hasREGDExemption.TRUE',
'hasREGDExemption.FALSE',
'isFundAuditedAnnually.TRUE',
'isFundAuditedAnnually.FALSE',
'isFundAuditGAAPCompliant.TRUE',
'isFundAuditGAAPCompliant.FALSE',
'isFundAuditorIndependent.TRUE',
'isFundAuditorIndependent.FALSE',
'isFundAuditorPCAOBRegistered.TRUE',
'isFundAuditorPCAOBRegistered.FALSE',
'isFundAuditorPCAOBCompliant.TRUE',
'isFundAuditorPCAOBCompliant.FALSE',
'hasInvestorAnnualFinancialStatement.TRUE',
'hasInvestorAnnualFinancialStatement.FALSE',
'hasAuditorUnqualifiedOpinion.TRUE',
'hasAuditorUnqualifiedOpinion.FALSE',
'hasAuditorUnqualifiedOpinion.NA',
'hasFundPrimeBroker.TRUE',
'hasFundPrimeBroker.FALSE',
'hasFundCustodian.TRUE',
'hasFundCustodian.FALSE',
'isCustodianRelatedPerson.TRUE',
'isCustodianRelatedPerson.FALSE',
'hasFundAdministrator.TRUE',
'hasFundAdministrator.FALSE',
'hasFundPlacementAgent.TRUE',
'hasFundPlacementAgent.FALSE'
)
)
if (modify) {
form_value_df <-
form_value_df <-
tibble(
nameItem = c(
'isMasterFund.TRUE',
'isMasterFund.FALSE',
'isFeederFund.TRUE',
'isFeederFund.FALSE',
'isFundOfFund.TRUE',
'isFundOfFund.FALSE',
'isFundOfFundRelatedInvestment.TRUE',
'isFundOfFundRelatedInvestment.FALSE',
'isFundICASecuritiesInvestor.TRUE',
'isFundICASecuritiesInvestor.FALSE',
'isHedgeFund.TRUE',
'isLiquidityFund.TRUE',
'isPrivateEquityFund.TRUE',
'isRealEstateFund.TRUE',
'isSecuritizedAssetFund.TRUE',
'isVentureFund.TRUE',
'isOtherFund.TRUE',
'hasPrivateFundSubAdviser.TRUE',
'hasPrivateFundSubAdviser.FALSE',
'hasOtherPrivateFundAdviser.TRUE',
'hasOtherPrivateFundAdviser.FALSE',
'hasSolicitedClientsForFund.TRUE',
'hasSolicitedClientsForFund.FALSE',
'hasREGDExemption.TRUE',
'hasREGDExemption.FALSE',
'isFundAuditedAnnually.TRUE',
'isFundAuditedAnnually.FALSE',
'isFundAuditGAAPCompliant.TRUE',
'isFundAuditGAAPCompliant.FALSE',
'isFundAuditorIndependent.TRUE',
'isFundAuditorIndependent.FALSE',
'isFundAuditorPCAOBRegistered.TRUE',
'isFundAuditorPCAOBRegistered.FALSE',
'isFundAuditorPCAOBCompliant.TRUE',
'isFundAuditorPCAOBCompliant.FALSE',
'hasInvestorAnnualFinancialStatement.TRUE',
'hasInvestorAnnualFinancialStatement.FALSE',
'hasAuditorUnqualifiedOpinion.TRUE',
'hasAuditorUnqualifiedOpinion.FALSE',
'hasAuditorUnqualifiedOpinion.NA',
'hasFundPrimeBroker.TRUE',
'hasFundPrimeBroker.FALSE'
)
) %>%
mutate(idRow = 1:n()) %>%
dplyr::select(idRow, nameItem)
}
return(form_value_df)
}
.get_fund_table_html <-
function(page = page,
table_number = 1,
table_css_node_df = table_css_node_df) {
node_css <-
table_css_node_df %>%
dplyr::filter(numberTable == table_number) %>%
.$tableCSSNode
table_html <-
page %>%
html_nodes(css = node_css)
return(table_html)
}
.get_table_check_box_df <-
function(page = page,
table_number = 1,
table_css_node_df,
only_radio = T) {
node_table <-
.get_fund_table_html(
page = page,
table_number = table_number,
table_css_node_df = table_css_node_df
)
table_checked_nodes <-
node_table %>%
html_nodes('img') %>%
html_attr('alt') %>%
str_trim()
if (only_radio) {
table_checked_nodes <-
table_checked_nodes[table_checked_nodes %>%
str_detect("Radio")]
}
check_box_df <-
tibble(nodeName = table_checked_nodes) %>%
mutate(idTable = table_number,
idImageNode = 1:n()) %>%
left_join(.get_check_box_value_df()) %>%
suppressMessages() %>%
dplyr::select(idTable, idImageNode, nodeName, isNodeChecked)
return(check_box_df)
}
.get_tables_check_boxes_df <-
function(page = page,
table_number = 1,
table_css_node_df = table_css_node_df) {
tables_check_boxes_df <-
table_css_node_df$numberTable %>%
future_map_dfr(function(x) {
.get_table_check_box_df(
page = page,
table_number = x,
table_css_node_df = table_css_node_df
)
})
return(tables_check_boxes_df)
}
.get_table_check_box_data <-
function(page = page,
table_number = 1,
table_css_node_df,
only_radio = T) {
form_value_df <-
.get_form_check_box_df(modify = T)
check_box_df <-
.get_table_check_box_df(
page = page,
table_number = table_number,
table_css_node_df = table_css_node_df,
only_radio = T
)
form_value_df <-
form_value_df %>%
separate(nameItem,
into = c('nameItem', 'logicalResponse'),
sep = '\\.') %>%
suppressWarnings() %>%
mutate(logicalResponse = logicalResponse %>% as.logical)
response_df <-
check_box_df %>%
slice(1:nrow(form_value_df)) %>%
left_join(form_value_df %>% dplyr::rename(idImageNode = idRow)) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::filter(logicalResponse == T) %>%
dplyr::select(nameItem, logicalResponse) %>%
suppressWarnings() %>%
suppressMessages()
x <-
response_df$nameItem
fund_type <-
case_when(
x == 'isHedgeFund' ~ "Hedge Fund",
x == 'isLiquidityFund' ~ "Liquidity",
x == 'isPrivateEquityFund' ~ "Private Equity",
x == 'isRealEstateFund' ~ "Real Estate",
x == 'isSecuritizedAssetFund' ~ "Securitized Asset",
x == 'isVentureFund' ~ "Venture",
x == 'isOtherFund' ~ "Other",
FALSE ~ as.character(x)
)
fund_type <-
fund_type[!fund_type %>% is.na()]
response_df <-
response_df %>%
dplyr::filter(
!nameItem %in% c(
'isHedgeFund',
'isLiquidityFund',
'isPrivateEquityFund',
'isRealEstateFund',
'isSecuritizedAssetFund',
'isVentureFund',
'isOtherFund'
)
)
column_order <-
response_df$nameItem
response_df <-
response_df %>%
spread(nameItem, logicalResponse) %>%
dplyr::select(one_of(column_order)) %>%
mutate(numberFund = table_number,
typeFund = fund_type) %>%
dplyr::select(numberFund, typeFund, everything())
return(response_df)
}
.parse_table_node_df <-
function(page = page,
table_number = 1,
table_css_node_df = table_css_node_df) {
base_table_html <-
.get_fund_table_html(
page = page,
table_number = table_number,
table_css_node_df = table_css_node_df
) %>%
html_nodes('.PaperFormTableData tr td')
table_data_nodes <-
base_table_html %>%
html_text(trim = T) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .)
table_data_nodes <-
table_data_nodes[!table_data_nodes == '']
table_node_df <-
seq_along(table_data_nodes) %>%
future_map_dfr(function(x) {
data_nodes <-
table_data_nodes[[x]] %>%
str_split(' ') %>%
flatten_chr()
if (data_nodes[!data_nodes == ''] %>% length() > 0) {
data_nodes <-
data_nodes[!data_nodes == '']
nodeText <-
data_nodes %>% paste0(collapse = ' ')
nodeText <-
nodeText %>%
str_replace_all(
'Name of the marketer: |Legal name of custodian: |Primary business name of custodian: |CRD Number (if any):|Name of the prime broker: ',
''
)
isLetter <-
(nodeText %>% str_detect("^\\([a-z]")) &
(nodeText %>% nchar() < 10)
isNumberSection <-
(nodeText %>% str_detect("^[1-9].")) &
(nodeText %>% nchar() < 4) &
(nodeText %>% str_detect('\\.'))
if (isNumberSection) {
numberSection <-
nodeText %>% str_extract_all("^[1-9].") %>% flatten_chr() %>% as.numeric()
} else {
numberSection <-
NA
}
if (isLetter) {
letterSection <-
nodeText %>% str_extract_all("^\\([a-z]") %>% flatten_chr() %>%
str_replace_all('\\(', '')
} else {
letterSection <-
NA
}
remove_node <-
nodeText %>% str_detect('Include only those assets where')
if (remove_node) {
letterSection <-
"g"
isLetter <-
T
}
if (nodeText == 'B. SERVICE PROVIDERS') {
letterSection <-
"c"
isLetter <-
T
}
if (nodeText == 'Your Advisory Services') {
letterSection <-
"q"
isLetter <-
T
}
if (nodeText %>% str_detect('\\CRD Number')) {
nodeText <-
nodeText %>%
gsub('\\CRD Number', '', .) %>%
gsub('\\(if any):', '', .) %>%
gsub('and', '', .) %>%
str_trim()
letterSection <-
"cc"
}
if (nodeText == 'Private Offering') {
letterSection <-
"c"
isLetter <-
T
}
if (nodeText == 'Administrator') {
letterSection <-
"g"
isLetter <-
T
}
if (nodeText == 'No Information Filed') {
nodeText <-
NA
}
if (isLetter | isNumberSection) {
nodeText <-
NA
}
tibble(idNode = x, numberSection, letterSection, nodeText)
}
}) %>%
fill(numberSection, letterSection) %>%
dplyr::filter(!nodeText %>% is.na()) %>%
dplyr::filter(!numberSection %>% is.na()) %>%
dplyr::filter(!nodeText %>% str_detect("^NOTE:|Ownership")) %>%
unite(
numberLetterSection,
numberSection,
letterSection,
sep = '',
remove = F
) %>%
mutate(numberTable = table_number) %>%
dplyr::select(numberTable, everything())
return(table_node_df)
}
.extract_node_data <-
function(table_node_df,
variable_name = 'pctFundNonUSCitizens',
section_letter = "16b",
method = "max") {
options(scipen = 99999)
letter_section_location <-
c('7c', '23c', '24d', '25d', '26c', '28e')
letter_section_other <-
c(
'7a',
'7b',
'7d',
'23b',
'24a',
'24b',
'24c',
'24cc',
'25a',
'25b',
'25c',
'25f',
'25cc',
'26a',
'26b',
'28c',
'28d',
'28cc',
'28g'
)
is_location <-
variable_name %>% str_detect("location")
if (is_location) {
method <-
'location'
} else {
method <-
'max'
}
if (!method %>% str_to_lower() %in% c('max', 'location')) {
stop('Method can only be ')
}
if (section_letter %in% table_node_df$numberLetterSection) {
node_df <-
table_node_df %>%
dplyr::filter(numberLetterSection == section_letter)
if ((method == 'max') &
(!section_letter %in% letter_section_other)) {
node_value <-
node_df %>%
dplyr::filter(idNode == max(idNode)) %>%
.$nodeText
if (variable_name %in% c('nameFundGPManagerTrusteeDirector') &
node_df %>% nrow > 2) {
node_value <-
node_df %>% slice(-c(1:2)) %>%
.$nodeText
if (node_value %>% length() > 1) {
node_value <-
node_value %>% paste0(collapse = ' - ')
}
}
if (variable_name %in% c('idSECCRDPrimeBroker', 'idSECCRDCustodian')) {
node_value <-
node_df %>%
dplyr::filter(!nodeText %>% str_detect('If')) %>%
.$nodeText
if (node_value %>% length() > 1) {
node_value <-
node_value %>% paste0(collapse = ' & ')
}
}
has_currency <-
node_value %>% str_detect('\\$') %>% unique()
if (has_currency) {
node_value <-
node_value %>% str_replace('\\$', '') %>%
str_trim() %>%
as.character() %>%
readr::parse_number()
}
if (variable_name %>% str_detect("pct")) {
node_value <-
node_value %>% str_replace('\\%', '') %>% str_trim() %>% as.numeric() %>% suppressWarnings()
}
if (node_value %>% str_detect('\\$') &!is.na(node_value)) {
node_value <-
node_value %>% str_replace('\\$|\\', '')
}
if ((variable_name == 'idFundFormD') &
(node_value %>% str_detect("-") == F)) {
node_value <-
NA
}
}
if ((method == 'location') &
(!section_letter %in% c('7c', '23c', '24d', '25d', '26c'))) {
node_value <-
node_df %>%
dplyr::filter(nodeText %>% str_detect('City:|State:|Country')) %>%
.$nodeText %>%
str_replace_all('City:|State:|Country:', '')
node_value <-
node_value[!node_value == ''] %>% str_trim()
if (node_value %>% length() > 1) {
node_value <-
node_value %>%
paste0(collapse = ', ') %>%
str_trim() %>% str_to_upper()
} else {
node_value <-
node_value %>% str_trim() %>% str_to_upper()
}
}
if (section_letter %in% letter_section_other) {
hitwords <-
'Name|Private fund|If you are filing |Additional Feeder Fund|If the|Yes|No|The location|Additional|If the marketer is registered|Website Address|If the answer to 28(f)|Click the button|If you or another adviser'
has_node_length <-
node_df %>%
dplyr::filter(!nodeText %>% str_detect(hitwords)) %>%
.$nodeText %>% length() > 0
if (has_node_length) {
node_value <-
node_df %>%
dplyr::filter(!nodeText %>% str_detect(hitwords)) %>%
.$nodeText
if ((node_value %>% length()) > 1) {
vars_names <-
c(variable_name,
1:(length(node_value) - 1) %>% paste0(variable_name, .))
}
} else {
node_value <-
NA
}
}
if (section_letter %in% letter_section_location) {
hit_words <-
c("The",
"Under",
'Location of prime',
'The location of',
'Location of') %>%
paste0(collapse = '|')
locations_df <-
node_df %>%
dplyr::select(idNode, nodeText) %>%
dplyr::filter(!nodeText %>% str_detect('Yes|No')) %>%
mutate(idLocationNode = ifelse(nodeText %>% str_detect(hit_words), idNode, NA))
count_df <-
locations_df %>%
dplyr::filter(nodeText %>% str_detect(hit_words)) %>%
mutate(countLocation = 1:n()) %>%
dplyr::select(countLocation, idLocationNode, nodeText)
locations_df <-
locations_df %>% left_join(count_df) %>%
fill(countLocation) %>%
dplyr::filter(!nodeText %>% str_detect("The")) %>%
dplyr::select(countLocation, nodeText) %>%
suppressMessages()
count_locations <-
locations_df$countLocation %>% unique()
node_value <-
count_locations %>%
map_chr(function(x) {
node_vals <-
locations_df %>%
dplyr::filter(countLocation == x) %>%
dplyr::filter(nodeText %>% str_detect('City:|State:|Country')) %>%
.$nodeText %>%
str_replace_all('City:|State:|Country:', '')
node_vals <-
node_vals[!node_vals == ''] %>% str_trim()
if (node_vals %>% length() > 1) {
node_vals <-
node_vals %>%
paste0(collapse = ', ') %>%
str_trim()
} else {
node_vals <-
node_vals %>% str_trim()
}
return(node_vals %>% str_to_upper)
})
if ((node_value %>% length() > 1)) {
vars_names <-
c(variable_name,
1:(length(node_value) - 1) %>% paste0(variable_name, .))
}
}
} else {
node_value <-
NA
}
if ('vars_names' %>% exists()) {
variable_name <-
vars_names
rm(vars_names) %>% suppressWarnings()
}
node_df <-
tibble(nameVariable = variable_name,
valueNode = node_value) %>%
mutate(sectionLetter = section_letter) %>%
dplyr::select(sectionLetter, everything())
return(node_df)
}
.get_section_matrix_df <-
function() {
section_matrix_df <-
tibble(
sectionLetter = c(
"1a",
"1b",
"2b",
"3b",
"7a",
"7b",
"7c",
"7d",
"11b",
"12b",
"13b",
"14b",
"15b",
"16b",
"20b",
"22c",
"23b",
"23c",
"24b",
"24c",
'24cc',
"24d",
"25b",
"25c",
"25d",
"25f",
'25cc',
"26b",
"26c",
"27f",
'28c',
'28d',
'28cc',
'28e',
'28g'
),
variableName = c(
"nameFund",
"idPrivateFund",
"locationFundIncorporation",
"nameFundGPManagerTrusteeDirector",
"nameFeederFund",
"idPrivateFundFeeder",
"locationFeederFund",
"nameFundFeederGPManagerTrusteeDirector",
"amountFundGrossAUM",
"amountFundMinimumInvestment",
"countFundOwners",
"pctFundManagerOwned",
"pctFundFundOfFunds",
"pctFundNonUSCitizens",
"pctClientsFundInvestor",
"idFundFormD",
"nameAuditorFund",
"locationAuditorFund",
"namePrimeBroker",
"idSECPrimeBroker",
'idCRDPrimeBroker',
"locationPrimeBroker",
"nameCustodianLegal",
"nameCustodianBusiness",
"locationCustodian",
"idSECCustodian",
'idCRDCustodian',
"nameAdministratorFund",
"locationAdministratorFund",
"pctFund3rdPartyValued",
'nameFundMarketer',
'idSECFundMarketer',
'idCRDFundMarketer',
'locationFundMarketer',
'urlFundMarketer'
)
)
return(section_matrix_df)
}
.parse_funds_tables <-
function(page,
table_css_node_df = table_css_node_df,
section_matrix_df = section_matrix_df,
return_message = T) {
parse_fund_table <-
function(page,
table_number = 1,
table_css_node_df = table_css_node_df,
section_matrix_df = section_matrix_df) {
table_node_df <-
.parse_table_node_df(
page = page,
table_number = table_number,
table_css_node_df = table_css_node_df
)
response_df <-
.get_table_check_box_data(
page = page,
table_number = table_number,
table_css_node_df = table_css_node_df,
only_radio = T
)
.extract_node_data_safe <-
purrr::possibly(.extract_node_data, tibble())
fund_table_data <-
1:nrow(section_matrix_df) %>%
future_map_dfr(function(x) {
.extract_node_data(
table_node_df = table_node_df,
variable_name = section_matrix_df$variableName[x],
section_letter = section_matrix_df$sectionLetter[x],
method = 'max'
) %>%
mutate(valueNode = valueNode %>% as.character())
}) %>%
suppressWarnings() %>%
mutate(valueNode = ifelse(valueNode %in% c('', '-'), NA, valueNode)) %>%
dplyr::filter(!valueNode %>% is.na())
col_order <-
fund_table_data$nameVariable
fund_table_data <-
fund_table_data %>%
dplyr::select(-sectionLetter) %>%
spread(nameVariable, valueNode) %>%
dplyr::select(one_of(col_order)) %>%
mutate(numberFund = table_number) %>%
dplyr::select(numberFund, everything())
has_amount <-
fund_table_data %>% dplyr::select(dplyr::matches("^count|amount")) %>% names() %>% length() > 0
if (has_amount) {
fund_table_data <-
fund_table_data %>%
mutate_at(.vars = fund_table_data %>% dplyr::select(dplyr::matches("^count|amount")) %>% names(),
.funs = as.numeric)
}
has_percent <-
fund_table_data %>% dplyr::select(dplyr::matches("^pct")) %>% names() %>% length() > 0
if (has_percent) {
fund_table_data <-
fund_table_data %>%
mutate_at(
.vars = fund_table_data %>% dplyr::select(dplyr::matches("^pct")) %>% names(),
.funs = funs(. %>% as.numeric() / 100)
)
}
fund_table_data <-
fund_table_data %>%
left_join(response_df) %>%
dplyr::select(numberFund:nameFund, typeFund, everything()) %>%
suppressMessages()
return(fund_table_data)
}
parse_fund_table_safe <-
possibly(parse_fund_table, NULL)
table_numbers <-
table_css_node_df$numberTable
all_table_data <-
table_numbers %>%
future_map_dfr(function(x) {
parse_fund_table_safe(
page = page,
table_number = x,
table_css_node_df = table_css_node_df,
section_matrix_df = section_matrix_df
)
})
return(all_table_data)
}
# form_sections -----------------------------------------------------------
.get_section_1_data <-
function(url = 'https://files.adviserinfo.sec.gov/IAPD/content/viewform/adv102017/Sections/iapd_AdvIdentifyingInfoSection.aspx?ORG_PK=138854&FLNG_PK=0324B906000801AC01C4BA71024984CD056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
get_node_item_df <- function() {
items <- c("isUmbrellaADVFiling",
'hasChangedLegalName',
'hasChangedBusinessName',
'isPrivateResidence',
'isOfficeOpenMondayFriday',
'isOfficeOpenOther',
'isPrivateResidence2',
"hasCompanySocialMediaAccounts",
"hasNoCompanySocialMediaAccounts",
"hasSection204Accounting",
"hasNoSection204Accounting",
"hasForeignRegulatoryRegistration",
"hasNoForeignRegulatoryRegistration",
"is1934Filer",
"isNo1934Filer",
"hasAUMOver1B",
"hasNoAUMOver1B",
"rangeAUM1Bto10B",
"rangeAUM10Bto50B",
"rangeAUMOver50B")
data <-
tibble(nameItem = items, valueItem = T) %>%
mutate(idRow = 1:n())
data
}
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
.find_text_node_safe <-
possibly(.find_text_node, NULL)
.parse_node_df <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df(), by = "nodeName") %>%
mutate(idRow = 1:n()) %>%
left_join(get_node_item_df(), by = "idRow") %>%
mutate(valueItem = T) %>%
dplyr::filter(isNodeChecked) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
column_order <-
node_df$nameItem
node_df <-
node_df %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
return(node_df)
}
.parse_value_nodes <-
function(page) {
node_text <-
page %>%
.parse_node_table_to_text(css_node = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_IdentInfoPHSection_ctl00_trIAPDHeader + tr +tr td td')
business_name_df <-
tibble(
nameItem = c(
'nameEntityManagerLegal',
'nameEntityManagerBusiness',
'idSEC',
'addressStreet1OfficePrimary',
'addressStreet2OfficePrimary',
'cityOfficePrimary',
'stateOfficePrimary',
'countryOfficePrimary',
'zipOfficePrimary',
'hoursOffice',
'phoneOfficePrimary',
'idLEI'
),
hit_words = c(
'Your full legal name',
'Name under which you primarily conduct your advisory business, if different from',
'egistered with the SEC as an investment adviser',
'Number and Street 1:',
'Number and Street 2:',
'City:',
'State:',
'Country:',
'Postal Code:',
'Normal business hours at this location:',
'Telephone number at this location:',
'Provide your Legal Entity Identifier'
),
off_set = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1),
is_numeric_node = rep(F, 12)
)
business_data_df <-
1:nrow(business_name_df) %>%
future_map_dfr(function(x) {
tibble(
nameItem =
business_name_df$nameItem[[x]],
value =
.find_text_node_safe(
node_text = node_text,
hit_words = business_name_df$hit_words[[x]],
off_set = business_name_df$off_set[[x]],
is_numeric_node = business_name_df$is_numeric_node[[x]]
)
)
}) %>%
mutate(value = value %>% sub('\\:', '>', .)) %>%
separate(value, into = c('remove', 'value'), sep = "\\>") %>%
suppressWarnings()
business_data_df <-
business_data_df %>%
mutate(value = if_else(value %>% is.na, remove, value),
value = value %>% str_trim()) %>%
dplyr::select(nameItem, value) %>%
dplyr::filter(!value == '') %>%
mutate(idRow = 1:n()) %>%
group_by(nameItem) %>%
dplyr::filter(idRow == min(idRow)) %>%
ungroup() %>%
dplyr::select(-idRow) %>%
distinct()
column_order <-
business_data_df$nameItem
business_data_df <-
business_data_df %>%
spread(nameItem, value) %>%
dplyr::select(one_of(column_order))
has_secondary_office <-
'addressStreet2OfficePrimary' %in% names(business_data_df)
if (has_secondary_office) {
business_data_df <-
business_data_df %>%
unite(
addressStreet1OfficePrimary,
addressStreet1OfficePrimary,
addressStreet2OfficePrimary,
sep = ' '
)
}
has_office_location <-
names(business_data_df) %>% str_count('^address|^city|^country|^state') %>% sum() >= 4
if (has_office_location) {
business_data_df <-
business_data_df %>%
mutate(
addressOfficePrimary = addressStreet1OfficePrimary %>% paste0(
' ',
cityOfficePrimary,
', ',
stateOfficePrimary,
' ',
countryOfficePrimary,
' ',
zipOfficePrimary
) %>% str_to_upper()
)
}
if (business_data_df$idLEI == 'A legal entity identifier') {
business_data_df <-
business_data_df %>%
dplyr::select(-idLEI)
}
return(business_data_df)
}
section_data <-
page %>%
.parse_value_nodes() %>%
mutate(idCRD, nameEntityManager = name_entity_manager) %>%
left_join(page %>%
.parse_node_df() %>% mutate(idCRD, nameEntityManager = name_entity_manager)) %>%
.select_start_vars() %>%
suppressMessages()
section_data <-
section_data %>%
mutate_at(.vars = section_data %>%
dplyr::select(dplyr::matches("^address|^city|^state|^country")) %>%
names(),
list(. %>% str_to_upper())) %>%
mutate_if(is.character,
list(function(x){
x %>% str_remove_all("Item 1.A.") %>% str_trim()
}))
return(section_data)
}
.get_section_2_data <-
function(url = 'https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvSecRegistrationSection.aspx?ORG_PK=158207&FLNG_PK=056E2F260008019605946011010394F5056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
get_node_item_df <- function(page) {
node_item_df <-
c(
"hasAUMGreater100M",
"hasAUMUnder100MOver25m",
"hasPrincipalOfficeWY",
"hasPrincipalOfficeForeign",
"isAdviser1940InvestmentActCompany",
"isAdviserBusinessDevelopmentCompany25MInCapital",
"isAdviserPensionCapitalGreater200M",
"isAdviserRelated203A",
"isAdviserNew203A",
"isAdviserMultiState203A",
"isAdviserInternet",
"hasSECOrderProhibitingRegistration",
"isAdviserSECIneligible"
) %>%
.has_item_check_name()
state_exists <-
page %>%
html_nodes('.main') %>%
html_nodes(
'#ctl00_ctl00_cphMainContent_cphAdvFormContent_SECRegisteredPHSection_ctl00_FourthColumn'
) %>%
length() > 0
if (state_exists) {
states <-
c(
"AL",
"AK",
"AZ",
"AR",
"CA",
"CO",
"CT",
"DE",
"DC",
"FL",
"GA",
"GU",
"HI",
"ID",
"IL",
"IN",
"IA",
"KS",
"KY",
"LA",
"ME",
"MD",
"MA",
"MI",
"MN",
"MS",
"MO",
"MT",
"NE",
"NV",
"NH",
"NJ",
"NM",
"NY",
"NC",
"ND",
"OH",
"OK",
"OR",
"PA",
"PR",
"RI",
"SC",
"SD",
"TN",
"TX",
"UT",
"VT",
"VI",
"VA",
"WA",
"WV",
"WI"
)
item_name <-
'stateRegistered'
state_df <-
seq_along(states) %>%
future_map_dfr(function(x) {
tibble(nameItem = item_name,
valueItem = states[x]) %>%
unite(fullnameItem,
nameItem,
valueItem,
sep = '.',
remove = F)
})
node_item_df <-
node_item_df %>% mutate(valueItem = valueItem %>% as.character()) %>%
bind_rows(state_df)
}
return(node_item_df)
}
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_node_df <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
check_nodes <-
check_nodes[!check_nodes == '']
if (check_nodes %>% length() == 3) {
node_item_df <-
tibble(
nameItem = c(
'hasExemptionAsSolelyVentureAdviser',
'hasExemptionAsPrivateFundManagerUnder150MAUM',
'hasExemptionSoleyPrivateFundManagerAUMOver150M'
),
valueItem = T
)
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
mutate(isNodeChecked = if_else(nodeName %>% str_detect('Checkbox not checked'), F, T)) %>%
bind_cols(node_item_df) %>%
mutate(valueItem = T) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
}
if (!check_nodes %>% length() == 3) {
node_item_df <-
get_node_item_df(page = page)
if (check_nodes %>% length() == 56) {
node_item_df <-
node_item_df %>%
dplyr::filter(nameItem %>% str_detect("^state")) %>%
dplyr::select(nameItem, valueItem)
node_item_df <-
tibble(
nameItem = c(
'hasExemptionAsSolelyVentureAdviser',
'hasExemptionAsPrivateFundManagerUnder150MAUM',
'hasExemptionSoleyPrivateFundManagerAUMOver150M'
),
valueItem = T %>% as.character()
) %>% bind_rows(node_item_df)
}
if (check_nodes %>% length() == 66) {
node_item_df <-
node_item_df %>%
dplyr::filter(nameItem %>% str_detect("^state")) %>%
dplyr::select(nameItem, valueItem)
node_item_df <-
tibble(
nameItem = c(
"hasAUMGreater100M",
"hasAUMUnder100MOver25m",
"hasPrincipalOfficeWY",
"hasPrincipalOfficeForeign",
"isAdviser1940InvestmentActCompany",
"isAdviserBusinessDevelopmentCompany25MInCapital",
"isAdviserPensionCapitalGreater200M",
"isAdviserRelated203A",
"isAdviserNew203A",
"isAdviserMultiState203A",
"isAdviserInternet",
"hasSECOrderProhibitingRegistration",
"isAdviserSECIneligible"
),
valueItem = T %>% as.character()
) %>% bind_rows(node_item_df)
}
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
tidyr::replace_na(list(isNodeChecked = F)) %>%
mutate(isNodeChecked = if_else(nodeName %>% str_detect('Checkbox not checked'), F, T)) %>%
bind_cols(node_item_df) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
}
node_df <-
node_df %>%
group_by(nameItem) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
mutate(nameEntityManager = name_entity_manager) %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, nameItem, countItem, sep = '') %>%
distinct() %>%
suppressWarnings()
col_order <-
c('nameEntityManager', node_df$item)
node_df <-
node_df %>%
spread(item, valueItem) %>%
dplyr::select(one_of(col_order)) %>%
.mutate_adv_data()
return(node_df)
}
section_data <-
page %>% parse_node_df() %>%
mutate(idCRD, nameEntityManager = name_entity_manager) %>%
.select_start_vars()
return(section_data)
}
.get_section_3_data <-
function(url = 'https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvFormOfOrgSection.aspx?ORG_PK=158207&FLNG_PK=056E2F260008019605946011010394F5056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_organizational_form_data <-
function(page) {
boxes <-
page %>%
html_nodes('img') %>%
html_attr('alt') %>%
str_trim()
boxes <-
boxes[boxes %>%
str_detect("Radio")]
typeEntityManager <-
tibble(
typeEntityManager = c(
'Corporation',
'Sole Proprietorship',
'LLP',
'Partnership',
'LLC',
'LP',
'Other'
),
nodeName = boxes
) %>%
left_join(.get_check_box_value_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
suppressMessages() %>%
.$typeEntityManager
nodes <-
page %>%
html_nodes('.PrintHistRed') %>%
html_text()
monthFiscalYearEnd <-
nodes[1]
if (nodes %>% length() == 3) {
locationEntityOrganized <-
nodes[2:3] %>% str_to_upper() %>% paste0(collapse = ', ')
} else {
locationEntityOrganized <-
NA
}
org_data <-
tibble(typeEntityManager,
monthFiscalYearEnd,
locationEntityOrganized)
return(org_data)
}
section_data <-
page %>%
parse_organizational_form_data() %>%
mutate(idCRD, nameEntityManager = name_entity_manager) %>%
.select_start_vars()
return(section_data)
}
.get_section_4_data <-
function(url = 'https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvSuccessionsSection.aspx?ORG_PK=158207&FLNG_PK=056E2F260008019605946011010394F5056C8CC0',
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
get_node_item_df <-
function() {
node_item_df <-
'isSucceedingRIABusiness' %>%
.get_item_name_yes_no_df()
return(node_item_df)
}
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_node_df <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
column_order <-
node_df$nameItem
node_df <-
node_df %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
has_ria_node <-
'isSucceedingRIABusiness' %in% names(node_df)
if (has_ria_node) {
if (node_df$isSucceedingRIABusiness == T) {
node_text <-
page %>%
.parse_node_table_to_text(
'#ctl00_ctl00_cphMainContent_cphAdvFormContent_SuccessionPHSection_ctl00_trIAPDHeader+ tr td'
)
dateSuccession <-
node_text %>%
.find_text_node(
hit_words = "Date of Succession",
off_set = 0,
is_numeric_node = F
) %>%
lubridate::mdy()
node_df <-
node_df %>%
mutate(dateSuccession)
}
}
return(node_df)
}
name_entity_manager <-
page %>%
.get_entity_manager_name()
section_data <-
page %>% parse_node_df() %>%
mutate(idCRD, nameEntityManager = name_entity_manager) %>%
.select_start_vars()
if (!return_wide) {
section_data <-
section_data %>%
gather(itemName, value, -c(idCRD, nameEntityManager))
}
return(section_data)
}
.get_section_5_data <-
function(url = "https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvAdvisoryBusinessSection.aspx?ORG_PK=158207&FLNG_PK=056E2F260008019605946011010394F5056C8CC0") {
section_name <-
'section5AdvisoryBusinessInformation'
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
if (!'sitefuture_map_dfr' %>% exists()) {
url <-
glue::glue('http://www.adviserinfo.sec.gov/IAPD/crd_iapd_AdvVersionSelector.aspx?ORG_PK={idCRD}')
sitefuture_map_dfr <-
.parse_adv_manager_sitemap_df(url =url, return_wide = F, manager = name_entity_manager)
}
is_new_iapd <- page %>% html_nodes('.QueryHeaderLabel') %>% html_text() %>% str_detect(" Amount of Regulatory Assets") %>% sum(na.rm = T) > 0
section_exists <-
section_name %in% sitefuture_map_dfr$idSection
if (!section_exists) {
stop(section_name %>% paste0(" does not exists for ", idCRD))
}
.get_section_5_node_name_df <- function() {
base <- "hasLessThan5Clients"
terms <- c("IndividualNonHighNetWorth", "IndividualHighNetWorth", "BankThrift", "PensionPlan",
"ClientsCharitable","StateMunicipalGovernment","InvestmentAdviserOther","InsuranceCompany","SoverignWealthFund","CorporationOther","OtherAll")
ranges <- glue::glue("{base}{terms}") %>% as.character()
fees <-
c("hasFeeAUM",
"hasFeeHourlyCharge",
"hasFeeSubscription",
"hasFeeFixed",
"hasFeeCommission",
"hasFeePerformance",
"hasFeeOther",
"hasSecuritiesPortfolioManagement.TRUE",
"hasSecuritiesPortfolioManagement.FALSE",
"hasFinancialPlanning",
"hasPortfolioManagementIndividualSmallBusiness",
"hasPortfolioManagementInvestmentCompanies",
"hasPortfolioManagementPooledInvestmentVehicles",
"hasPortfolioManagementInstitutionalClients",
"hasServicePensionConsulting",
"hasServiceInvestmentAdviserSelection",
"hasServicePeriodicalPublication",
"hasServiceSecurityRating",
"hasServiceMarketTiming",
"hasServiceEducationSeminars",
"hasServiceOther",
"rangeClientsFinancialPlanningZero",
"rangeClientsFinancialPlanning1to10",
"rangeClientsFinancialPlanning11to25",
"rangeClientsFinancialPlanning26to50",
"rangeClientsFinancialPlanning51to100",
"rangeClientsFinancialPlanning101to250",
"rangeClientsFinancialPlanning251to500",
"rangeClientsFinancialPlanningOver500",
"hasFeeWrapSponsor.TRUE",
"hasFeeWrapSponsor.FALSE",
"hasInvestmentAdviceLimited.TRUE",
"hasInvestmentAdviceLimited.FALSE",
"hasDifferentClientReportMethod.TRUE",
"hasDifferentClientReportMethod.FALSE",
"hasAUMOther5D.TRUE",
"hasAUMOther5D.FALSE",
"hasSeperateAccountMargin.TRUE",
"hasSeperateAccountMargin.FALSE",
"hasSeperateAccountDervatives.TRUE",
"hasSeperateAccountDervatives.FALSE",
"hasCustodian10PCT.TRUE",
"hasCustodian10PCT.FALSE"
)
tibble(nameItem = c(ranges,fees)) %>%
separate(nameItem, into = c("nameItem", "valueItem"), sep = "\\.") %>%
mutate(valueItem = ifelse(is.na(valueItem), T, valueItem) %>% as.logical()) %>%
suppressWarnings() %>%
mutate(idRow = 1:n())
}
section_exists <-
'section5AdvisoryBusinessInformation' %in% sitefuture_map_dfr$idSection
if (!section_exists) {
stop("Company structure data does not exists for " %>% paste0(idCRD))
}
parse_company_structure_page <-
function(page) {
parse_section_5_check_nodes <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
client_summary_image_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
suppressMessages()
client_summary_image_df <-
client_summary_image_df %>%
mutate(idRow = 1:n()) %>%
left_join(
.get_section_5_node_name_df(),
by = "idRow"
) %>%
select(-idRow) %>%
suppressMessages() %>%
dplyr::filter(isNodeChecked) %>%
dplyr::select(nameItem, valueItem)
column_order <-
client_summary_image_df$nameItem
client_summary_df <-
client_summary_image_df %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
client_summary_df <-
client_summary_df %>%
mutate_at(.vars = client_summary_df %>% dplyr::select(dplyr::matches("^is|^has")) %>% names(),
.funs = as.logical)
client_summary_df
}
.find_text_node_safe <-
possibly(.find_text_node, NULL)
name_entity_manager <-
page %>%
.get_entity_manager_name()
##
node_text <-
page %>%
.parse_node_table_to_text(css_node = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ClientCompensation_ctl00_trIAPDHeader + tr + tr')
employee_node_df <-
tibble(
nameItem = c(
"countEmployeesTotal",
"countEmployeesInvestmentAdvisory",
"countEmployeesBrokerDealer",
"countEmployeesStateRegisteredInvestmentAdviser",
"countEmployeesStateRegisteredInvestmentAdviserMultipleEntities",
"countEmployeesLicensedInsuranceAgents",
"countEmployeesSolicitAdvisoryClients",
"pctClientsNonUS"
),
hit_words = c(
'but do not include any clerical workers.',
' perform investment advisory',
'registered representatives of a broker-dealer',
'investment adviser representatives',
'for an investment adviser other than you',
'licensed agents of an insurance company or agency',
'Approximately how many firms or other',
'are non-United States persons'
),
off_set = c(1, 1, 1, 1, 1, 1, 5, 1),
is_numeric_node = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T)
)
employee_count_df <-
1:nrow(employee_node_df) %>%
future_map_dfr(function(x) {
tibble(
nameItem =
employee_node_df$nameItem[[x]],
value =
.find_text_node_safe(
node_text = node_text,
hit_words = employee_node_df$hit_words[[x]],
off_set = employee_node_df$off_set[[x]],
is_numeric_node = employee_node_df$is_numeric_node[[x]]
)
)
}) %>%
spread(nameItem, value) %>%
dplyr::select(one_of(employee_node_df$nameItem)) %>%
mutate_at(.vars = 'pctClientsNonUS',
.funs = funs(. / 100)) %>%
suppressWarnings()
node_text <-
page %>%
.parse_node_table_to_text(
'#ctl00_ctl00_cphMainContent_cphAdvFormContent_AssetsUnderMgnmt_ctl00_trIAPDHeader + tr table td tr td'
)
aum_value_name_df <-
tibble(
nameItem = c(
"amountAUMDiscretionary",
"amountAUMNonDiscretionary",
"amountAUMTotal",
"countAccountsDiscretionary",
"countAccountsNonDiscretionary",
"countAccountsTotal"
),
hit_words = c(
c(
'Discretionary',
'Non-Discretionary',
'Total:',
'Discretionary',
'Non-Discretionary',
'Total:'
)
),
off_set = c(2, 2, 2, 4, 4, 4),
is_numeric_node = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)
)
has_aum_df <-
node_text %>% str_count(aum_value_name_df$hit_words %>% paste0(collapse = "|")) %>% sum() > 0
if (has_aum_df) {
aum_value_df <-
1:nrow(aum_value_name_df) %>%
future_map_dfr(function(x) {
has_value <-
.find_text_node_safe(
node_text = node_text,
hit_words = aum_value_name_df$hit_words[[x]],
off_set = aum_value_name_df$off_set[[x]],
is_numeric_node = aum_value_name_df$is_numeric_node[[x]]
) %>% length() > 0
if (has_value) {
val <-
.find_text_node_safe(
node_text = node_text,
hit_words = aum_value_name_df$hit_words[[x]],
off_set = aum_value_name_df$off_set[[x]],
is_numeric_node = aum_value_name_df$is_numeric_node[[x]]
)
} else {
val <-
NA
}
tibble(nameItem =
aum_value_name_df$nameItem[[x]],
value =
val)
}) %>%
distinct() %>%
group_by(nameItem) %>%
dplyr::filter(value == max(value)) %>%
ungroup() %>%
spread(nameItem, value) %>%
suppressWarnings()
aum_value_df <-
aum_value_df %>%
dplyr::select(dplyr::matches(aum_value_name_df$nameItem %>% paste0(collapse = '|'))) %>%
mutate(nameEntityManager = name_entity_manager)
} else {
aum_value_df <-
tibble(nameEntityManager = name_entity_manager)
}
check_node_df <-
page %>% parse_section_5_check_nodes()
section_5_data <-
tibble(idCRD,
nameEntityManager = name_entity_manager) %>%
bind_cols(list(employee_count_df, check_node_df)) %>%
left_join(aum_value_df) %>%
suppressMessages()
node_text <-
page %>%
.parse_node_table_to_text(css_node = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ClientCompensation_ctl00_trIAPDHeader + tr + tr')
if (node_text[grepl("[[:upper:]]+$", node_text)] %>% unique() %>%
length() == 1) {
other_value <-
node_text[grepl("[[:upper:]]+$", node_text)] %>% unique()
section_5_data <-
section_5_data %>%
mutate(typeOther = other_value) %>%
dplyr::select(idCRD,
nameEntityManager,
dplyr::matches("typeOther"),
everything())
}
if (node_text[grepl("[[:upper:]]+$", node_text)] %>% unique() %>%
length() > 1) {
other_value <-
node_text[grepl("[[:upper:]]+$", node_text)] %>% unique()
type_df <-
tibble(nameItem = rep('typeOther', length(other_value)),
value = other_value) %>%
mutate(
countItem = (1:n()) - 1,
fullnameItem = if_else(
countItem > 0,
nameItem %>% paste0(".", countItem),
nameItem
)
) %>%
dplyr::select(fullnameItem, value) %>%
spread(fullnameItem, value) %>%
mutate(nameEntityManager = name_entity_manager)
section_5_data <-
section_5_data %>%
left_join(type_df) %>%
suppressMessages() %>%
dplyr::select(idCRD,
nameEntityManager,
dplyr::matches("typeOther"),
everything())
}
return(section_5_data)
}
section_5_data <-
page %>%
parse_company_structure_page() %>%
mutate(nameEntityManager = name_entity_manager) %>%
.select_start_vars()
if ('countAccountsNonDiscretionary' %in% names(section_5_data)) {
section_5_data <-
section_5_data %>%
mutate(countAccountsNonDiscretionary = countAccountsTotal - countAccountsDiscretionary)
}
section_5_data
}
.get_section_6_data <-
function(url = 'https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvOtherBusinessSection.aspx?ORG_PK=156663&FLNG_PK=02D633120008019C05413701015E4D91056C8CC0',
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
get_node_item_df <-
function() {
items <- c("isBrokerDealer", "isBrokerDealerRep", "isCommodityPoolOperator", "isFuturesMerchant", "isRealEstateBroker", "isInsuranceBroker", "isBank", "isTrustCompany", "isMunicipalAdvisor", "isSwapDealer", "isSwapParticipant", "isAccountingFirm", "isLawFirm", "hasOtherFinancialProducts",
"hasOtherBusiness6a.TRUE", "hasNoOtherBusiness6a.TRUE",
"isOtherBusinessPrimary.TRUE", "isOtherBusinessPrimary.FALSE",
"hasOtherNonFinancialClientServices.TRUE", "hasNoOtherNonFinancialClientServices.TRUE")
node_item_df <-
tibble(nameItem = items) %>%
separate(nameItem,
into = c("nameItem", "valueItem"),
sep = "\\.") %>%
separate(nameItem, into = c("nameItem", "valueItem"), sep = "\\.") %>%
mutate(valueItem = ifelse(is.na(valueItem), T, valueItem) %>% as.logical()) %>%
suppressWarnings() %>%
mutate(idRow = 1:n())
return(node_item_df)
}
parse_section_6 <-
function(page, return_wide) {
parse_section_6_check_nodes <-
function(page, return_wide) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df(), by = "nodeName") %>%
mutate(idRow = 1:n()) %>%
left_join(
get_node_item_df(),
by = "idRow"
) %>%
filter(isNodeChecked) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
if (return_wide) {
column_order <-
node_df$nameItem
node_df <-
node_df %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
}
return(node_df)
}
name_entity_manager <-
page %>%
.get_entity_manager_name()
section_data <-
page %>%
parse_section_6_check_nodes(return_wide = return_wide) %>%
mutate(nameEntityManager = name_entity_manager)
return(section_data)
}
section_data <-
page %>%
parse_section_6(return_wide = return_wide) %>%
mutate(idCRD) %>%
.select_start_vars()
return(section_data)
}
.get_section_7a_data <-
function(url = "https://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvFinancialAffiliationsSection.aspx?ORG_PK=156663&FLNG_PK=02D633120008019C05413701015E4D91056C8CC0",
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
get_node_item_df <-
function() {
node_item_df <-
tibble(
nameItem = c(
'isBrokerDealer',
'isOtherInvestmentAdvisor',
'isRegisteredMunicipalAdvisor',
'isRegisteredSwapDealer',
'isMajorSwapParticipant',
'isCommodityPoolOperator',
'isFutureCommissionMerchant',
'isBankThrift',
'isTrustCompany',
'isAccountingFirm',
'isLawFirm',
'isInsuranceCompany',
'isPensionConsultant',
'isRealEstateBrokerDealer',
'isNonPooledInvestmentSponsor',
'isPooledInvestmentSponsor'
)
)
return(node_item_df)
}
parse_affliations <-
function(page, return_wide) {
parse_section_7_check_nodes <-
function(page, return_wide) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
has_nodes <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
mutate(valueItem = T) %>%
suppressMessages() %>%
dplyr::filter(isNodeChecked == T) %>% nrow > 0 %>%
suppressMessages()
if (has_nodes) {
affiliation_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
mutate(valueItem = T) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
if (return_wide) {
column_order <-
affiliation_df$nameItem
affiliation_df <-
affiliation_df %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
}
} else {
affiliation_df <-
tibble(idCRD, nameEntityManager = name_entity_manager)
}
return(affiliation_df)
}
parse_section_7_check_nodes_safe <-
possibly(parse_section_7_check_nodes, NULL)
section_data <-
page %>%
parse_section_7_check_nodes(return_wide = return_wide) %>%
mutate(nameEntityManager = name_entity_manager)
return(section_data)
}
section_data <-
page %>%
parse_affliations(return_wide = return_wide) %>%
mutate(idCRD) %>%
dplyr::select(idCRD, nameEntityManager, everything())
return(section_data)
}
.get_section_7b_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvPrivateFundReportingSection.aspx?ORG_PK=162351&FLNG_PK=052DAAB400080184043CE66005E35E29056C8CC0',
return_wide = F,
return_message = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
parse_private_fund_data <-
function(page, return_wide, return_message = T) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
main <-
page %>%
html_nodes('.main div')
all_div_ids <-
main %>%
html_attr('id') %>%
unique() %>%
discard(is.na)
has_fund_data <-
length(all_div_ids) >= 1
if (has_fund_data) {
page_table_nodes <- all_div_ids[all_div_ids %>% str_detect("pnlFund")]
page_table_nodes <- str_c("#", page_table_nodes)
fund_count <- length(page_table_nodes)
page_sequences <-
seq(3, length.out = fund_count) %>% as.character() %>%
map_chr(function(x) {
if (x %>% nchar() == 1) {
paste0('0', x)
} else {
x
}
})
table_css_node_df <-
tibble(
numberTable = 1:fund_count,
pageSequenceNode = page_sequences,
tableCSSNode = page_table_nodes
)
section_matrix_df <-
.get_section_matrix_df()
all_data <-
.parse_funds_tables(
page = page,
return_message = return_message,
table_css_node_df = table_css_node_df,
section_matrix_df = section_matrix_df
)
all_data <-
all_data %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything())
if (return_message) {
total_aum <-
all_data$amountFundGrossAUM %>% sum(na.rm = T) %>% formattable::currency()
glue::glue("Parsed {name_entity_manager} they have {fund_count} fund vehicles
and {total_aum} in private fund AUM") %>% cat(fill = T)
}
if (return_wide) {
all_data <-
all_data %>%
dplyr::rename(countItem = numberFund) %>%
.widen_adv_data() %>%
.mutate_adv_data()
}
} else {
all_data <-
tibble(nameEntityManager = name_entity_manager)
}
all_data
}
parse_private_fund_data_safe <-
possibly(parse_private_fund_data, NULL)
section_data <-
page %>%
parse_private_fund_data_safe(return_wide = return_wide,
return_message = return_message)
section_data <-
section_data %>%
mutate(idCRD) %>%
.select_start_vars() %>%
.munge_fund_names()
return(section_data)
}
.get_section_8_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvClientTransSection.aspx?ORG_PK=150510&FLNG_PK=00B175BA0008018601B35551000582C5056C8CC0',
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
get_node_item_df <-
function() {
node_item_df <-
c(
"isSecuritiesBuyerFromClientsForSelfToClientsFromOwned",
"isSecuritiesFirmBoughSoldClientRecommended",
"isSecuritiesClientRecommendedFirmOwnedSecurity",
"hasTradeExecutionClient",
"isSecuritiesUnderwriterPurchaserManagerClientRecommendedSecurity",
"hasRecommendedPurchaseSaleFirmOwnedSecurity",
"hasClientDiscretionBuySell",
"hasClientDiscretionBuySellAmount",
"hasClientDiscretionBrokerSelection",
"hasClientDiscretionCommisionCost",
"isClientBrokerRelatedParty",
"hasClientBrokerRecommendation",
"isClientBrokerRecommenationRelatedParty",
"isBrokerSoftDollarRecipient",
"isBrokerSoftDollarEligibleResearchService",
"hasCompensationForClientReferrals",
"isCompensatedForClientReferrals"
) %>%
.get_item_name_yes_no_df()
return(node_item_df)
}
parse_section_8 <-
function(page, return_wide) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_section_8_nodes <-
function(page, return_wide) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
section_data <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, nameItem, valueItem) %>%
suppressMessages()
if (return_wide) {
column_order <-
c('nameEntityManager', section_data$nameItem)
section_data <-
section_data %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
}
return(section_data)
}
section_data <-
page %>%
parse_section_8_nodes(return_wide = return_wide)
return(section_data)
}
section_data <-
page %>%
parse_section_8(return_wide = return_wide) %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
return(section_data)
}
.get_section_9_data <-
function(url = 'https://adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvCustodySection.aspx?ORG_PK=134600&FLNG_PK=04189D5A00080188013572C10022C501056C8CC0',
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
get_node_item_df <-
function() {
node_item_df <-
tibble(
namefullItem =
c(
"hasCustodyClientCash.TRUE",
"hasCustodyClientCash.FALSE",
"hasCustodyClientSecurities.TRUE",
"hasCustodyClientSecurities.FALSE",
"hasCustodyClientCashRelatdPerson.TRUE",
"hasCustodyClientCashRelatdPerson.FALSE",
"hasCustodyClientSecuritiesRelatedPerson.TRUE",
"hasCustodyClientSecuritiesRelatedPerson.FALSE",
"hasQuarterlyStatementsFromCustodian",
"hasAnnualIndependentFundAudit",
"hasSurpriseIndependentFundAudit",
"hasControlPersonReport",
"isQualifiedCustodian.TRUE",
"isQualifiedCustodian.FALSE",
"hasRelatedQualifiedCustodian.TRUE",
"hasRelatedQualifiedCustodian.FALSE"
),
nameItem =
c(
"hasCustodyClientCash",
"hasCustodyClientCash",
"hasCustodyClientSecurities",
"hasCustodyClientSecurities",
"hasCustodyClientCashRelatdPerson",
"hasCustodyClientCashRelatdPerson",
"hasCustodyClientSecuritiesRelatedPerson",
"hasCustodyClientSecuritiesRelatedPerson",
"hasQuarterlyStatementsFromCustodian",
"hasAnnualIndependentFundAudit",
"hasSurpriseIndependentFundAudit",
"hasControlPersonReport",
"isQualifiedCustodian",
"isQualifiedCustodian",
"hasRelatedQualifiedCustodian",
"hasRelatedQualifiedCustodian"
),
valueItem =
c(
TRUE,
FALSE,
TRUE,
FALSE,
TRUE,
FALSE,
TRUE,
FALSE,
TRUE,
TRUE,
TRUE,
TRUE,
TRUE,
FALSE,
TRUE,
FALSE
)
)
return(node_item_df)
}
parse_custody <-
function(page, return_wide) {
parse_custody_nodes <-
function(page, return_wide) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
node_df <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
mutate(value = T) %>%
dplyr::filter(isNodeChecked == T) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, nameItem, value) %>%
suppressMessages()
if (return_wide) {
column_order <-
c('nameEntityManager', node_df$nameItem)
node_df <-
node_df %>%
spread(nameItem, value) %>%
dplyr::select(one_of(column_order))
}
return(node_df)
}
section_data <-
page %>%
parse_custody_nodes(return_wide = return_wide)
return(section_data)
}
section_data <-
page %>%
parse_custody(return_wide = return_wide) %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
node_text <-
page %>%
.parse_node_table_to_text(css_node = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_CustodyPH_ctl00_trIAPDHeader + tr + tr')
has_nodes <-
node_text %>% str_count('\\$') %>% sum() > 0
if (has_nodes) {
custody_name_df <-
tibble(
nameItem = c(
"amountAUMClientFundsInCustody",
"countClientFundsInCustody",
"amountAUMClientFundsInCustodyRelatedParty",
"countClientFundsInCustodyRelatedParty",
'countQualifiedCustodians',
'monthYearIndependentInspection'
),
hit_words =
c(
'^custody',
'^custody',
'^for which your related persons',
'^for which your related persons',
'in connection with advisory services you provide to',
'during your last fiscal year'
),
off_set = c(4, 6, 5, 7, 2, 1),
is_numeric_node = c(TRUE, TRUE, TRUE, TRUE, TRUE, F)
)
.find_text_node_safe <-
possibly(.find_text_node, NULL)
custody_value_df <-
1:nrow(custody_name_df) %>%
future_map_dfr(function(x) {
value_exists <-
.find_text_node_safe(
node_text = node_text,
hit_words = custody_name_df$hit_words[[x]],
off_set = custody_name_df$off_set[[x]],
is_numeric_node = custody_name_df$is_numeric_node[[x]]
) %>% length() > 0
if (value_exists) {
value <-
.find_text_node_safe(
node_text = node_text,
hit_words = custody_name_df$hit_words[[x]],
off_set = custody_name_df$off_set[[x]],
is_numeric_node = custody_name_df$is_numeric_node[[x]]
) %>%
as.character() %>%
suppressWarnings()
} else {
value <- NA
}
val_df <-
tibble(nameItem =
custody_name_df$nameItem[[x]],
value = value)
return(val_df)
}) %>%
distinct() %>%
dplyr::filter(!value %>% is.na()) %>%
spread(nameItem, value) %>%
mutate(nameEntityManager = name_entity_manager) %>%
.mutate_adv_data() %>%
dplyr::select(nameEntityManager, everything()) %>%
suppressWarnings()
section_data <-
custody_value_df %>%
left_join(section_data) %>%
suppressMessages()
}
section_data <-
section_data %>%
dplyr::select(idCRD, nameEntityManager, everything())
return(section_data)
}
.get_section_10_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvControlPersonsSection.aspx?ORG_PK=142979&FLNG_PK=00AB630A0008018801764C5100236B05056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
get_node_item_df <-
function() {
node_item_df <-
c('hasControlPersonUnnamed') %>%
.get_item_name_yes_no_df()
return(node_item_df)
}
parse_section_10 <-
function(page) {
parse_section_10_nodes <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
section_data <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
suppressMessages()
column_order <-
section_data$nameItem
section_data <-
section_data %>%
spread(nameItem, valueItem) %>%
dplyr::select(one_of(column_order))
return(section_data)
}
name_entity_manager <-
page %>%
.get_entity_manager_name()
section_data <-
page %>%
parse_section_10_nodes()
section_data <-
tibble(nameEntityManager = name_entity_manager) %>%
bind_cols(list(section_data))
return(section_data)
}
section_data <-
page %>%
parse_section_10()
section_data <-
tibble(idCRD) %>%
bind_cols(section_data)
return(section_data)
}
.get_section_11_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvDisciplinarySection.aspx?ORG_PK=142979&FLNG_PK=00AB630A0008018801764C5100236B05056C8CC0',
return_wide = T) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
get_node_item_df <-
function() {
node_item_df <-
c(
"hasManagementSupervisedPersonEvent",
"hasFelonyPleaConviction",
"hasFelonyCharge",
"hasMisdemeanorPleaConviction",
"hasMisdemeanrCharge",
"hasSEC_CFTCFalseStatementOmission",
"hasSEC_CFTCStatuteViolation",
"hasSEC_CFTCAuthorizationAction",
"hasSEC_CFTCOrderAgainst",
"hasSEC_CFCPenaltyCeaseDesist",
"hasFederalStateForeignFalseStatement",
"hasFederalStateForeignInvestmentViolation",
"hasFederalStateForeignBusinessRevokeSuspended",
"hasFederalStateForeignOrderAgainst",
"hasFederalStateForeignLicenseRevoked",
"hasSelfRegulatedBodyFalseStatement",
"hasSelfRegulatedBodyRuleViolation",
"hasSelfRegulatedBodyBusinessRevokeSuspension",
"hasSelfRegulatedBodyActivityBan",
"hasAttorneyAccountantFederalContractorPriorBanRevoke",
"isSubjectToRegulatoryProceeding",
"hasDomesticForeignCourtEnjoinedInvestmentActivity",
"hasDomesticForeignCourtGuiltyStatuteViolation",
"hasDomesticForeignCourtDismissedActionSettlementPursuant",
"isDomesticForeignCourtSubjectToProceeding"
) %>%
.get_item_name_yes_no_df()
return(node_item_df)
}
parse_section <-
function(page, return_wide) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_section_nodes <-
function(page) {
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
section_data <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, nameItem, value = valueItem) %>%
suppressMessages()
if (return_wide) {
col_order <-
c('nameEntityManager', section_data$nameItem)
section_data <-
section_data %>%
spread(nameItem, value) %>%
suppressWarnings() %>%
dplyr::select(one_of(col_order))
}
return(section_data)
}
section_data <-
page %>%
parse_section_nodes()
return(section_data)
}
section_data <-
page %>%
parse_section(return_wide = return_wide) %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
return(section_data)
}
.get_section_12_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvSmallBusinessSection.aspx?ORG_PK=150510&FLNG_PK=00B175BA0008018601B35551000582C5056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
get_node_item_df <-
function() {
node_item_df <-
c(
'hasAssetsOver5mFiscalYearEnd',
'hasControlRelatedAssetManagerAssetsOver25mFiscalYearEnd',
'hasControlRelatedAssetManagerAssetsOver5mFiscalYearEnd',
'isControlledByAssetManagerAssetsOver25mFiscalYearEnd',
'isControlledByAssetManagerAssetsOver5mFiscalYearEnd'
) %>%
.get_item_name_yes_no_df()
return(node_item_df)
}
parse_section <-
function(page) {
parse_section_nodes <-
function(page) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
check_nodes <-
page %>%
html_nodes('.main td img') %>%
html_attr('alt') %>%
str_trim()
has_nodes <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
suppressMessages() %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>% nrow > 0
if (has_nodes) {
section_data <-
tibble(nodeName = check_nodes) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(get_node_item_df()) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(nameItem, valueItem) %>%
dplyr::select(nameEntityManager, everything) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::rename(value = valueItem) %>%
group_by(nameItem) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
suppressMessages() %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, nameItem, countItem, sep = '') %>%
distinct() %>%
suppressWarnings()
col_order <-
c('nameEntityManager', section_data$item)
section_data <-
section_data %>%
spread(item, value) %>%
dplyr::select(one_of(col_order))
} else {
section_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(section_data)
}
section_data <-
page %>%
parse_section_nodes() %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
return(section_data)
}
section_data <-
page %>%
parse_section() %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
return(section_data)
}
.get_schedule_a_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvScheduleASection.aspx?ORG_PK=150510&FLNG_PK=00B175BA0008018601B35551000582C5056C8CC0',
return_wide = F) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
parse_section <-
function(page) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
table_exists <-
page %>% html_nodes(css = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ScheduleAPHSection_ctl00_ownersGrid') %>%
length() > 0
if (table_exists) {
table_data <-
page %>%
html_nodes(css = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ScheduleAPHSection_ctl00_ownersGrid') %>%
html_table(fill = T, header = F) %>%
data.frame(stringsAsFactors = F) %>%
as_tibble() %>%
slice(-1) %>%
mutate(X1 = X1 %>% str_to_upper())
names(table_data) <-
c(
'nameEntityManagerOwner',
'idTypeEntityManagerOwner',
'statusEntityManagerOwner',
'monthYearEntityManagerOwnerPurchased',
'idRangeManagerEntityOwnership',
'isControlPerson',
'isPublicReportingEntity',
'idEntityManagerOwner'
)
table_data <-
table_data %>%
mutate(
idCRD,
isControlPerson = if_else(isControlPerson == "Y", TRUE, FALSE),
isPublicReportingEntity = if_else(isPublicReportingEntity == "Y", TRUE, FALSE),
dateEntityManagerOwnerPurchased = '01/' %>% paste0(monthYearEntityManagerOwnerPurchased) %>% lubridate::dmy() %>% as.Date
) %>%
left_join(.get_type_manager_entity_owner_df()) %>%
left_join(.get_range_entity_owner_df()) %>%
dplyr::select(-monthYearEntityManagerOwnerPurchased) %>%
suppressMessages()
has_individual_owners <-
table_data %>%
dplyr::filter(idTypeEntityManagerOwner == "I") %>% nrow > 0
if (has_individual_owners) {
individual_data <-
table_data %>%
dplyr::filter(idTypeEntityManagerOwner == "I")
individual_data <-
individual_data$nameEntityManagerOwner %>%
future_map_dfr(.parse_manager_owner_name) %>%
right_join(individual_data) %>%
suppressMessages() %>%
mutate(
countDash = idEntityManagerOwner %>% str_count('\\-'),
typeIDEntityManagerOwner = if_else(countDash == 0, 'idEmployee', 'idSSN')
) %>%
dplyr::select(-countDash)
}
has_entity_owners <-
table_data %>% dplyr::filter(isEntityOwnerManagerEntity == T) %>% nrow > 0
if (has_entity_owners) {
entity_df <-
table_data %>% dplyr::filter(isEntityOwnerManagerEntity == T) %>%
mutate(
countDash = idEntityManagerOwner %>% str_count('\\-'),
typeIDEntityManagerOwner = if_else(countDash == 0, 'idCRD', 'idEIN')
) %>%
dplyr::select(-countDash)
if ('individual_data' %>% exists) {
table_data <-
individual_data %>%
bind_rows(entity_df) %>%
mutate(
nameCommonEntityOwnerManager = if_else(
nameCommonEntityOwnerManager %>% is.na,
nameEntityManagerOwner,
nameCommonEntityOwnerManager
),
nameEntityManagerOwner = if_else(
nameFullEntityOwnerManager %>% is.na,
nameEntityManagerOwner,
nameFullEntityOwnerManager
)
) %>%
suppressMessages() %>%
dplyr::select(idCRD, everything())
} else {
table_data <-
entity_df %>%
mutate(
nameFullEntityOwnerManager = nameEntityManagerOwner,
nameCommonEntityOwnerManager = nameEntityManagerOwner
)
}
} else {
table_data <-
individual_data %>%
mutate(
nameCommonEntityOwnerManager = if_else(
nameCommonEntityOwnerManager %>% is.na,
nameEntityManagerOwner,
nameCommonEntityOwnerManager
),
nameEntityManagerOwner = if_else(
nameFullEntityOwnerManager %>% is.na,
nameFullEntityOwnerManager,
nameFullEntityOwnerManager
)
) %>%
dplyr::select(idCRD, everything())
}
table_data <-
table_data %>%
mutate(
nameFullEntityOwnerManager = if_else(
nameFullEntityOwnerManager %>% is.na,
nameEntityManagerOwner,
nameFullEntityOwnerManager
)
)
table_data <-
table_data %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(
nameEntityManager,
nameCommonEntityOwnerManager,
statusEntityManagerOwner,
isEntityOwnerManagerEntity,
dateEntityManagerOwnerPurchased,
rangeManagerEntityOwnership,
idEntityManagerOwner,
typeEntityManagerOwner,
isControlPerson,
idTypeEntityManagerOwner:typeIDEntityManagerOwner,
everything()
)
} else {
table_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(table_data)
}
section_data <-
page %>%
parse_section()
section_data <-
section_data %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
if (return_wide) {
section_data <-
section_data %>%
mutate_all(.funs = as.character) %>%
mutate(countItem = 1:n(),
idCRD = idCRD %>% as.numeric()) %>%
gather(item, value, -c(nameEntityManager, countItem, idCRD)) %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, item, countItem, sep = '') %>%
dplyr::filter(!value %>% is.na()) %>%
suppressWarnings()
column_order <-
c('idCRD', 'nameEntityManager', section_data$item)
section_data <-
section_data %>%
spread(item, value) %>%
dplyr::select(one_of(column_order))
section_data <-
section_data %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^amount|^count")) %>% names(),
.funs = as.numeric) %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^has|^is")) %>% names(),
.funs = as.logical) %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^date")) %>% names(),
funs(. %>% lubridate::ymd()))
}
return(section_data %>% distinct())
}
.get_schedule_b_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvScheduleBSection.aspx?ORG_PK=142979&FLNG_PK=00AB630A0008018801764C5100236B05056C8CC0',
return_wide = F) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
parse_section <-
function(page) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
table_exists <-
page %>% html_nodes(css = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ScheduleBPHSection_ctl00_ownersGrid') %>%
length() > 0
if (table_exists) {
table_data <-
page %>%
html_nodes(css = '#ctl00_ctl00_cphMainContent_cphAdvFormContent_ScheduleBPHSection_ctl00_ownersGrid') %>%
html_table(fill = T, header = F) %>%
data.frame(stringsAsFactors = F) %>%
as_tibble() %>%
slice(-1) %>%
mutate(X1 = X1 %>% str_to_upper())
names(table_data) <-
c(
'nameEntityManagerOwnerOwner',
'idTypeEntityManagerOwnerOwner',
'nameEntityManagerOwner',
'statusEntityManagerOwnerOwner',
'monthYearEntityManagerOwnerOwnerPurchased',
'idRangeManagerEntityOwnerOwnership',
'isOwnerOwnerControlPerson',
'isOwnerOwnerPublicReportingEntity',
'idEntityManagerOwnerOwner'
)
table_data <-
table_data %>%
mutate(
idCRD,
isOwnerOwnerControlPerson = if_else(isOwnerOwnerControlPerson == "Y", TRUE, FALSE),
isOwnerOwnerPublicReportingEntity = if_else(isOwnerOwnerPublicReportingEntity == "Y", TRUE, FALSE),
dateEntityManagerOwnerOwnerPurchased = '01/' %>% paste0(monthYearEntityManagerOwnerOwnerPurchased) %>% lubridate::dmy() %>% as.Date
) %>%
left_join(
.get_type_manager_entity_owner_df() %>%
dplyr::rename(
idTypeEntityManagerOwnerOwner = idTypeEntityManagerOwner,
typeEntityManagerOwnerOwner = typeEntityManagerOwner,
isEntityOwnerOwnerManagerEntity = isEntityOwnerManagerEntity
)
) %>%
left_join(
.get_range_entity_owner_df() %>% dplyr::rename(
idRangeManagerEntityOwnerOwnership = idRangeManagerEntityOwnership,
rangeManagerEntityOwnerOwnership = rangeManagerEntityOwnership
)
) %>%
dplyr::select(-monthYearEntityManagerOwnerOwnerPurchased) %>%
suppressMessages()
has_individual_data <-
table_data %>%
dplyr::filter(idTypeEntityManagerOwnerOwner == "I") %>% nrow > 0
if (has_individual_data) {
has_entity_df <-
table_data %>% dplyr::filter(isEntityOwnerOwnerManagerEntity == T) %>% nrow > 0
if (has_entity_df) {
entity_df <-
table_data %>% dplyr::filter(isEntityOwnerOwnerManagerEntity == T)
entity_df <-
entity_df %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwner = if_else(countDash == 0, 'idCRD', 'idEIN')
) %>%
dplyr::select(-countDash)
individual_data <-
table_data %>%
dplyr::filter(idTypeEntityManagerOwnerOwner == "I")
individual_data <-
individual_data$nameEntityManagerOwnerOwner %>%
future_map_dfr(.parse_manager_owner_name) %>%
dplyr::rename(
nameEntityManagerOwnerOwner = nameEntityManagerOwner,
nameCommonEntityOwnerOwnerManager = nameCommonEntityOwnerManager,
nameFullEntityManagerOwnerOwner = nameFullEntityOwnerManager,
nameFirstEntityManagerOwnerOwner = nameFirstEntityManagerOwner,
nameMiddleEntityManagerOwnerOwner = nameMiddleEntityManagerOwner,
nameLastEntityManagerOwnerOwner = nameLastEntityManagerOwner
) %>%
right_join(individual_data) %>%
suppressMessages() %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwnerOwner = if_else(countDash == 0, 'idEmployee', 'idSSN')
) %>%
dplyr::select(-countDash)
table_data <-
individual_data %>%
bind_rows(entity_df) %>%
mutate(
nameCommonEntityOwnerOwnerManager = if_else(
nameCommonEntityOwnerOwnerManager %>% is.na,
nameEntityManagerOwnerOwner,
nameCommonEntityOwnerOwnerManager
),
nameEntityManagerOwnerOwner = if_else(
nameFullEntityManagerOwnerOwner %>% is.na,
nameEntityManagerOwnerOwner,
nameFullEntityManagerOwnerOwner
)
) %>%
suppressMessages() %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwnerOwner = if_else(countDash == 0, 'idEmployee', 'idSSN')
) %>%
dplyr::select(-countDash) %>%
dplyr::select(idCRD, everything())
} else {
individual_data <-
table_data %>%
dplyr::filter(idTypeEntityManagerOwnerOwner == "I")
individual_data <-
individual_data$nameEntityManagerOwnerOwner %>%
future_map_dfr(.parse_manager_owner_name) %>%
dplyr::rename(
nameEntityManagerOwnerOwner = nameEntityManagerOwner,
nameCommonEntityOwnerOwnerManager = nameCommonEntityOwnerManager,
nameFullEntityManagerOwnerOwner = nameFullEntityOwnerManager,
nameFirstEntityManagerOwnerOwner = nameFirstEntityManagerOwner,
nameMiddleEntityManagerOwnerOwner = nameMiddleEntityManagerOwner,
nameLastEntityManagerOwnerOwner = nameLastEntityManagerOwner
) %>%
right_join(individual_data) %>%
suppressMessages() %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwnerOwner = if_else(countDash == 0, 'idEmployee', 'idSSN')
) %>%
dplyr::select(-countDash)
table_data <-
individual_data %>%
mutate(
nameCommonEntityOwnerOwnerManager = if_else(
nameCommonEntityOwnerOwnerManager %>% is.na,
nameEntityManagerOwnerOwner,
nameCommonEntityOwnerOwnerManager
),
nameEntityManagerOwnerOwner = if_else(
nameFullEntityManagerOwnerOwner %>% is.na,
nameEntityManagerOwnerOwner,
nameFullEntityManagerOwnerOwner
)
) %>%
suppressMessages() %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwnerOwner = if_else(countDash == 0, 'idEmployee', 'idSSN')
) %>%
dplyr::select(-countDash) %>%
dplyr::select(idCRD, everything())
}
} else {
table_data <-
table_data %>%
mutate(
countDash = idEntityManagerOwnerOwner %>% str_count('\\-'),
typeIDEntityManagerOwnerOwner = if_else(countDash == 0, 'idCRD', 'idEIN')
) %>%
dplyr::select(-countDash) %>%
dplyr::select(idCRD, everything())
}
if ('nameFullEntityOwnerOwnerManager' %in% names(table_data)) {
table_data <-
table_data %>%
mutate(
nameFullEntityOwnerOwnerManager = if_else(
nameFullEntityOwnerOwnerManager %>% is.na,
nameEntityManagerOwnerOwner,
nameFullEntityOwnerOwnerManager
)
)
} else {
table_data <-
table_data %>%
mutate(nameFullEntityOwnerOwnerManager = nameEntityManagerOwnerOwner)
}
if ('nameCommonEntityOwnerOwnerManager' %in% names(table_data)) {
table_data <-
table_data %>%
mutate(
nameCommonEntityOwnerOwnerManager = if_else(
nameCommonEntityOwnerOwnerManager %>% is.na,
nameEntityManagerOwner,
nameCommonEntityOwnerOwnerManager
)
)
} else {
table_data <-
table_data %>%
mutate(nameCommonEntityOwnerOwnerManager = nameEntityManagerOwnerOwner)
}
table_data <-
table_data %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(
nameEntityManager,
nameEntityManagerOwnerOwner,
statusEntityManagerOwnerOwner,
nameEntityManagerOwner,
isEntityOwnerOwnerManagerEntity,
dateEntityManagerOwnerOwnerPurchased,
rangeManagerEntityOwnerOwnership,
idEntityManagerOwnerOwner,
typeEntityManagerOwnerOwner,
isOwnerOwnerControlPerson,
everything()
)
} else {
table_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(table_data)
}
section_data <-
page %>%
parse_section()
section_data <-
section_data %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
if (return_wide) {
section_data <-
section_data %>%
mutate_all(.funs = as.character) %>%
mutate(countItem = 1:n(),
idCRD = idCRD %>% as.numeric()) %>%
gather(item, value, -c(nameEntityManager, countItem, idCRD)) %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, item, countItem, sep = '') %>%
dplyr::filter(!value %>% is.na()) %>%
suppressWarnings()
column_order <-
c('idCRD', 'nameEntityManager', section_data$item)
section_data <-
section_data %>%
spread(item, value) %>%
dplyr::select(one_of(column_order))
section_data <-
section_data %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^amount|^count")) %>% names(),
.funs = as.numeric) %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^has|^is")) %>% names(),
.funs = as.logical) %>%
mutate_at(.vars =
section_data %>% dplyr::select(dplyr::matches("^date")) %>% names(),
funs(. %>% lubridate::ymd()))
}
return(section_data %>% distinct())
}
.get_schedule_d_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvScheduleDSection.aspx?ORG_PK=284340&FLNG_PK=0573726A0008018802C736510026C985056C8CC0',
join_data = F,
return_wide = F) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>% .get_entity_manager_name()
parse_schedule_d <-
function(page, join_data, return_wide) {
table_ids <-
page %>%
html_nodes('.main div') %>%
html_attr('id')
table_ids <-
table_ids[!table_ids %>% is.na()]
table_ids <-
table_ids[table_ids %>% str_detect('PFID|pageMessages|Lookup|Header') == F]
table_node_df <-
tibble(idCSSTable = table_ids %>% paste0('#', .)) %>%
mutate(idTable = 1:n())
table_nodes <-
page %>%
html_nodes('.flatBorderTable tr td .PaperFormTableData')
all_table_node_df <-
seq_along(table_nodes) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
html_text(trim = T) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr() %>%
str_trim()
raw_nodes <-
raw_nodes[!raw_nodes == '']
if (raw_nodes %>% length() > 0) {
tibble(idTable = x, nodeText = raw_nodes)
}
}) %>%
mutate(idRow = 1:n())
parse_other_office_locations <-
function(page, return_wide) {
table_nodes <-
page %>%
html_nodes('.flatBorderTable tr td .PaperFormTableData')
end_table_df <-
seq_along(table_nodes) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
html_text(trim = T) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr()
raw_nodes <-
raw_nodes[!raw_nodes == '']
is_end_table <-
raw_nodes %>% grep("A. PRIVATE FUND|SECTION 6", .) %>%
length() > 0
tibble(
idTable = x,
isEndTable = is_end_table,
nodesText = raw_nodes
)
})
has_end_table <-
end_table_df %>% dplyr::filter(isEndTable == T) %>% nrow > 0
if (has_end_table) {
end_table_no <-
end_table_df %>%
dplyr::filter(isEndTable == T) %>%
slice(1) %>%
.$idTable - 1
} else {
end_table_no <-
table_nodes %>% length
}
location_df <-
1:(end_table_no) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
html_text(trim = T) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr()
raw_nodes <-
raw_nodes[!raw_nodes == '']
raw_nodes <-
raw_nodes[raw_nodes %>% str_detect(":[A-Z a-z 1-9]")] %>% str_trim()
has_nodes <-
raw_nodes %>% length() > 0
if (has_nodes) {
tibble(idTable = x, itemvalueNode = raw_nodes)
}
}) %>%
dplyr::filter(
!itemvalueNode %>% str_detect(
'Name of entity where books|Facis|Facsimile|NAME OF THE INDEPENDENT|SUCH NON-US FUNDS|ITEM|item|Item|SCHEDULE|NVESTMENT ACQUISITION|THE ANALYSIS AND MANAGEMEN|MAIL STOP'
)
) %>%
dplyr::filter(!itemvalueNode %>% str_detect('NOTE: |Name of the')) %>%
dplyr::filter(itemvalueNode %>% str_detect('\\:'))
name_entity_manager <-
page %>%
.get_entity_manager_name()
has_other_locations <-
location_df %>% nrow > 0
if (has_other_locations) {
if (location_df$idTable %>% unique() %>% min() > 1) {
offset_value <-
location_df$idTable %>% unique() %>% min() - 1
location_df <-
location_df %>%
mutate(idTable = idTable - offset_value)
}
all_locations <-
location_df$idTable %>% unique() %>%
future_map_dfr(function(x) {
addressOfficeSecondary <-
location_df %>%
dplyr::filter(idTable == x)
addressOfficeSecondary %>%
separate(itemvalueNode, c('itemNode', 'valueNode'), '\\:') %>%
left_join(.get_location_name_df()) %>%
dplyr::select(idTable, nameNode, valueNode) %>%
mutate(
valueNode = valueNode %>% str_trim() %>% str_to_upper(),
nameNode = nameNode %>% paste0('ManagerOfficeSecondary')
) %>%
suppressMessages()
}) %>%
dplyr::rename(countItem = idTable,
item = nameNode,
value = valueNode) %>%
dplyr::filter(!value %>% is.na()) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
arrange(countItem) %>%
spread(item, value)
if (names(all_locations) %>% str_count('^address|^city|^country|^state') %>% sum() >= 4) {
if (names(all_locations) %>% str_count(
'stateManagerOfficeSecondary|zipManagerOfficeSecondary|addressStreet1ManagerOfficeSecondary|cityManagerOfficeSecondary'
) %>% sum() == 4) {
all_locations <-
all_locations %>%
mutate(
locationSecondary = addressStreet1ManagerOfficeSecondary %>% paste0(
' ',
cityManagerOfficeSecondary,
', ',
stateManagerOfficeSecondary,
' ',
countryManagerOfficeSecondary,
' ',
zipManagerOfficeSecondary
)
)
}
}
if (return_wide) {
all_locations <-
all_locations %>%
.widen_adv_data()
}
} else {
all_locations <-
tibble(nameEntityManager = name_entity_manager)
}
return(all_locations)
}
parse_for_manager_website_data <-
function(page) {
nodes <-
page %>%
.get_html_node_text('.PrintHistRed') %>%
str_to_lower
name_entity_manager <-
page %>% .get_entity_manager_name()
if (nodes %>% grep('http', .) %>% length() > 0) {
urlManager <-
nodes[nodes %>% grep('http', .)]
url_df <-
tibble(urlManager) %>%
mutate(countItem = 1:n()) %>%
gather(item, value, -countItem) %>%
dplyr::filter(!value %>% is.na()) %>%
spread(item, value) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything())
rm(nodes)
url_df <-
url_df %>%
.widen_adv_data()
} else {
url_df <-
url_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(url_df)
}
parse_related_advisor <-
function(page, return_wide) {
name_entity_manager <-
page %>%
.get_entity_manager_name()
page_input_ids <-
page %>%
html_nodes('input') %>%
html_attr('id')
has_adviser_nodes <-
page_input_ids %>% str_count(
'ctl00_ctl00_cphMainContent_cphAdvFormContent_AffiliatedAdvisersList_rptrAfflAdvisers_'
) %>% sum() > 0
if (has_adviser_nodes) {
node_id_value <-
page_input_ids[page_input_ids %>% str_detect(
'ctl00_ctl00_cphMainContent_cphAdvFormContent_AffiliatedAdvisersList_rptrAfflAdvisers_'
)] %>%
paste0('#', ., ' + table')
related_adviser_node_df <-
tibble(cssNode = node_id_value) %>%
mutate(idTable = 1:n()) %>%
dplyr::select(idTable, everything())
related_advisor_df <-
related_adviser_node_df$idTable %>%
future_map_dfr(function(x) {
table_nodes <-
page %>%
html_nodes(css = related_adviser_node_df$cssNode[x]) %>%
html_text(trim = T) %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split(" +") %>%
flatten_chr()
nameLegalRelatedEntity <-
table_nodes[table_nodes %>% grep('Legal Name of', .) + 1] %>% str_replace_all('Related Person: ', '')
nameBusinessRelatedEntity <-
table_nodes[table_nodes %>% grep('Primary Business Name of', .) + 2]
if (table_nodes %>% grep('CRD Number', .) %>% length() > 0) {
idCRDRelatedEntity <-
table_nodes[table_nodes %>% grep('CRD Number', .)] %>% gsub('\\CRD Number', '', .) %>%
gsub('\\(|\\)', '', .) %>% gsub(' if any:', '', .)
if (idCRDRelatedEntity == '') {
idCRDRelatedEntity <-
NA
} else {
idCRDRelatedEntity <-
idCRDRelatedEntity %>% str_trim() %>% as.numeric()
}
}
data_df <-
tibble(
idTable = x,
nameBusinessRelatedEntity,
nameLegalRelatedEntity,
idCRDRelatedEntity
) %>%
mutate(
nameBusinessRelatedEntity = if_else(
nameBusinessRelatedEntity == "SAME",
nameLegalRelatedEntity,
nameBusinessRelatedEntity
),
idCRDRelatedEntity = idCRDRelatedEntity %>% as.numeric()
)
has_image_check_box <-
page %>%
html_nodes(related_adviser_node_df$cssNode[x]) %>%
html_nodes('img') %>% length() == 33
if (has_image_check_box) {
check_nodes <-
page %>%
html_nodes(related_adviser_node_df$cssNode[x]) %>%
html_nodes('img') %>%
html_attr('alt') %>%
str_trim()
node_df <-
tibble(
nameItem = c(
'isBrokerDealer',
'isOtherInvestmentAdvisor',
'isRegisteredMunicipalAdvisor',
'isRegisteredSwapDealer',
'isMajorSwapParticipant',
'isCommodityPoolOperator',
'isFutureCommissionMerchant',
'isBankThrift',
'isTrustCompany',
'isAccountingFirm',
'isLawFirm',
'isInsuranceCompany',
'isPensionConsultant',
'isRealEstateBrokerDealer',
'isNonPooledInvestmentSponsor',
'isPooledInvestmentSponsor'
),
valueItem = T
) %>%
mutate(fullnameItem = nameItem) %>%
bind_rows(
list(
c(
'isRelatedPersonControlled',
'hasRelatedPersonCommonControl',
'isRelatedPersonClientCustodian',
'isRelatedPersonNotOperationIndependent'
) %>%
.get_item_name_yes_no_df(),
'isPrivateResidence' %>%
.has_item_check_name(),
c(
'isRelatedPersonExemptInvestmentAdviser',
'isRelatedPersonForeignEntityRegistered',
'hasRelatedPersonSharedSupervisedPerson',
'hasRelatedPersonSharedAddress'
) %>%
.get_item_name_yes_no_df()
)
)
box_df <-
tibble(nodeName = check_nodes) %>%
mutate(idTable = x) %>%
left_join(.get_check_box_value_df()) %>%
bind_cols(node_df) %>%
dplyr::filter(isNodeChecked == T) %>%
dplyr::select(idTable, nameItem, valueItem) %>%
suppressMessages() %>%
spread(nameItem, valueItem)
data_df <-
data_df %>%
left_join(box_df) %>%
suppressMessages()
}
return(data_df)
}) %>%
dplyr::rename(countItem = idTable) %>%
gather(item, value, -countItem) %>%
dplyr::filter(!value %>% is.na()) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
arrange(countItem, item) %>%
spread(item, value) %>%
dplyr::select(
c(
nameEntityManager,
countItem,
nameBusinessRelatedEntity,
nameLegalRelatedEntity,
everything()
)
)
if (return_wide) {
related_advisor_df <-
related_advisor_df %>%
.widen_adv_data() %>%
.mutate_adv_data()
}
} else {
related_advisor_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(related_advisor_df)
}
parse_record_locations <-
function(page, all_table_node_df, return_wide) {
has_book_keeping_data <-
all_table_node_df %>%
dplyr::filter(nodeText %>% str_detect('Briefly describe the books')) %>% nrow > 0
if (has_book_keeping_data) {
node_locations <-
all_table_node_df$nodeText %>% grep('Briefly describe the books', .)
record_df <-
c(-1, 5, 7, 10, 12, 14, 16, 18, 19, 20, 21) %>%
future_map_dfr(function(x) {
has_value <-
all_table_node_df$nodeText[node_locations - x] %>% length() >
0
if (has_value) {
val <- all_table_node_df$nodeText[node_locations - x]
} else {
val <- NA
}
tibble(value = val,
idNode = x)
}) %>%
dplyr::filter(!value %>% is.na()) %>%
dplyr::filter(
!value %>% str_detect(
'CANCELLED CHECKS ARE HELD BY THE BANK AND KEPT OFFSITE|Number and Street 2:|City:|Country|Number and Street 1'
)
)
record_df <-
record_df %>%
mutate(
value = value %>% str_replace_all('Name of entity where books and records are kept:', '')
) %>%
left_join(tibble(
idNode = c(-1, 5, 7, 10, 12, 14, 16, 18, 19, 20, 21) ,
nameItem = c(
'descriptionRecordsKept',
'faxRecordKeeper',
'phoneRecordKeeper',
'zipRecordKeeper',
'countryRecordKeeper',
'stateRecordKeeper',
'cityRecordKeeper',
'addressStreet2RecordKeeper',
'addressStreet1RecordKeeper',
'addressStreet1RecordKeeper',
'nameEntityRecordKeeper'
)
)) %>%
group_by(nameItem) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
suppressMessages() %>%
dplyr::select(-idNode) %>%
dplyr::select(countItem, nameItem, everything()) %>%
arrange(countItem, nameItem) %>%
mutate(value = value %>% str_to_upper) %>%
spread(nameItem, value) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, countItem, everything())
if ('addressStreet2RecordKeeper' %in% names(record_df)) {
record_df <-
record_df %>%
replace_na(list(addressStreet2RecordKeeper = '')) %>%
unite(
addressStreet1RecordKeeper,
addressStreet1RecordKeeper,
addressStreet2RecordKeeper,
sep = ' '
)
}
if (names(record_df) %>% str_count('^addressStreet1|^city|^state|^country') %>% sum() >= 4) {
record_df <-
record_df %>%
mutate(
locationRecordKeeper = addressStreet1RecordKeeper %>% paste0(
' ',
cityRecordKeeper,
', ',
stateRecordKeeper,
' ',
countryRecordKeeper,
' ',
zipRecordKeeper
)
)
}
if (return_wide) {
record_df <-
record_df %>%
.widen_adv_data()
}
if (record_df %>% ncol > 100 |
record_df %>% nrow > 8) {
record_df <-
tibble(nameEntityManager = name_entity_manager)
}
} else {
record_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(record_df)
}
parse_control_person_data <-
function(page, return_wide) {
nodes <-
page %>%
.get_html_node_text('.PrintHistRed')
name_entity_manager <-
page %>% .get_entity_manager_name()
if (nodes[nodes %>% str_detect('INDIRECTLY CONTROLS')] %>% length() > 0) {
node_locations <-
nodes %>% grep('INDIRECTLY CONTROLS', .)
control_person_df <-
8:0 %>%
future_map_dfr(function(x) {
tibble(value = nodes[node_locations - x],
idNode = x)
}) %>%
left_join(tibble(
idNode = 8:0,
nameItem = c(
'nameControlPerson',
'idCRDControlPerson',
'dateEffectiveControlPerson',
'addressStreet1ControlPerson',
'cityControlPerson',
'stateControlPerson',
'countryControlPerson',
'zipControlPerson',
'descriptionControlPerson'
)
)) %>%
group_by(nameItem) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
suppressMessages() %>%
dplyr::select(-idNode)
control_person_df <-
control_person_df %>%
spread(nameItem, value) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything())
control_person_df <-
control_person_df %>%
mutate_at(.vars =
control_person_df %>% dplyr::select(dplyr::matches("^date")) %>% names(),
funs(. %>% lubridate::mdy())) %>%
mutate_at(.vars =
control_person_df %>% dplyr::select(dplyr::matches(
"^country[A-Z]|^name|^state|^city"
)) %>% names(),
.funs = str_to_upper) %>%
dplyr::select(
nameEntityManager,
nameControlPerson,
descriptionControlPerson,
everything()
)
if (names(control_person_df) %>% str_count('^address|^city|^country|^state') %>% sum() >= 4) {
control_person_df <-
control_person_df %>%
mutate(
locationControlPerson = addressStreet1ControlPerson %>% paste0(
' ',
cityControlPerson,
', ',
stateControlPerson,
' ',
countryControlPerson,
' ',
zipControlPerson
)
) %>%
dplyr::select(nameEntityManager:countItem,
locationControlPerson,
everything())
}
if ('nameControlPerson' %in% names(control_person_df)) {
name_df <-
control_person_df$nameControlPerson %>%
future_map_dfr(.parse_manager_owner_name)
names(name_df) <-
names(name_df) %>%
str_replace_all('EntityManagerOwner|EntityOwnerManager',
'ControlPerson')
control_person_df <-
control_person_df %>%
left_join(name_df) %>%
suppressMessages() %>%
dplyr::select(
nameEntityManager,
nameCommonControlPerson,
descriptionControlPerson,
everything()
)
}
if (return_wide) {
control_person_df <-
control_person_df %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, nameItem, countItem, sep = '') %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything())
col_order <-
c('nameEntityManager', control_person_df$item)
control_person_df <-
control_person_df %>%
spread(item, value) %>%
dplyr::select(one_of(col_order))
control_person_df <-
control_person_df %>%
mutate_at(
.vars =
control_person_df %>% dplyr::select(dplyr::matches("^idCRD")) %>% names(),
.funs = as.numeric
) %>%
mutate_at(.vars =
control_person_df %>% dplyr::select(dplyr::matches("^date")) %>% names(),
funs(. %>% lubridate::mdy()))
}
} else {
control_person_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(control_person_df)
}
parse_public_control_persons <-
function(page, all_table_node_df) {
has_public_control_data <-
all_table_node_df %>% dplyr::filter(nodeText == 'Full legal name of the public reporting company:') %>% nrow > 0
if (has_public_control_data) {
namePublicCompanyLegal <-
all_table_node_df %>%
dplyr::filter(
idRow == all_table_node_df %>% dplyr::filter(
nodeText == 'Full legal name of the public reporting company:'
) %>% .$idRow + 1
) %>% .$nodeText
idCIKPublicCompany <-
all_table_node_df %>%
dplyr::filter(
idRow == all_table_node_df %>% dplyr::filter(
nodeText %>% str_detect("The public reporting company's CIK number")
) %>% .$idRow + 1
) %>% .$nodeText %>%
as.numeric()
public_control_df <-
tibble(nameEntityManager = name_entity_manager) %>%
mutate(idCIKPublicCompany, namePublicCompanyLegal)
} else {
public_control_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(public_control_df)
}
parse_other_disclosures <-
function(page, return_wide) {
has_other_text_node <-
!page %>%
html_nodes(
'#ctl00_ctl00_cphMainContent_cphAdvFormContent_SchedDMisc_ctl00_trIAPDHeader + tr span'
) %>%
html_text() %>% str_trim() == ''
if (has_other_text_node) {
node_text <-
page %>%
html_nodes(
'#ctl00_ctl00_cphMainContent_cphAdvFormContent_SchedDMisc_ctl00_trIAPDHeader + tr span'
) %>%
html_text()
node_text <-
node_text %>%
str_split('\n|\r') %>%
flatten_chr() %>%
str_to_upper() %>%
str_trim()
node_text <-
node_text[!node_text == '']
other_node_text_df <-
tibble(
nameEntityManager = name_entity_manager,
item = 'descriptionOtherDisclosures',
value = node_text
) %>%
mutate(countItem = 1:n()) %>%
spread(item, value)
if (return_wide) {
other_node_text_df <-
other_node_text_df %>%
.widen_adv_data()
}
} else {
other_node_text_df <-
tibble(nameEntityManager = name_entity_manager)
}
return(other_node_text_df)
}
parse_other_office_locations_safe <-
possibly(parse_other_office_locations, NULL)
parse_record_locations_safe <-
possibly(parse_record_locations, NULL)
other_office_location_df <-
page %>%
parse_other_office_locations_safe(return_wide = return_wide)
record_location_df <-
page %>%
parse_record_locations_safe(all_table_node_df = all_table_node_df,
return_wide = return_wide)
parse_related_advisor_safe <-
possibly(parse_related_advisor, NULL)
related_adviser_df <-
page %>%
parse_related_advisor_safe(return_wide = F)
parse_control_person_data_safe <-
possibly(parse_control_person_data, NULL)
control_person_df <-
page %>%
parse_control_person_data_safe(return_wide = return_wide)
parse_for_manager_website_data_safe <-
possibly(parse_for_manager_website_data, NULL)
website_df <-
page %>%
parse_for_manager_website_data_safe()
parse_public_control_persons_safe <-
possibly(parse_public_control_persons, NULL)
public_control_df <-
page %>%
parse_public_control_persons_safe(all_table_node_df = all_table_node_df)
parse_other_disclosures_safe <-
possibly(parse_other_disclosures, NULL)
other_data_df <-
page %>%
parse_other_disclosures_safe(return_wide = return_wide)
if (join_data) {
section_data <-
other_office_location_df %>%
left_join(related_adviser_df) %>%
left_join(record_location_df) %>%
left_join(website_df) %>%
left_join(control_person_df) %>%
left_join(public_control_df) %>%
left_join(other_data_df) %>%
suppressMessages()
} else {
section_data <-
tibble(
nameTable = c(
'Other Office Locations',
'Record Locations',
'Related Advisers',
'Website',
'Control Persons',
'Control Public Entities',
'Other Disclosures'
),
dataTable = list(
other_office_location_df,
record_location_df,
related_adviser_df,
website_df,
control_person_df,
public_control_df,
other_data_df
)
) %>%
mutate(nameEntityManager = name_entity_manager,
countColumns = dataTable %>% map_dbl(ncol))
}
return(section_data)
}
section_data <-
page %>%
parse_schedule_d(join_data = join_data, return_wide = return_wide)
section_data <-
section_data %>%
mutate(idCRD) %>%
dplyr::select(idCRD, everything())
if (!'nameEntityManager' %in% names(section_data)) {
section_data <-
section_data %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(idCRD, nameEntityManager, everything())
}
if ((!return_wide) & join_data == T) {
section_data <-
section_data %>%
gather(item, value, -c(nameEntityManager, idCRD)) %>%
mutate(
countItem = item %>% as.character() %>% readr::parse_number(),
countItem = if_else(countItem %>% is.na, 0, countItem),
countItem = countItem + 1,
item = item %>% str_replace_all('[1-9]', '')
) %>%
arrange(countItem) %>%
suppressWarnings() %>%
suppressMessages()
}
section_data <-
section_data %>%
dplyr::select(idCRD, nameEntityManager, everything())
return(section_data)
}
.get_manager_signatory_data <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvSignatureSection.aspx?ORG_PK=160489&FLNG_PK=02FF0ECC00080185033F257005F016CD056C8CC0') {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
parse_signature_page <-
function(page) {
is_old <-
(page %>%
.get_html_node_text('.PrintHistRed') %>% is.na == T) %>% as.numeric() %>% sum() >= 1
if (is_old) {
nodes <-
page %>%
.get_html_node_text('font') %>%
unique() %>%
.[1:3]
} else {
nodes <-
page %>%
.get_html_node_text('.PrintHistRed') %>%
str_replace_all('&', 'AND') %>%
unique()
}
if (nodes %>% length() == 3) {
item_names <-
c('nameEntityManagerSignatory',
'dateADVFiling',
'titleSignatory')
signature_df <-
tibble(item = item_names,
value = nodes) %>%
spread(item, value) %>%
mutate(dateADVFiling = dateADVFiling %>% lubridate::mdy())
}
if (nodes %>% length() == 2) {
item_names <-
c('nameEntityManagerSignatory',
'dateADVFiling')
signature_df <-
tibble(item = item_names,
value = nodes) %>%
spread(item, value)
}
rm(nodes)
rm(page)
return(signature_df)
}
signature_df <-
page %>%
parse_signature_page() %>%
mutate(idCRD, nameEntityManager = name_entity_manager) %>%
dplyr::select(idCRD, nameEntityManager, everything())
return(signature_df)
}
.get_section_drp <-
function(url = 'http://www.adviserinfo.sec.gov/IAPD/content/viewform/adv/Sections/iapd_AdvDrpSection.aspx?ORG_PK=103863&FLNG_PK=05CBF6920008018902B1D9910035D515056C8CC0',
return_wide = F) {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
clean_adv_drp_data <-
function(data, widen_data = F) {
has_data <-
data %>% ncol > 1
if (has_data) {
data <-
data %>%
dplyr::select(-countItem) %>%
mutate_at(.vars =
data %>% dplyr::select(dplyr::matches("^date")) %>% names(),
.funs = as.character) %>%
gather(itemName, value, -c(nameEntityManager, typeCharges)) %>%
unite(item, itemName, typeCharges, sep = '') %>%
dplyr::filter(!value %>% is.na()) %>%
distinct() %>%
group_by(item) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
spread(item, value) %>%
.mutate_adv_data()
if (widen_data) {
data <-
data %>%
.widen_adv_data
}
}
return(data)
}
generate_table_name_df <-
function(item_name = 'dateCharged',
hit_word = 'Charged',
offset = 1,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA) {
name_df <-
tibble(
nameItem = item_name,
hit_word,
offset,
is_numeric,
is_date,
is_employment_firm,
replace_words,
filter_words
)
return(name_df)
}
parse_drp_table_data <-
function(all_table_node_df,
item_name = 'descriptionPersonEntityViolator',
hit_word = 'Other Product Types:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = '^7.') {
row_locs <-
all_table_node_df %>%
dplyr::filter(nodeText %>% str_detect(hit_word)) %>%
.$idRow + offset
values <-
all_table_node_df %>% slice(row_locs) %>% .$nodeText
if (!filter_words %>% is.na()) {
has_values <-
values[!values %>% str_detect(filter_words)] %>% length() > 0
} else {
has_values <-
values %>% length() > 0
}
if (has_values) {
if (!filter_words %>% is.na()) {
values <-
values[!values %>% str_detect(filter_words)]
}
} else {
values <-
NA
}
if (is_date) {
values <-
seq_along(values) %>%
future_map(function(x) {
date_values <-
values[[x]] %>%
str_split('\\: ') %>%
flatten_chr()
val_length <-
date_values %>% length()
if (val_length == 1) {
date_value <-
date_values[[1]]
}
if (val_length == 2) {
date_value <-
date_values %>%
.[[2]]
}
if (!val_length %in% c(1, 2)) {
date_value <-
NA
}
return(date_value)
}) %>%
flatten_chr() %>%
lubridate::mdy() %>%
as.character() %>%
suppressWarnings()
}
if (is_employment_firm) {
values <-
seq_along(values) %>%
map_chr(function(x) {
emp_length <-
values[x] %>%
str_split('\\:') %>%
flatten_chr()
if (emp_length %>% length() == 1) {
emp_val <-
emp_length %>%
.[[1]]
}
if (emp_length %>% length() == 2) {
emp_val <-
emp_length %>%
.[[2]]
}
if (!emp_length %>% length() %in% c(1, 2)) {
emp_val <-
NA
}
return(emp_val)
})
}
if (!replace_words %>% is.na()) {
values <-
values %>%
str_replace_all(replace_words, '')
}
if (is_numeric) {
values <-
values %>%
str_replace_all('\\$', '') %>%
as.character() %>%
readr::parse_number() %>%
suppressWarnings()
}
values <-
values[!values == '']
values <-
values %>% gsub('N/A', NA, .)
values <-
values[!values %>% is.na]
has_values <-
values %>% length() > 0
if (has_values) {
table_df <-
tibble(value = values) %>%
dplyr::filter(!value %>% is.na()) %>%
mutate(countItem = 1:n(),
nameItem = item_name) %>%
dplyr::select(countItem, nameItem, value)
if (!filter_words %>% is.na()) {
table_df <-
table_df %>%
dplyr::filter(!value %>% str_detect(filter_words))
}
table_df <-
table_df %>%
group_by(nameItem, value) %>%
mutate(
lagValue = countItem - dplyr::lag(countItem),
lagValue = ifelse(lagValue %>% is.na, 0, lagValue)
) %>%
ungroup() %>%
dplyr::filter(!lagValue == 1) %>%
mutate(countItem = 1:n()) %>%
dplyr::select(-lagValue)
} else {
table_df <-
tibble(nameItem = item_name)
}
return(table_df)
}
parse_drp_table_data_safe <-
possibly(parse_drp_table_data, NULL)
parse_criminal_data <-
function(page) {
has_no_filings <-
page %>%
html_nodes('#aspnetForm a[name*="Criminal"] + table') %>%
html_text() %>% str_trim() %>% nchar() < 130
if (!has_no_filings) {
table_nodes <-
page %>%
html_nodes('#aspnetForm a[name*="Criminal"] + .flatBorderTable') %>%
html_text()
all_table_node_df <-
seq_along(table_nodes) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr() %>%
str_trim()
raw_nodes <-
raw_nodes[!raw_nodes == '']
if (raw_nodes %>% length() > 0) {
tibble(idTable = x, nodeText = raw_nodes)
}
}) %>%
dplyr::select(nodeText) %>%
mutate(idRow = 1:n())
drp_name_df <-
generate_table_name_df(
item_name = 'idCRDViolator',
hit_word = 'CRD Number:',
offset = 1,
is_numeric = T,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
) %>%
bind_rows(
list(
generate_table_name_df(
item_name = 'namePersonEntityViolator',
hit_word = 'Name:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionPersonEntityViolator',
hit_word = 'relatedbusiness and your o',
offset = 2,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionLocationCharges',
hit_word = 'Formal Charge(s) were brought in:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'dateCharged',
hit_word = 'Charged',
offset = 1,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'dateEventStatus',
hit_word = 'Event Status Date',
offset = 1,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionDispositionDisclosure',
hit_word = 'Disposition Disclosure',
offset = 2,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionViolationCircumstances',
hit_word = 'Disposition Disclosure',
offset = 2,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
)
)
)
all_drp_data <-
1:nrow(drp_name_df) %>%
future_map_dfr(function(x) {
parse_drp_table_data_safe(
all_table_node_df = all_table_node_df,
item_name = drp_name_df$nameItem[x],
hit_word = drp_name_df$hit_word[x],
replace_words = drp_name_df$replace_words[x],
filter_words = drp_name_df$filter_words[x],
offset = drp_name_df$offset[x],
is_numeric = drp_name_df$is_numeric[x],
is_date = drp_name_df$is_date[x],
is_employment_firm = drp_name_df$is_employment_firm[x]
)
}) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
distinct() %>%
arrange(countItem) %>%
dplyr::filter(!countItem %>% is.na()) %>%
spread(nameItem, value) %>%
.mutate_adv_data() %>%
dplyr::select(c(
nameEntityManager,
countItem,
dplyr::matches('namePerson'),
everything()
)) %>%
mutate(typeCharges = 'Criminal') %>%
suppressWarnings()
} else{
all_drp_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(all_drp_data)
}
parse_regulary_action_data <-
function(page) {
has_no_filings <-
page %>%
html_nodes('#aspnetForm a[name*="Regulatory"] + table') %>%
html_text() %>% str_trim() %>% nchar() < 200
if (!has_no_filings) {
table_nodes <-
page %>%
html_nodes('#aspnetForm a[name*="Regulatory"] + .flatBorderTable') %>%
html_text()
all_table_node_df <-
seq_along(table_nodes) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr() %>%
str_trim()
raw_nodes <-
raw_nodes[!raw_nodes == '']
if (raw_nodes %>% length() > 0) {
tibble(idTable = x, nodeText = raw_nodes)
}
}) %>%
dplyr::select(nodeText) %>%
mutate(idRow = 1:n())
drp_name_df <-
generate_table_name_df(
item_name = 'idCRDViolator',
hit_word = 'CRD Number:',
offset = 1,
is_numeric = T,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
) %>%
bind_rows(
list(
generate_table_name_df(
item_name = 'namePersonEntityViolator',
hit_word = 'Name:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'entityEnforcmentAgencyViolation',
hit_word = 'foreign financial regulatory authority,',
offset = 3,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'sanctionPrincpalViolation',
hit_word = 'Principal Sanction:',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Principal Sanction:',
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionSanctions',
hit_word = 'Other Sanctions:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('^3.')
),
generate_table_name_df(
item_name = 'dateInitiated',
hit_word = 'Date Initiated',
offset = 0,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'idDocket',
hit_word = 'Docket/Case Number',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Docket/Case Number:',
filter_words = NA
),
generate_table_name_df(
item_name = 'nameEmployingFirmViolationTime',
hit_word = 'Employing Firm when activity ',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = T,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'typeProductViolation',
hit_word = 'Principal Product Type:',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Principal Product Type:',
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionProductViolation',
hit_word = 'Other Product Types:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('^7.')
),
generate_table_name_df(
item_name = 'descriptionAllegationViolation',
hit_word = 'Describe the allegations',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('^8.')
),
generate_table_name_df(
item_name = 'dateResolution',
hit_word = 'Resolution Date',
offset = 1,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'amountFineViolation',
hit_word = 'Monetary/Fine Amount:',
offset = 1,
is_numeric = T,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'detailOrderViolation',
hit_word = 'Ordered:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionSatisfactionConditions',
hit_word = 'waived:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionViolationCircumstances',
hit_word = 'Provide a brief summary of details related to the action status:',
offset = 2,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
)
)
)
all_drp_data <-
1:nrow(drp_name_df) %>%
future_map_dfr(function(x) {
parse_drp_table_data_safe(
all_table_node_df = all_table_node_df,
item_name = drp_name_df$nameItem[x],
hit_word = drp_name_df$hit_word[x],
replace_words = drp_name_df$replace_words[x],
filter_words = drp_name_df$filter_words[x],
offset = drp_name_df$offset[x],
is_numeric = drp_name_df$is_numeric[x],
is_date = drp_name_df$is_date[x],
is_employment_firm = drp_name_df$is_employment_firm[x]
)
}) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
distinct() %>%
dplyr::filter(!countItem %>% is.na()) %>%
arrange(countItem) %>%
spread(nameItem, value) %>%
.mutate_adv_data() %>%
dplyr::select(c(nameEntityManager,
countItem,
everything())) %>%
mutate(typeCharges = 'Regulatory') %>%
suppressWarnings()
} else{
all_drp_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(all_drp_data)
}
parse_civil_action_data <-
function(page) {
has_no_filings <-
page %>%
html_nodes('#aspnetForm a[name*="Civil"] + table') %>%
html_text() %>% str_trim() %>% nchar() < 200
if (!has_no_filings) {
table_nodes <-
page %>%
html_nodes('#aspnetForm a[name*="Civil"] + .flatBorderTable') %>%
html_text()
all_table_node_df <-
seq_along(table_nodes) %>%
future_map_dfr(function(x) {
raw_nodes <-
table_nodes[[x]] %>%
str_replace_all('\r|\n|\t', '') %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
gsub("^\\s+|\\s+$", "", .) %>%
str_split('\\ ') %>%
flatten_chr() %>%
str_trim()
raw_nodes <-
raw_nodes[!raw_nodes == '']
if (raw_nodes %>% length() > 0) {
tibble(idTable = x, nodeText = raw_nodes)
}
}) %>%
dplyr::select(nodeText) %>%
mutate(idRow = 1:n())
drp_name_df <-
generate_table_name_df(
item_name = 'idCRDViolator',
hit_word = 'CRD Number:',
offset = 1,
is_numeric = T,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
) %>%
bind_rows(
list(
generate_table_name_df(
item_name = 'namePersonEntityViolator',
hit_word = 'Name:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'entityEnforcmentAgencyViolation',
hit_word = 'foreign financial regulatory authority,',
offset = 3,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'sanctionPrincpalViolation',
hit_word = 'Principal Sanction:',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Principal Sanction:',
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionSanctions',
hit_word = 'Other Sanctions:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('^3.')
),
generate_table_name_df(
item_name = 'dateInitiated',
hit_word = 'Date Initiated',
offset = 0,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'idDocket',
hit_word = 'Docket/Case Number',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Docket/Case Number:',
filter_words = NA
),
generate_table_name_df(
item_name = 'nameEmployingFirmViolationTime',
hit_word = 'Employing Firm when activity ',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = T,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'typeProductViolation',
hit_word = 'Principal Product Type',
offset = 0,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = 'Principal Product Type:',
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionProductViolation',
hit_word = 'Other Product Types',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('^7.')
),
generate_table_name_df(
item_name = 'descriptionAllegationViolation',
hit_word = 'Describe the allegations',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = c('\\^8.')
),
generate_table_name_df(
item_name = 'dateResolution',
hit_word = 'Resolution Date',
offset = 1,
is_numeric = F,
is_date = T,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'amountFineViolation',
hit_word = 'Monetary/Fine Amount:',
offset = 1,
is_numeric = T,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'detailOrderViolation',
hit_word = 'Ordered:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionSatisfactionConditions',
hit_word = 'waived:',
offset = 1,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
),
generate_table_name_df(
item_name = 'descriptionViolationCircumstances',
hit_word = 'Provide a brief summary of details related to the action status:',
offset = 2,
is_numeric = F,
is_date = F,
is_employment_firm = F,
replace_words = NA,
filter_words = NA
)
)
)
all_drp_data <-
1:nrow(drp_name_df) %>%
future_map_dfr(function(x) {
parse_drp_table_data_safe(
all_table_node_df = all_table_node_df,
item_name = drp_name_df$nameItem[x],
hit_word = drp_name_df$hit_word[x],
replace_words = drp_name_df$replace_words[x],
filter_words = drp_name_df$filter_words[x],
offset = drp_name_df$offset[x],
is_numeric = drp_name_df$is_numeric[x],
is_date = drp_name_df$is_date[x],
is_employment_firm = drp_name_df$is_employment_firm[x]
)
}) %>%
mutate(nameEntityManager = name_entity_manager) %>%
dplyr::select(nameEntityManager, everything()) %>%
distinct()
all_drp_data <-
all_drp_data %>%
dplyr::filter(!countItem %>% is.na()) %>%
arrange(countItem) %>%
spread(nameItem, value) %>%
.mutate_adv_data() %>%
dplyr::select(c(nameEntityManager,
countItem,
everything())) %>%
mutate(typeCharges = 'Civil') %>%
suppressWarnings()
} else{
all_drp_data <-
tibble(nameEntityManager = name_entity_manager)
}
return(all_drp_data)
}
criminal_df <-
page %>%
parse_criminal_data() %>%
clean_adv_drp_data(widen_data = return_wide)
regulatory_df <-
page %>%
parse_regulary_action_data() %>%
clean_adv_drp_data(widen_data = return_wide)
civil_df <-
page %>%
parse_civil_action_data() %>%
clean_adv_drp_data(widen_data = return_wide)
if (return_wide) {
all_drp_data <-
criminal_df %>% dplyr::select(-dplyr::matches("^countItem")) %>%
left_join(regulatory_df %>%
dplyr::select(-dplyr::matches("^countItem"))) %>%
left_join(civil_df %>%
dplyr::select(-dplyr::matches("^countItem"))) %>%
suppressMessages()
if ('nameEntityManager' %in% names(all_drp_data)) {
all_drp_data <-
all_drp_data %>%
mutate(nameEntityManager = name_entity_manager)
}
all_drp_data <-
all_drp_data %>%
distinct() %>%
mutate(idCRD) %>%
dplyr::select(idCRD, nameEntityManager, everything())
} else {
all_drp_data <-
tibble(
nameTable = c('Criminal DRP', 'Regulatory CRD', 'Civil CRD'),
dataTable = list(criminal_df, regulatory_df, civil_df)
) %>%
mutate(idCRD)
if (!'nameEntityManager' %in% names(all_drp_data)) {
all_drp_data <-
all_drp_data %>%
mutate(nameEntityManager = name_entity_manager)
}
all_drp_data <-
all_drp_data %>%
dplyr::select(idCRD, nameEntityManager, everything())
all_drp_data <-
all_drp_data %>%
mutate(countColumns = dataTable %>% map_dbl(ncol)) %>%
dplyr::select(-countColumns)
}
return(all_drp_data)
}
.get_crd_sections_data <-
function(id_crd = 159127,
all_sections = TRUE,
score_threshold = .2,
section_names = c(
"Registration",
"Identifying Information",
"Organization",
"Successions",
"Private Fund Reporting",
"Direct Manager Owners",
"Indirect Manager Owners",
"Other Manager Information",
"Manager Signatories"
),
flatten_tables = TRUE) {
data <-
.get_managers_adv_sitemap_adv(idCRDs = id_crd, score_threshold = score_threshold) %>%
distinct() %>%
dplyr::filter(!idSection %>% str_detect('section12SmallBusiness')) %>%
suppressWarnings() %>%
suppressMessages()
section_null <-
section_names %>% purrr::is_null()
if (all_sections) {
section_names <-
data$nameSectionActual
}
if (section_null &
!all_sections) {
stop("You must select a section, possibilties for this search are:\n" %>%
paste0(paste0(
data$nameSectionActual, collapse = '\n'
)))
}
get_adv_sections <-
function(data, all_sections) {
.get_manager_sec_page_safe <-
possibly(.get_manager_sec_page, tibble())
.get_section_drp_safe <-
possibly(.get_section_drp, tibble())
.get_section_1_data_safe <-
possibly(.get_section_1_data, tibble())
.get_section_2_data_safe <-
possibly(.get_section_2_data, tibble())
.get_section_3_data_safe <-
possibly(.get_section_3_data, tibble())
.get_section_4_data_safe <-
possibly(.get_section_4_data, tibble())
.get_section_5_data_safe <-
possibly(.get_section_5_data, tibble())
.get_section_6_data_safe <-
possibly(.get_section_6_data, tibble())
.get_section_7a_data_safe <-
possibly(.get_section_7a_data, tibble())
.get_section_7b_data_safe <-
possibly(.get_section_7b_data, tibble())
.get_section_8_data_safe <-
possibly(.get_section_8_data, tibble())
.get_section_9_data_safe <-
possibly(.get_section_9_data, tibble())
.get_section_10_data_safe <-
possibly(.get_section_10_data, tibble())
.get_section_11_data_safe <-
possibly(.get_section_11_data, tibble())
.get_section_12_data_safe <-
possibly(.get_section_12_data, tibble())
.get_schedule_a_data_safe <-
possibly(.get_schedule_a_data, tibble())
.get_schedule_b_data_safe <-
possibly(.get_schedule_b_data, tibble())
.get_schedule_d_data_safe <-
possibly(.get_schedule_d_data, tibble())
.get_manager_signatory_data_safe <-
possibly(.get_manager_signatory_data, tibble())
if (!all_sections) {
no_rows <-
data %>%
dplyr::filter(nameSectionActual %in% section_names) %>% nrow == 0
if (no_rows) {
stop(
"Sorry tables can only be:\n",
data$nameSectionActual %>% paste0(collapse = '\n')
)
}
data <-
data %>%
dplyr::filter(nameSectionActual %in% section_names)
}
all_data <-
1:nrow(data) %>%
map_dfr(function(x) {
f <-
data$nameFunction[[x]] %>% lazyeval::as_name() %>% eval()
url <-
data$urlADVSection[[x]]
df_name <-
data$nameData[[x]] %>%
str_remove_all("data_") %>%
str_to_title() %>%
paste0('data', .)
nameADVPage <-
data$nameSectionActual[[x]]
nameADVPage %>% message()
paste0('idCRD: ', id_crd, ' - ', nameADVPage) %>% cat(fill = T)
g <-
f %>%
list(quote(url)) %>%
as.call()
assign(x = df_name, eval(g))
data <-
tibble(nameADVPage = nameADVPage,
dataTable = list(df_name %>% as_name() %>% eval()))
return(data)
})
return(all_data)
}
get_adv_sections_safe <-
possibly(get_adv_sections, tibble())
all_data <-
get_adv_sections_safe(data = data, all_sections = all_sections) %>%
suppressWarnings()
all_data <-
all_data %>%
mutate(isNULL = dataTable %>% map_dbl(length) == 0) %>%
dplyr::filter(!isNULL) %>%
dplyr::select(-isNULL) %>%
mutate(countCols = dataTable %>% map_dbl(ncol),
countRows = dataTable %>% map_dbl(nrow))
name_entity_manager <-
all_data %>%
unnest() %>%
.$nameEntityManager %>%
unique() %>%
.[[1]] %>%
suppressWarnings()
name_entity_manager <-
name_entity_manager[!name_entity_manager %>% is.na]
not_wide_tables <-
c('Indirect Manager Owners')
all_data <-
all_data %>%
mutate(idCRD = id_crd,
nameEntityManager = name_entity_manager) %>%
dplyr::select(idCRD, nameEntityManager, nameADVPage, everything()) %>%
dplyr::filter(countCols > 2) %>%
dplyr::filter(countRows > 0) %>%
mutate(isDataWide = if_else(countRows == 1, T, F)) %>%
mutate(isDataWide = if_else(nameADVPage %in% not_wide_tables, F, isDataWide))
if ('Advisory Business Information' %in% all_data$nameADVPage) {
has_aum_total <-
all_data %>%
dplyr::filter(nameADVPage == 'Advisory Business Information') %>%
dplyr::select(dataTable) %>%
unnest() %>%
dplyr::select(dplyr::matches("amountAUMTotal")) %>% ncol() == 1
if (has_aum_total) {
total_aum <-
all_data %>%
dplyr::filter(nameADVPage == 'Advisory Business Information') %>%
dplyr::select(dataTable) %>%
unnest() %>%
.$amountAUMTotal %>%
formattable::currency(digits = 0)
"Parsed " %>%
paste0(name_entity_manager,
'\nThey have ',
total_aum,
' in Total Assets Under Management') %>% cat(fill = T)
}
} else {
"Parsed " %>%
paste0(name_entity_manager) %>%
cat(fill = T)
}
for (x in 1:nrow(all_data)) {
all_data$dataTable[[x]]$nameEntityManager <-
name_entity_manager
if (all_data$nameADVPage[x] %in% c('Other Manager Information', 'DRPs')) {
count_rows <-
all_data$dataTable[[x]]$dataTable %>% length
for (count in 1:(count_rows)) {
all_data$dataTable[[x]]$dataTable[[count]]$nameEntityManager <-
name_entity_manager
}
}
}
if (flatten_tables) {
get_all_manager_description_data <-
function(all_data) {
select_nesting_vars <-
function(data) {
data <-
data %>%
dplyr::select(nameADVPage, dataTable)
return(data)
}
widen_data <-
all_data %>%
dplyr::filter(isDataWide)
widened_data <-
1:nrow(widen_data) %>%
map_dfr(function(x) {
data <-
widen_data %>%
select_nesting_vars() %>%
dplyr::select(-nameADVPage) %>%
slice(x) %>%
unnest() %>%
mutate_all(as.character) %>%
gather(nameItem, value, -c(idCRD, nameEntityManager))
return(data)
}) %>%
distinct() %>%
group_by(nameItem) %>%
mutate(countItem = 1:n()) %>%
ungroup() %>%
suppressWarnings() %>%
dplyr::filter(!value %>% is.na()) %>%
mutate(idCRD = idCRD %>% as.numeric()) %>%
mutate(
countItem = countItem - 1,
countItem = countItem %>% as.character(),
countItem = ifelse(countItem == "0", '', countItem)
) %>%
unite(item, nameItem, countItem, sep = '') %>%
distinct() %>%
suppressWarnings() %>%
mutate(value = value %>% str_trim())
col_order <-
c('idCRD', 'nameEntityManager', widened_data$item)
data <-
widened_data %>%
spread(item, value) %>%
dplyr::select(one_of(col_order)) %>%
.mutate_adv_data()
return(data)
}
manager_description_data <-
all_data %>%
get_all_manager_description_data() %>%
mutate_if(is.character, str_trim)
all_data <-
tibble(
idCRD = id_crd,
nameEntityManager = name_entity_manager,
nameTable = 'Manager Description',
dataTable = list(manager_description_data)
) %>%
bind_rows(
all_data %>% dplyr::filter(isDataWide == F) %>%
dplyr::select(idCRD, nameEntityManager,
nameTable = nameADVPage,
dataTable)
) %>%
dplyr::select(idCRD, nameEntityManager, nameTable, dataTable)
}
gc()
return(all_data)
}
.get_search_crd_ids <-
function(entity_names = c('EJF Capital', '137 Ventures'),
crd_ids = NULL) {
if (entity_names %>% purrr::is_null() & (crd_ids %>% purrr::is_null())) {
stop("Please enter search names or CRD IDs")
}
crd_df <-
tibble(idCRD = NA)
if (!entity_names %>% purrr::is_null()) {
finra_entities_safe <-
purrr::possibly(finra_entities, tibble())
search_name_df <-
entity_names %>%
future_map_dfr(function(x) {
finra_entities(
entity_names = x,
return_message = FALSE,
ocr_pdf = FALSE
)
})
id_crds <-
search_name_df %>%
.$idCRD %>%
as.character() %>%
readr::parse_number()
crd_df <-
crd_df %>%
bind_rows(tibble(idCRD = id_crds))
}
if (!crd_ids %>% purrr::is_null()) {
crd_df <-
crd_df %>%
bind_rows(tibble(idCRD = crd_ids))
}
crds <-
crd_df %>%
dplyr::filter(!idCRD %>% is.na()) %>%
distinct() %>%
.$idCRD
return(crds)
}
.return_selected_adv_tables <-
function(data,
all_sections,
table_names,
gather_data) {
table_names <-
data$nameTable %>% unique()
return_selected_adv_table <-
function(data,
table_name,
gather_data) {
table_names <-
c('Manager Description', data$nameTable %>% unique())
if (table_name %>% str_count(table_names) %>% sum() == 0) {
stop("Names can only be\n" %>% paste0(paste0(table_names, collapse = '\n')))
}
data_selected <-
data %>%
dplyr::filter(nameTable %>% str_detect(table_name)) %>%
dplyr::select(idCRD, nameTable, dataTable) %>%
mutate(idRow = 1:n())
crd_df <-
data_selected %>%
dplyr::select(idRow, idCRD)
has_nested_list <-
data_selected %>%
dplyr::select(dataTable) %>%
unnest() %>%
future_map(class) %>%
as_tibble() %>%
gather(column, valueCol) %>%
.$valueCol %>% str_count('list') %>% sum() >= 1
if (!has_nested_list) {
data_selected <-
data_selected %>%
mutate(countColumns = dataTable %>% map_dbl(ncol)) %>%
dplyr::filter(countColumns > 1) %>%
dplyr::select(-countColumns) %>%
dplyr::filter(nameTable == table_name) %>%
dplyr::select(dataTable) %>%
unnest()
section_df <-
.get_sec_sitemap_df() %>%
dplyr::select(nameSectionActual, idSection) %>%
bind_rows(
tibble(nameSectionActual = 'Manager Description',
idSection = 'managerDescription')
)
df_name <-
tibble(nameSectionActual = table_name) %>%
left_join(section_df) %>%
suppressMessages() %>%
.$idSection
if (gather_data) {
data_selected <-
data_selected %>%
dplyr::select(-dplyr::matches('^count[I]|^numberFund')) %>%
mutate_at(
.vars =
data_selected %>% dplyr::select(-idCRD) %>% dplyr::select(-dplyr::matches('^count[I]|^numberFund')) %>% names(),
.funs = as.character
) %>%
gather(nameItem, valueItem, -c(idCRD, nameEntityManager)) %>%
arrange(idCRD) %>%
mutate(nameItem = nameItem %>% str_replace_all('[0-9]', '')) %>%
mutate(
nameTable = table_names[x],
nameItem = nameItem %>% str_replace_all('[0-9]', '')
) %>%
group_by(idCRD, nameEntityManager, nameItem) %>%
mutate(countItemManager = 1:n()) %>%
ungroup() %>%
dplyr::select(nameTable,
idCRD:nameItem,
countItemManager,
valueItem) %>%
suppressWarnings() %>%
arrange(idCRD, nameItem, countItemManager)
}
assign(x = df_name, eval(data_selected %>% .mutate_adv_data()), envir = .GlobalEnv)
section_df %>%
dplyr::filter(nameSectionActual == table_name) %>%
.$idSection %>% cat(fill = T)
}
if (has_nested_list) {
has_no_count_col <-
data_selected %>%
unnest() %>% names() %>% str_count('countColumns') %>% sum() == 0
if (has_no_count_col) {
count_values <-
data_selected %>%
.$dataTable %>% map_dbl(ncol)
data_selected <-
data_selected %>%
mutate(countColumns = count_values)
}
data_selected <-
data_selected %>%
dplyr::select(idRow, dataTable) %>%
unnest() %>%
dplyr::select(idRow, nameTable, dataTable) %>%
mutate(countColumn = map_dbl(dataTable, ncol)) %>%
dplyr::filter(countColumn > 1) %>%
dplyr::select(-countColumn)
if (data_selected %>% nrow >= 1) {
table_names <-
data_selected$nameTable %>%
str_replace_all(' ', '') %>%
paste0('manager', .) %>%
unique()
table_name_df <-
tibble(idTable = table_names,
nameTable = data_selected$nameTable %>% unique())
data_selected <-
data_selected %>%
dplyr::select(idRow, dataTable, nameTable) %>%
left_join(crd_df) %>%
left_join(table_name_df) %>%
suppressMessages() %>%
dplyr::select(idCRD, idTable, nameTable, dataTable)
seq_along(table_names) %>%
future_map(function(x) {
table_names[x] %>% message
table_data <-
data_selected %>%
dplyr::filter(idTable == table_names[x]) %>%
dplyr::select(idCRD, dataTable) %>%
unnest
table_data <-
table_data %>%
mutate_at(.vars = table_data %>% dplyr::select(dplyr::matches("^amount[A-Z]")) %>% names(),
funs(. %>% currency(digits = 0))) %>%
mutate_at(.vars = table_data %>% dplyr::select(dplyr::matches("^pct[A-Z]")) %>% names(),
funs(. %>% percent(digits = 0)))
if (gather_data) {
table_data <-
table_data %>%
dplyr::select(-dplyr::matches('^count[I]|^numberFund')) %>%
mutate_at(
.vars =
table_data %>% dplyr::select(-idCRD) %>% dplyr::select(-dplyr::matches('^count[I]|^numberFund')) %>% names(),
.funs = as.character
) %>%
gather(nameItem,
valueItem,
-c(idCRD, nameEntityManager)) %>%
arrange(idCRD) %>%
dplyr::filter(!valueItem %>% is.na()) %>%
mutate(
nameTable = table_names[x],
nameItem = nameItem %>% str_replace_all('[0-9]', '')
) %>%
group_by(idCRD, nameEntityManager, nameItem) %>%
mutate(countItemManager = 1:n()) %>%
ungroup() %>%
dplyr::select(nameTable,
idCRD:nameItem,
countItemManager,
valueItem) %>%
suppressWarnings() %>%
arrange(idCRD, nameItem, countItemManager)
}
df_name <-
table_names[x]
assign(x = df_name, eval(table_data %>% .mutate_adv_data()), envir = .GlobalEnv)
})
}
}
invisible()
}
return_selected_adv_table_safe <-
possibly(return_selected_adv_table, NULL)
table_names %>%
walk(function(x) {
return_selected_adv_table_safe(data = data,
table_name = x,
gather_data = gather_data)
})
invisible()
}
#' IAPD registered managers ADV data
#'
#' This function parses the specified sections of the
#' \href{https://www.sec.gov/answers/formadv.htm}{Form ADV} for specified managers.
#'
#' @param entity_names vector entities you want to search
#' @param crd_ids vector of CRDs to search
#' @param all_sections include all sections \code{TRUE}(default)
#' searches all sections
#' @param section_names sections to search \itemize{
#' \item \code{Registration}: registration information
#' \item \code{Identifying Information}: indentifying information
#' \item \code{Organization}: organizational structure information
#' \item \code{Successions}: succession information
#' \item \code{Private Fund Reporting}: private funds controlled by the manager
#' \item \code{Direct Manager Owners}: direct owners of the fund manager
#' \item \code{Indirect Manager Owners}: indirect owners of the fund manager
#' \item \code{Other Manager Information}: other information
#' \item \code{Manager Signatories}: manager signatories
#' #' }
#' @param flatten_tables \code{TRUE} flattens data with multiple values into wide form
#' @param gather_data \code{TRUE} returns a long data frame
#' @param assign_to_environment \code{TRUE} assign individual data frames to your environment
#'
#' @return a \code{tibble}
#' @export
#' @import tidyverse curl dplyr formattable httr lubridate magrittr purrr readr lazyeval rvest stringi stringr tibble tidyr xml2
#' @importFrom lazyeval as_name
#' @importFrom curl curl_download
#' @importFrom magrittr %>%
#' @importFrom lubridate mdy
#' @importFrom lubridate ymd
#' @family IAPD
#' @family ADV
#' @family entity search
#' @family fund data
#' @examples
#' adv_managers_filings(entity_names = c('Blackstone Real Estate'), crd_ids = NULL,
#' all_sections = TRUE, section_names = NULL,
#' flatten_tables = TRUE, gather_data = FALSE,
#' assign_to_environment = TRUE)
adv_managers_filings <-
function(entity_names = NULL,
crd_ids = NULL,
all_sections = TRUE,
section_names = c(
"Registration",
"Identifying Information",
"Organization",
"Successions",
"Private Fund Reporting",
"Direct Manager Owners",
"Indirect Manager Owners",
"Other Manager Information",
"Manager Signatories"
),
flatten_tables = TRUE,
gather_data = FALSE,
assign_to_environment = TRUE) {
if (section_names %>% length() == 1) {
flatten_tables <-
FALSE
all_sections <-
FALSE
}
nothing_entered <-
(crd_ids %>% purrr::is_null()) & (entity_names %>% purrr::is_null())
if (nothing_entered) {
stop("Please enter a CRD ID or a search name")
}
.get_search_crd_ids_safe <-
possibly(.get_search_crd_ids, tibble())
crds <-
.get_search_crd_ids(entity_names = entity_names,
crd_ids = crd_ids)
.get_crd_sections_data_safe <-
possibly(.get_crd_sections_data, tibble())
all_data <-
seq_along(crds) %>%
map_dfr(function(x) {
.get_crd_sections_data(
id_crd = crds[x],
all_sections = all_sections,
section_names = section_names,
flatten_tables = flatten_tables
)
})
.return_selected_adv_tables_safe <-
possibly(.return_selected_adv_tables, tibble())
only_1 <-
section_names %>% length() == 1 & assign_to_environment
assign_all <-
section_names %>% length() > 1 & assign_to_environment
if (assign_all) {
all_data %>%
.return_selected_adv_tables(all_sections = TRUE,
gather_data = gather_data)
}
if (only_1) {
table_name <-
list('manager',
all_data$nameADVPage %>% str_replace_all('\\ ', '')) %>%
purrr::reduce(paste0) %>%
unique()
data <-
all_data %>%
select(dataTable) %>%
unnest()
data <-
data %>%
mutate_if(is_character,
funs(ifelse(. == "N/A", NA, .))) %>%
mutate_if(is_character,
funs(ifelse(. == "", NA, .)))
data <-
data %>%
mutate_at(data %>% select(dplyr::matches("^idCRD|idCIK")) %>% names(),
funs(. %>% as.numeric())) %>%
mutate_at(
data %>% select(
dplyr::matches(
"^name[A-Z]|^details[A-Z]|^description[A-Z]|^city[A-Z]|^state[A-Z]|^country[A-Z]|^count[A-Z]|^street[A-Z]|^address[A-Z]"
)
) %>% select(-dplyr::matches("nameElement")) %>% names(),
funs(. %>% str_to_upper())
) %>%
mutate_at(
data %>% select(dplyr::matches("^amount")) %>% names(),
funs(. %>% as.numeric() %>% formattable::currency(digits = 0))
) %>%
mutate_at(data %>% select(dplyr::matches("^is|^has")) %>% names(),
funs(. %>% as.logical())) %>%
mutate_at(
data %>% select(dplyr::matches("latitude|longitude")) %>% names(),
funs(. %>% as.numeric() %>% formattable::digits(digits = 5))
) %>%
mutate_at(
data %>% select(dplyr::matches("^price[A-Z]|pershare")) %>% select(-dplyr::matches("priceNotation")) %>% names(),
funs(. %>% formattable::currency(digits = 3))
) %>%
mutate_at(
data %>% select(dplyr::matches(
"^count[A-Z]|^number[A-Z]|^year[A-Z]"
)) %>% select(-dplyr::matches("country|county")) %>% names(),
funs(. %>% formattable::comma(digits = 0))
) %>%
mutate_at(data %>% select(
dplyr::matches(
"codeInterestAccrualMethod|codeOriginalInterestRateType|codeLienPositionSecuritization|codePaymentType|codePaymentFrequency|codeServicingAdvanceMethod|codePropertyStatus"
)
) %>% names(),
funs(. %>% as.integer())) %>%
mutate_at(data %>% select(
dplyr::matches("^ratio|^multiple|^priceNotation|^value")
) %>% names(),
funs(. %>% formattable::comma(digits = 3))) %>%
mutate_at(data %>% select(dplyr::matches("^pct|^percent")) %>% names(),
funs(. %>% formattable::percent(digits = 3))) %>%
mutate_at(
data %>% select(dplyr::matches("^amountFact")) %>% names(),
funs(. %>% as.numeric() %>% formattable::currency(digits = 3))
) %>%
suppressWarnings()
assign(x = table_name, eval(data %>% .mutate_adv_data()), envir = .GlobalEnv)
}
# gc()
return(all_data)
}
# pdf ---------------------------------------------------------------------
.parse_manager_brochure_data <-
function(url = 'https://adviserinfo.sec.gov/IAPD/IAPDFirmSummary.aspx?ORG_PK=156663') {
idCRD <-
url %>%
.get_pk_url_crd()
page <-
url %>%
.get_html_page()
name_entity_manager <-
page %>%
.get_entity_manager_name()
brochure_exists <-
page %>%
.check_html_node(node_css = '#ctl00_cphMain_landing_p2BrochureLink')
.parse_sec_manager_pdf_url <-
function(page) {
brochure_url <-
page %>%
html_nodes('a[href*="Part2Brochures.aspx"]') %>%
html_attr('href') %>%
unique() %>%
paste0('https://www.adviserinfo.sec.gov', .)
brochure_page <-
brochure_url %>%
.get_html_page()
has_brochure <-
brochure_page %>%
html_nodes('.main td a') %>%
html_attr('href') %>% length() > 0
if (has_brochure) {
url_brochure_pdf <-
brochure_page %>%
html_nodes('.main td a') %>%
html_attr('href') %>%
paste0('https://www.adviserinfo.sec.gov', .)
} else {
url_brochure_pdf <-
NA
}
return(url_brochure_pdf)
}
pdf_urls <-
page %>% .parse_sec_manager_pdf_url()
no_brochure <-
pdf_urls %>% is.na() %>% as.numeric %>% sum() >= 1
if (no_brochure) {
brochure_exists <-
F
}
parse_pdf_brochure <-
function(url = 'https://adviserinfo.sec.gov/IAPD/Content/Common/crd_iapd_Brochure.aspx?BRCHR_VRSN_ID=379240') {
info <-
url %>%
pdf_info()
info$created <-
info$created %>% as.character()
info$modified <-
info$modified %>% as.character()
info <-
info %>%
flatten_df()
sec_name_df <-
tibble(
nameSEC = c(
"version",
"pages",
"encrypted",
"linearized",
"Author",
"Creator",
"Producer",
"created",
"modified",
"metadata",
"locked",
"attachments",
"layout",
'Title'
),
nameActual = c(
"idVersion",
"countPages",
"isEncrypted",
"isLinearized",
"nameAuthor",
"nameCreator",
"nameProducer",
"datetimeCreated",
"datetimeModified",
"detailsMetadata",
"isLocked",
"hasAttachments",
"detailLayout",
'titleDocument'
)
)
sec_names <-
names(info)
actual_name_df <-
seq_along(sec_names) %>%
future_map_dfr(function(x) {
name_exists <-
sec_name_df %>%
dplyr::filter(nameSEC == sec_names[x]) %>% nrow > 0
if (name_exists) {
nameActual <-
sec_name_df %>%
dplyr::filter(nameSEC == sec_names[x]) %>%
.$nameActual
} else {
nameActual <-
NA
}
tibble(idColumn = x, nameActual)
})
columns_selected <-
actual_name_df %>%
dplyr::filter(!nameActual %>% is.na()) %>%
.$idColumn
info <-
info %>%
dplyr::select(columns_selected)
actual_names <-
actual_name_df %>%
dplyr::filter(!nameActual %>% is.na()) %>%
.$nameActual
names(info) <-
actual_names
info <-
info %>%
dplyr::select(
dplyr::matches(
"countPages|idVersion|dateTime|titleDocument|nameAuthor|nameProducer"
)
) %>%
mutate(urlPDFManagerADVBrochure = url)
pdf_pages <-
url %>%
pdf_text() %>%
str_split('\n')
pdf_text_df <-
seq_along(pdf_pages) %>%
future_map_dfr(function(x) {
page_text <-
pdf_pages[[x]] %>%
str_trim()
page_text <-
page_text %>% str_replace_all('Form ADV Part 2A: |Form ADV Part 2A Brochure|Part 2A of ADV: ',
'') %>%
str_replace_all('Firm Brochure', '') %>%
str_trim()
page_text <-
page_text[!page_text == '']
remove_last_line <-
!page_text[page_text %>% length() %>% max] %>%
as.character() %>%
readr::parse_number() %>% is.na %>% suppressWarnings()
if (remove_last_line) {
page_text <-
page_text[1:(page_text %>% length() - 1)]
}
clean_text <-
function (text.var) {
text.var <-
gsub("\\s+",
" ",
gsub("\\\\r|\\\\n|\\n|\\\\t", " ", text.var))
return(text.var)
}
page_text <-
page_text %>%
stri_replace_all_charclass("\\p{WHITE_SPACE}", " ") %>%
stri_trim_both() %>%
clean_text
page_text <-
page_text %>% paste0(collapse = ' ') %>%
clean_text
page_text <-
page_text %>%
stringi::stri_trans_general("latin-ascii")
page_data <-
tibble(numberPage = x, textPage = page_text)
return(page_data)
}) %>%
dplyr::select(textPage) %>%
summarise(textBrochure = textPage %>% paste0(collapse = '\n')) %>%
mutate(urlPDFManagerADVBrochure = url) %>%
right_join(info) %>%
suppressMessages() %>%
dplyr::select(datetimeCreated,
textBrochure,
everything())
return(pdf_text_df)
}
if (brochure_exists) {
urlPDFManagerADVBrochure <-
page %>%
.parse_sec_manager_pdf_url()
brochure_data <-
urlPDFManagerADVBrochure %>%
future_map_dfr(function(x) {
parse_pdf_brochure(url = x)
}) %>%
mutate(nameEntityManager = name_entity_manager, idCRD)
brochure_data <-
brochure_data %>%
mutate_at(.vars = brochure_data %>% dplyr::select(dplyr::matches("datetime")) %>% names(),
.funs = ymd_hms) %>%
arrange(desc(datetimeCreated))
} else {
brochure_data <-
tibble(nameEntityManager = name_entity_manager, idCRD)
}
brochure_data <-
brochure_data %>%
.select_start_vars()
return(brochure_data)
}
.get_manager_brochure_data <-
function(id_crd = 156663,
split_pages = TRUE) {
url <-
.get_managers_adv_sitemap_adv(idCRDs = id_crd) %>%
distinct() %>%
dplyr::filter(nameSectionActual == 'Registration') %>%
.$urlADVSection %>%
suppressWarnings() %>%
suppressMessages() %>%
str_replace('http', 'https')
pdf_data <-
url %>%
.parse_manager_brochure_data()
if ('textBrochure' %in% names(pdf_data)) {
if (split_pages) {
pdf_data <-
pdf_data %>%
separate_rows(textBrochure, sep = '\n')
}
}
pdf_data
}
#' OCR IAPD Managers Brochures
#'
#' This function OCRs a firm's brochure
#' required for Part 2A of the Form-ADV
#'
#' @param entity_names vector names of the companies you want to search
#' @param crd_ids numeric vector CRDs you want to search
#' @param nest_data \code{TRUE} return nested data frame
#' @param split_pages \code{TRUE} split brochure into individual pages
#'
#' @return
#' @export
#' @import curl dplyr formattable httr lubridate magrittr purrr readr lazyeval rvest stringi stringr tibble pdftools tidyr jsonlite
#' @examples
#' adv_managers_brochures(entity_names = c('137 Ventures', 'Divco'), crd_ids = 156663, split_pages = TRUE, nest_data = TRUE)
adv_managers_brochures <-
function(entity_names = NULL,
crd_ids = NULL,
split_pages = TRUE,
nest_data = FALSE) {
nothing_entered <-
(crd_ids %>% purrr::is_null()) & (entity_names %>% purrr::is_null())
if (nothing_entered) {
stop("Please enter a CRD ID or a search name")
}
.get_search_crd_ids_safe <-
possibly(.get_search_crd_ids, tibble())
crds <-
.get_search_crd_ids_safe(entity_names = entity_names, crd_ids = crd_ids)
.get_manager_brochure_data_safe <-
possibly(.get_manager_brochure_data, tibble())
all_data <-
crds %>%
future_map_dfr(function(crd) {
manager_pdf <-
.get_manager_brochure_data_safe(id_crd = as.numeric(crd), split_pages = split_pages)
if (nrow(manager_pdf) >= 1) {
manager <- manager_pdf$nameEntityManager %>% unique()
date_file <- manager_pdf$datetimeCreated %>% unique()
glue::glue("Acquired ADV brochure for {manager} filed on {date_file}") %>%
cat(fill = T)
}
manager_pdf
})
if (nest_data) {
all_data <-
all_data %>%
nest(-c(
idCRD,
nameEntityManager,
titleDocument,
datetimeCreated,
countPages
),
.key = dataBrochure)
}
all_data
}
# Registered_Advisor_Data Downloands -------------------------------------------------
#' Get ADV summary filing urls for all available periods
#'
#'#' @return
#' @export
#' @import rvest stringr dplyr tibble httr stringr lubridate purrr
#' @examples
adv_period_urls <-
function() {
httr::set_config(httr::config(ssl_verifypeer = 0L))
page <-
'https://www.sec.gov/help/foiadocsinvafoiahtm.html' %>%
httr::GET() %>%
httr::content()
url_nodes <-
page %>%
html_nodes('.views-field-field-display-title a')
titles <-
url_nodes %>% html_text()
urls <- url_nodes %>%
html_attr('href') %>%
str_c("https://www.sec.gov", .)
data <-
tibble(nameTitle = titles, urlZip = urls) %>%
separate(nameTitle,
extra = "merge",
fill = "right",
into = c("typeReport", "periodReport"),
sep = '\\, ') %>%
mutate(
typeReport = typeReport %>% str_to_upper(),
isExempt = typeReport %>% str_detect("EXEMPT")
) %>%
mutate(baseNameURL = urlZip %>% basename())
bases <-
urls %>% basename()
df_dates <-
data %>%
select(periodReport) %>%
mutate(dateData = glue::glue("01 {periodReport}") %>% lubridate::dmy())
data <-
data %>%
left_join(df_dates, by = "periodReport") %>%
mutate(dateData = case_when(
is.na(dateData) ~ substr(baseNameURL, 3, 8) %>% lubridate::mdy(),
TRUE ~ dateData
))
data <-
data %>%
mutate(
quarterData = lubridate::quarter(dateData),
yearData = lubridate::year(dateData),
periodData = dateData %>% format("%Y-%m")
) %>%
distinct()
gc()
data
}
.assign_sec_names <-
function(data) {
df_actual_names <-
dictionary_sec_names()
actual_names <-
names(data) %>%
map_chr(function(name) {
no_name <-
df_actual_names %>%
filter(nameSEC == name) %>%
nrow() == 0
if (no_name) {
glue::glue("Missing {name} in dictionary") %>% cat(fill = T)
return(name)
}
df_actual_names %>%
filter(nameSEC == name) %>%
pull(nameActual) %>%
unique() %>%
.[[1]]
})
actual_names
}
#' SEC Name Dictionary
#'
#' @return
#' @export
#'
#' @examples
dictionary_sec_names <-
function() {
sec_name_df <-
tibble(
nameSEC = c("SEC Region", "Organization CRD#", "SEC#", "Firm Type", "Primary Business Name",
"Legal Name", "Main Office Street Address 1", "Main Office Street Address 2",
"Main Office City", "Main Office State", "Main Office Country",
"Main Office Postal Code", "Main Office Telephone Number", "Main Office Facsimile Number",
"Mail Office Street Address 1", "Mail Office Street Address 2",
"Mail Office City", "Mail Office State", "Mail Office Country",
"Mail Office Postal Code", "SEC Current Status", "SEC Status Effective Date",
"Jurisdiction Notice Filed-Effective Date", "Latest ADV Filing Date",
"Form Version", "1I", "Website Address", "1M", "1N", "CIK#",
"1O", "1P", "2A(1)", "2A(2)", "2A(3)", "2A(4)", "2A(5)", "2A(6)",
"2A(7)", "2A(8)", "2A(9)", "2A(10)", "2A(11)", "2A(12)", "2A(13)",
"3A", "3A-Other", "3B", "3C-State", "3C-Country", "5A", "5B(1)",
"5B(2)", "5B(3)", "5B(4)", "5B(5)", "5B(6)", "5C(1)", "5C(1)-If more than 100, how many",
"5C(2)", "5D(1)(a)", "5D(1)(b)", "5D(1)(c)", "5D(1)(d)", "5D(1)(e)",
"5D(1)(f)", "5D(1)(g)", "5D(1)(h)", "5D(1)(i)", "5D(1)(j)", "5D(1)(k)",
"5D(1)(l)", "5D(1)(m)", "5D(1)(m)-Other", "5D(2)(a)", "5D(2)(b)",
"5D(2)(c)", "5D(2)(d)", "5D(2)(e)", "5D(2)(f)", "5D(2)(g)", "5D(2)(h)",
"5D(2)(i)", "5D(2)(j)", "5D(2)(k)", "5D(2)(l)", "5D(2)(m)", "5D(2)(m)-Other",
"5E(1)", "5E(2)", "5E(3)", "5E(4)", "5E(5)", "5E(6)", "5E(7)",
"5E(7)-Other", "5F(1)", "5F(2)(a)", "5F(2)(b)", "5F(2)(c)", "5F(2)(d)",
"5F(2)(e)", "5F(2)(f)", "5G(1)", "5G(2)", "5G(3)", "5G(4)", "5G(5)",
"5G(6)", "5G(7)", "5G(8)", "5G(9)", "5G(10)", "5G(11)", "5G(12)",
"5G(12)-Other", "5H", "5H-If more than 500, how many", "5I(1)",
"5I(2)", "5J", "6A(1)", "6A(2)", "6A(3)", "6A(4)", "6A(5)", "6A(6)",
"6A(7)", "6A(8)", "6A(9)", "6A(10)", "6A(11)", "6A(12)", "6A(13)",
"6A(14)", "6A(14)-Other", "6B(1)", "6B(2)", "6B(3)", "7A(1)",
"7A(2)", "7A(3)", "7A(4)", "7A(5)", "7A(6)", "7A(7)", "7A(8)",
"7A(9)", "7A(10)", "7A(11)", "7A(12)", "7A(13)", "7A(14)", "7A(15)",
"7A(16)", "7B", "8A(1)", "8A(2)", "8A(3)", "8B(1)", "8B(2)",
"8B(3)", "8C(1)", "8C(2)", "8C(3)", "8C(4)", "8D", "8E", "8F",
"8G(1)", "8G(2)", "8H", "8I", "9A(1)(a)", "9A(1)(b)", "9A(2)(a)",
"9A(2)(b)", "9B(1)(a)", "9B(1)(b)", "9B(2)(a)", "9B(2)(b)", "9C(1)",
"9C(2)", "9C(3)", "9C(4)", "9D(1)", "9D(2)", "9E", "9F", "10A",
"11", "11A(1)", "11A(2)", "11B(1)", "11B(2)", "11C(1)", "11C(2)",
"11C(3)", "11C(4)", "11C(5)", "11D(1)", "11D(2)", "11D(3)", "11D(4)",
"11D(5)", "11E(1)", "11E(2)", "11E(3)", "11E(4)", "11F", "11G",
"11H(1)(a)", "11H(1)(b)", "11H(1)(c)", "11H(2)", "SEC Region Name",
"Organization CRD #", "SEC #", "Main Street Address 1", "Main Street Address 2",
"Main Office City, State, Postal Code", "Mail Office City, State, Postal Code",
"Contact Name", "Telephone Number", "Legal Status", "Types of Advisory Activities",
"Regulator Status", "Effective Date", "Criminal Disclosures",
"Regulatory Action Disclosures", "Civil Judicial Disclosures",
"Bankruptcy Disclosures", "Judgment/Lien Disclosures", "Bond Payout Disclosures",
"SEC Registration Status", "Status Effective Date", "World Wide Web Site Address",
"1L", "5A-If more than 1,000, how many", "5B(1)-If more than 1,000, how many",
"5B(2)-If more than 1,000, how many", "5B(3)-If more than 1,000, how many",
"5C", "5C-If more than 500, how many", "5D(1)", "5D(2)", "5D(3)",
"5D(4)", "5D(5)", "5D(6)", "5D(7)", "5D(8)", "5D(9)", "5D(10)",
"5D(10)-Other", "5G(10)-Other", "6A(7)-Other", "9A(1)", "9A(2)",
"9B(1)", "9B(2)", "9C", "10", "Current Status", "FINRA BD Status",
"2B(1)", "2B(2)", "2B(3)", "Total number of offices, other than your Principal Office and place of business",
"1O - If yes, approx. amount of assets", "5D(a)(1)", "5D(a)(2)",
"5D(a)(3)", "5D(b)(1)", "5D(b)(2)", "5D(b)(3)", "5D(c)(1)", "5D(c)(2)",
"5D(c)(3)", "5D(d)(1)", "5D(d)(3)", "5D(e)(1)", "5D(e)(3)", "5D(f)(1)",
"5D(f)(3)", "5D(g)(1)", "5D(g)(2)", "5D(g)(3)", "5D(h)(1)", "5D(h)(2)",
"5D(h)(3)", "5D(i)(1)", "5D(i)(2)", "5D(i)(3)", "5D(j)(1)", "5D(j)(2)",
"5D(j)(3)", "5D(k)(1)", "5D(k)(2)", "5D(k)(3)", "5D(l)(1)", "5D(l)(2)",
"5D(l)(3)", "5D(m)(1)", "5D(m)(2)", "5D(m)(3)", "5D(n)(1)", "5D(n)(2)",
"5D(n)(3)", "5D(n)(3) - Other", "5F(3)", "5I(2)(a)", "5I(2)(b)",
"5I(2)(c)", "5J(1)", "5J(2)", "8H(1)", "8H(2)",
"7A8(a) BD Qual Cust", "Additional CRD Number", "Additional Regulatory Contact Name",
"Additional Regulatory Contact Person City", "Additional Regulatory Contact Person Country",
"Additional Regulatory Contact Person E-mail", "Additional Regulatory Contact Person Facsimile",
"Additional Regulatory Contact Person Postal Code", "Additional Regulatory Contact Person State",
"Additional Regulatory Contact Person Street Address 1", "Additional Regulatory Contact Person Street Address 2",
"Additional Regulatory Contact Person Telephone", "Additional Regulatory Contact Person Titles",
"Amount ERA US Private Fund Assets", "Any Hedge Funds", "Any Liquidity Funds",
"Any Other Funds", "Any PE Funds", "Any PFs a Master", "Any Real Estate Funds",
"Any Securitized Funds", "Any VC Funds", "Chief Compliance Officer City",
"Chief Compliance Officer Country", "Chief Compliance Officer E-mail",
"Chief Compliance Officer Facsimile", "Chief Compliance Officer Name",
"Chief Compliance Officer Other Titles", "Chief Compliance Officer Postal Code",
"Chief Compliance Officer State", "Chief Compliance Officer Street Address 1",
"Chief Compliance Officer Street Address 2", "Chief Compliance Officer Telephone",
"Control/Controlled by Related Person", "Count of 11A(1) disclosures",
"Count of 11A(2) disclosures", "Count of 11B(1) disclosures",
"Count of 11B(2) disclosures", "Count of 11C(1) disclosures",
"Count of 11C(2) disclosures", "Count of 11C(3) disclosures",
"Count of 11C(4) disclosures", "Count of 11C(5) disclosures",
"Count of 11D(1) disclosures", "Count of 11D(2) disclosures",
"Count of 11D(3) disclosures", "Count of 11D(4) disclosures",
"Count of 11D(5) disclosures", "Count of 11E(1) disclosures",
"Count of 11E(2) disclosures", "Count of 11E(3) disclosures",
"Count of 11E(4) disclosures", "Count of 11F disclosures", "Count of 11G disclosures",
"Count of 11H(1)(a) disclosures", "Count of 11H(1)(b) disclosures",
"Count of 11H(1)(c) disclosures", "Count of 11H(2) disclosures",
"Count of BD Affiliates", "Count of Control person Public Reporting Company",
"Count of IA Affiliates", "Count of IA/BD Affiliates", "Count of Private Funds - 7B(1)",
"Count of Private Funds - 7B(2)", "EIN of the Other person compensating CCO",
"First ADV Filing Date", "Location of Books and Records City",
"Location of Books and Records Country", "Location of Books and Records Postal Code",
"Location of Books and Records State", "Location of Books and Records Street Address 1",
"Location of Books and Records Street Address 2", "Mail Office Private Residence Flag",
"Main Office Private Residence Flag", "Name of the other person compensating CCO",
"Share Location", "Share Supervised Persons", "Sole Proprietor City",
"Sole Proprietor Country", "Sole Proprietor Postal Code", "Sole Proprietor State",
"Sole Proprietor Street Address 1", "Sole Proprietor Street Address 2",
"Total Gross Assets of Private Funds", "Total number of additional CRD numbers",
"Total Number of Books and Records Locations", "Total number of CIK numbers",
"Total number of Hedge funds", "Total number of Liquidity funds",
"Total number of Other funds", "Total number of PE funds", "Total number of Real Estate funds",
"Total number of Securitized funds", "Total number of VC funds",
"Total Number of Website Addresses", "Under Common Control",
"12A", "12B(1)", "12B(2)", "12C(1)", "12C(2)", "4A", "4B",
"5.G.(3) - Total amount of Parallel Assets", "5.G.(3) - Total number of RICs or BDCs",
"5.I.(2) - Total number of wrap fee programs", "5.K.(1)(a)(i) end year percentage",
"5.K.(1)(a)(i) midyear percentage", "5.K.(1)(a)(ii) end year percentage",
"5.K.(1)(a)(ii) midyear percentage", "5.K.(1)(a)(iii) end year percentage",
"5.K.(1)(a)(iii) midyear percentage", "5.K.(1)(a)(iv) end year percentage",
"5.K.(1)(a)(iv) midyear percentage", "5.K.(1)(a)(ix) end year percentage",
"5.K.(1)(a)(ix) midyear percentage", "5.K.(1)(a)(v) end year percentage",
"5.K.(1)(a)(v) midyear percentage", "5.K.(1)(a)(vi) end year percentage",
"5.K.(1)(a)(vi) midyear percentage", "5.K.(1)(a)(vii) end year percentage",
"5.K.(1)(a)(vii) midyear percentage", "5.K.(1)(a)(viii) end year percentage",
"5.K.(1)(a)(viii) midyear percentage", "5.K.(1)(a)(x) end year percentage",
"5.K.(1)(a)(x) midyear percentage", "5.K.(1)(a)(xi) end year percentage",
"5.K.(1)(a)(xi) midyear percentage", "5.K.(1)(a)(xii) - Other description",
"5.K.(1)(a)(xii) end year percentage", "5.K.(1)(a)(xii) midyear percentage",
"5.K.(1)(b)(i) end year percentage", "5.K.(1)(b)(ii) end year percentage",
"5.K.(1)(b)(iii) end year percentage", "5.K.(1)(b)(iv) end year percentage",
"5.K.(1)(b)(ix) end year percentage", "5.K.(1)(b)(v) end year percentage",
"5.K.(1)(b)(vi) end year percentage", "5.K.(1)(b)(vii) end year percentage",
"5.K.(1)(b)(viii) end year percentage", "5.K.(1)(b)(x) end year percentage",
"5.K.(1)(b)(xi) end year percentage", "5.K.(1)(b)(xii) - Other description",
"5.K.(1)(b)(xii) end year percentage", "5.K.(2)(a)(i)(1) 10-149",
"5.K.(2)(a)(i)(1) less 10", "5.K.(2)(a)(i)(1) over 150", "5.K.(2)(a)(i)(2) 10-149",
"5.K.(2)(a)(i)(2) less 10", "5.K.(2)(a)(i)(2) over 150", "5.K.(2)(a)(i)(3)(a) 10-149 percentage",
"5.K.(2)(a)(i)(3)(a) less 10 percentage", "5.K.(2)(a)(i)(3)(a) over 150 percentage",
"5.K.(2)(a)(i)(3)(b) 10-149 percentage", "5.K.(2)(a)(i)(3)(b) less 10 percentage",
"5.K.(2)(a)(i)(3)(b) over 150 percentage", "5.K.(2)(a)(i)(3)(c) 10-149 percentage",
"5.K.(2)(a)(i)(3)(c) less 10 percentage", "5.K.(2)(a)(i)(3)(c) over 150 percentage",
"5.K.(2)(a)(i)(3)(d) 10-149 percentage", "5.K.(2)(a)(i)(3)(d) less 10 percentage",
"5.K.(2)(a)(i)(3)(d) over 150 percentage", "5.K.(2)(a)(i)(3)(e) 10-149 percentage",
"5.K.(2)(a)(i)(3)(e) less10 percentage", "5.K.(2)(a)(i)(3)(e) over 150 percentage",
"5.K.(2)(a)(i)(3)(f) 10-149 percentage", "5.K.(2)(a)(i)(3)(f) less 10 percentage",
"5.K.(2)(a)(i)(3)(f) over 150 percentage", "5.K.(2)(a)(ii)(1) 10-149",
"5.K.(2)(a)(ii)(1) less 10", "5.K.(2)(a)(ii)(1) over 150", "5.K.(2)(a)(ii)(2) 10-149",
"5.K.(2)(a)(ii)(2) less 10", "5.K.(2)(a)(ii)(2) over 150", "5.K.(2)(a)(ii)(3)(a) 10-149 percentage",
"5.K.(2)(a)(ii)(3)(a) less 10 percentage", "5.K.(2)(a)(ii)(3)(a) over 150 percentage",
"5.K.(2)(a)(ii)(3)(b) 10-149 percentage", "5.K.(2)(a)(ii)(3)(b) less 10 percentage",
"5.K.(2)(a)(ii)(3)(b) over 150 percentage", "5.K.(2)(a)(ii)(3)(c) 10-149 percentage",
"5.K.(2)(a)(ii)(3)(c) less 10 percentage", "5.K.(2)(a)(ii)(3)(c) over 150 percentage",
"5.K.(2)(a)(ii)(3)(d) 10-149 percentage", "5.K.(2)(a)(ii)(3)(d) less 10 percentage",
"5.K.(2)(a)(ii)(3)(d) over 150 percentage", "5.K.(2)(a)(ii)(3)(e) 10-149 percentage",
"5.K.(2)(a)(ii)(3)(e) less10 percentage", "5.K.(2)(a)(ii)(3)(e) over 150 percentage",
"5.K.(2)(a)(ii)(3)(f) 10-149 percentage", "5.K.(2)(a)(ii)(3)(f) less 10 percentage",
"5.K.(2)(a)(ii)(3)(f) over 150 percentage", "5.K.(2)(b)(1) 10-149",
"5.K.(2)(b)(1) less 10", "5.K.(2)(b)(1) over150", "5.K.(2)(b)(2) 10-149",
"5.K.(2)(b)(2) less 10", "5.K.(2)(b)(2) over150", "5.K.(4) - Total amount of custodians that hold 10% or more of separately managed assets",
"5.K.(4) - Total number of custodians that hold 10% or more of separately managed assets",
"5K(1)", "5K(2)", "5K(3)", "5K(4)", "9C Unqual Opinion", "Acquired Firm",
"Acquired Firm CRD#", "Acquired Firm SEC#", "Total Custody Amount",
"Total Number of Acquired Firms", "Total number of relying advisers",
"Umbrella Registration"
),
nameActual =c("idRegionSEC", "idCRD", "idSEC", "typeRegulationSEC", "nameEntityManager",
"nameEntityManagerLegal", "addressStreet1OfficePrimary", "addressStreet2OfficePrimary",
"cityOfficePrimary", "stateOfficePrimary", "countryOfficePrimary",
"zipcodeOfficePrimary", "phoneOfficePrimary", "faxOfficePrimary",
"addressStreet1OfficeMail", "addressStreet2OfficeMail", "cityOfficeMail",
"stateOfficeMail", "countryOfficeMail", "zipcodeOfficeMail", "statusSEC",
"dateStatusSEC", "stateDateJurisdictionNotice", "dateADVLatest",
"dateFormVersion", "hasEntityMultipleURLs", "urlManager", "isForeignRegisteredEntity",
"isSECSection12_15Reporter", "idCIK", "hasAUMGreater1B", "idLEI",
"hasAUMGreater100M", "hasAUMUnder100MOver25m", "hasPrincipalOfficeWY",
"hasPrincipalOfficeForeign", "isAdviser1940InvestmentActCompany",
"isAdviserBusinessDevelopmentCompany25MInCapital", "isAdviserPensionCapitalGreater200M",
"isAdviserRelated203A", "isAdviserNew203A", "isAdviserMultiState203A",
"isAdviserInternet", "hasSECOrderProhibitingRegistration", "isAdviserSECIneligible",
"typeEntity", "typeEntityDetail", "monthFiscalYearEnd", "stateEntityOrganized",
"countryEntityOrganized", "countEmployeesTotal", "countEmployeesInvestmentAdvisory",
"countEmployeesBrokerDealer", "countEmployeesStateRegisteredInvestmentAdviser",
"countEmployeesStateRegisteredInvestmentAdviserMultipleEntities",
"countEmployeesLicensedInsuranceAgents", "countEmployeesSolicitAdvisoryClients",
"rangeClients", "countClientsOver100Rounded", "pctClientsForeign",
"rangeClientsIndividualNonHighNetWorth", "rangeClientsIndividualHighNetWorth",
"rangeClientsBankThrift", "rangeClientsInvestmentCompany", "rangeClientsBusinessDevelopmentCompany",
"rangeClientsPooledInvestmentVehicle", "rangeClientsPensionPlan",
"rangeClientsCharitableOrganization", "rangeClientsCorporationOther",
"rangeClientsStateMunicipalGovernment", "rangeClientsInvestmentAdviserOther",
"rangeClientsInsuranceCompany", "rangeClientsOther", "typeClientsOther",
"rangeAUMIndividualNonHighNetWorth", "rangeAUMIndividualHighNetWorth",
"rangeAUMBankThrift", "rangeAUMInvestmentCompany", "rangeAUMBusinessDevelopmentCompany",
"rangeAUMPooledInvestmentVehicle", "rangeAUMPensionPlan", "rangeAUMCharitableOrganization",
"rangeAUMCorporationOthr", "rangeAUMStateMunicipalGovernment",
"rangeAUMInvestmentAdviserOther", "rangeAUMInsuranceCompany",
"rangeAUMOther", "typeAUMOther", "hasFeeAUM", "hasFeeHourlyCharge",
"hasFeeSubscription", "hasFeeFixed", "hasFeeCommission", "hasFeePerformance",
"hasFeeOther", "typeFeeOther", "isManagerSecuritiesPortfolio",
"amountAUMDiscretionary", "amountAUMNonDiscretionary", "amountAUMTotal",
"countAccountsDiscretionary", "countAccountsNonDiscretionary",
"countAccountsTotal", "hasFinancialPlanning", "hasPortfolioManagementIndividualSmallBusiness",
"hasPortfolioManagementInvestmentCompanies", "hasPortfolioManagementPooledInvestmentVehicles",
"hasPortfolioManagementInstitutionalClients", "hasServicePensionConsulting",
"hasServiceInvestmentAdviserSelection", "hasServicePeriodicalPublication",
"hasServiceSecurityRating", "hasServiceMarketTiming", "hasServiceEducationSeminars",
"hasServiceOther", "typeServiceOther", "rangeClientsFinancialPlanning",
"countClientsFinancialPlanningOver500Rounded", "hasFeeWrapSponsor",
"hasFeeWrapPortfolioManager", "isAdviserLimitedInvestmentTypes",
"isBrokerDealer", "isBrokerDealerRepresentative", "isCommodityPoolOperator",
"isFuturesMerchant", "isRealEstateBrokerDealerAgent", "isInsuranceBrokerAgent",
"isBank", "isTrustCompany", "isRegisteredMunicipalAdviser", "isRegisteredSecuritySwapDealer",
"isRegistredSecuritySwapParticipant", "isAccountingFirm", "isLawFirm",
"isOtherFinancialProductSalesperson", "typeOtherFinancialProductSalesperson",
"isBusinessActiveNonListedActivity", "typeBusinessActiveNonListedActivity",
"hasProductNonInvestmentAdvice", "hasRelatedBrokerDealer", "hasRelatedInvestmentAdviserOthr",
"hasRelatedRegisteredMunicipalAdviser", "hasRelatedRegisteredSecuritySwapDealer",
"hasRelatedRegisteredSecuritySwapParticipant", "hasRelatedRegisteredCommodityPoolOperator",
"hasRelatedRegisteredFuturesMerchant", "hasRelatedBankThrift",
"hasRelatedTrust", "hasRelatedAccountingFirm", "hasRelatedLawFirm",
"hasRelatedInsuranceCompany", "hasRelatedPensionConsultant",
"hasRelatedRealEstateBrokerDealer", "hasRelatedLimitedPartnershipSyndicator",
"hasRelatedGeneralPartnerManagingMemberSyndicator", "hasRelatedPrivateFundAdviser",
"isSecuritiesBuyerFromClientsForSelfToClientsFromOwned", "isSecuritiesFirmBoughSoldClientRecommended",
"isSecuritiesClientRecommendedFirmOwnedSecurity", "hasTradeExecutionClient",
"isSecuritiesUnderwriterPurchaserManagerClientRecommendedSecurity",
"hasRecommendedPurchaseSaleFirmOwnedSecurity", "hasClientDiscretionBuySell",
"hasClientDiscretionBuySellAmount", "hasClientDiscretionBrokerSelection",
"hasClientDiscretionCommisionCost", "isClientBrokerRelatedParty",
"hasClientBrokerRecommendation", "isClientBrokerRecommenationRelatedParty",
"isBrokerSoftDollarRecipient", "isBrokerSoftDollarEligibleResearchService",
"hasCompensationForClientReferrals", "isCompensatedForClientReferrals",
"hasCustodyClientCash", "hasCustodyClientSecurities", "amountAUMClientSecurities",
"countCustodyClients", "hasAdvisoryCustodyClientCash", "hasAdvisoryCustodyClientSecurities",
"amountAUMAdvisoryClientCash", "countAdvisoryCustodyClientCash",
"hasQualifiedCustodianSendInvestorQuarterlyReports", "hasIndependentAccountAuditPooledInvestments",
"hasIndependentAccountSurpriseAuditClientFunds", "hasIndependentAccountantPrepareInternalControlReports",
"isQualifiedCustodian", "hasRelatedQualifiedCustodian", "monthYearLastSurpriseAudit",
"countQualifiedCustodians", "hasControlPersonUnnamed", "hasManagementSupervisedPersonEvent",
"hasFelonyPleaConviction", "hasFelonyCharge", "hasMisdemeanorPleaConviction",
"hasMisdemeanrCharge", "hasSEC_CFTCFalseStatementOmission",
"hasSEC_CFTCStatuteViolation", "hasSEC_CFTCAuthorizationAction",
"hasSEC_CFTCOrderAgainst", "hasSEC_CFCPenaltyCeaseDesist",
"hasFederalStateForeignFalseStatement", "hasFederalStateForeignInvestmentViolation",
"hasFederalStateForeignBusinessRevokeSuspended", "hasFederalStateForeignOrderAgainst",
"hasFederalStateForeignLicenseRevoked", "hasSelfRegulatedBodyFalseStatement",
"hasSelfRegulatedBodyRuleViolation", "hasSelfRegulatedBodyBusinessRevokeSuspension",
"hasSelfRegulatedBodyActivityBan", "hasAttorneyAccountantFederalContractorPriorBanRevoke",
"isSubjectToRegulatoryProceeding", "hasDomesticForeignCourtEnjoinedInvestmentActivity",
"hasDomesticForeignCourtGuiltyStatuteViolation", "hasDomesticForeignCourtDismissedActionSettlementPursuant",
"isManagerDomesticForeignCourtSubjectToProceeding", "nameSECRegion",
"idCRD", "idSEC", "addressStreet1OfficePrimary", "addressStreet1OfficePrimary2",
"cityStateZipOfficePrimary", "cityStateZipOfficeMail", "nameContact",
"phoneOfficePrimary", "typeEntity", "descriptionManagerServices",
"statusSEC", "dateStatusSEC", "countCriminalDisclosures", "countRegulatoryActions",
"countCivilDisclosures", "countBankruptcyDisclosures", "countJudgementLiens",
"countBondPayoutDisclosures", "statusSEC", "dateStatusSEC", "urlManager",
"hasMultipleEntityURLs", "countEmployeesTotalOver1000", "countEmployeesInvestmentAdvisoryOver1000",
"countEmployeesBrokerDealerOver1000", "countEmployeesStateRegisteredInvestmentAdviserOver1000",
"rangeClients", "countClientsOver500", "rangeClientsIndividualNonHighNetWorth",
"rangeClientsIndividualHighNetWorth", "rangeClientsBankThrift",
"rangeClientsInvestmentCompany", "rangeClientsBusinessDevelopmentCompany",
"rangeClientsPooledInvestmentVehicle", "rangeClientsPensionPlan",
"rangeClientsCharitableOrganization", "rangeClientsCorporationOther",
"rangeClientsStateMunicipalGovernment", "rangeClientsInvestmentAdviserOther",
"typeServicesOther", "typeBusinessOther", "hasCustodyClientCash",
"hasCustodyClientSecurities", "amountAUMAdvisoryClientCashSecurities",
"countAdvisoryCustodyClientCashSecurities", "hasAdvisoryCustodyClientCashSecurities",
"hasControlPersonUnnamed", "statusSEC", "statusFINRA", "hasExemptionAsSolelyVentureAdviser",
"hasExemptionAsPrivateFundManagerUnder150MAUM", "hasExemptionSoleyPrivateFundManagerAUMOver150M",
"countOfficesOther", "rangeAssetsOther", "countClientsIndividualNonHighNetWorth",
"hasClientsLessThan5IndividualNonHighNetWorth", "amountRegulatoryAUMClientIndividualNonHighNetWorth",
"countClientsIndividualHighNetWorth", "hasClientsLessThan5IndividualHighNetWorth",
"amountRegulatoryAUMIndividualHighNetWorth", "countClientsBankingThrift",
"hasClientsLessThan5BankingThrift", "amountRegulatoryAUMBankingThrift",
"countClientsInvestmentCompanies", "amountRegulatoryAUMInvestmentCompanies",
"countClientsBusinessDevelopmentCompanies", "amountRegulatoryAUMBusinessDevelopmentCompanies",
"countClientsPooledInvestmentCompanies", "amountRegulatoryAUMPooledInvestmentCompanies",
"countClientsPensionPlans", "hasClientsLessThan5PensionPlans",
"amountRegulatoryAUMPensionPlans", "countClientsCharitableOrgs",
"hasClientsLessThan5CharitableOrgs", "amountRegulatoryAUMCharitableOrgs",
"countClientsMunicipalEntities", "hasClientsLessThan5MunicipalEntities",
"amountRegulatoryAUMMunicipalEntities", "countClientsInvestmentAdvisorsOther",
"hasClientsLessThan5InvestmentAdvsiorsOther", "amountRegulatoryAUMInvestmentAdvisorsOther",
"countClientsInsuranceCompanies", "hasClientsLessThan5InsuranceCompanies",
"amountRegulatoryAUMInsuranceCompanies", "countClientsSovereignWealthFunds",
"hasClientsLessThan5SovereignWealthFunds", "amountRegulatoryAUMSovereignWealthFunds",
"countClientsCorporationsOther", "hasClientsLessThan5CorporationsOther",
"amountRegulatoryAUMCorporationsOther", "countClientsOther",
"hasClientsLessThan5Other", "amountRegulatoryAUMOOther", "descriptionOther",
"amountRegulatoryAUMNonUSPersons", "amountRegulatoryAUMWrapFee",
"amountRegulatoryAUMWrapFeePortfolioManager", "amountRegulatoryAUMWrapFeeSponsor",
"hasLimitedInvestmentAdvice", "hasClientReportingDifferentThanIAPD",
"hasEmployeeClientReferralFees", "hasEmployeeCompensationClientReferralFees",
"hasQualified7ACustomers", "idCRDAdditional", "nameRegulatoryContactAdditional",
"cityRegulatoryContactAdditional", "countryRegulatoryContactAdditional",
"emailRegulatoryContactAdditional", "faxRegulatoryContactAdditional",
"zipcodeRegulatoryContactAdditional", "stateRegulatoryContactAdditional",
"addressStreet1RegulatoryContactAdditional", "addressStreet2RegulatoryContactAdditional",
"phoneRegulatoryContactAdditional", "titleRegulatoryContactAdditional",
"amountUSPrivateFundAssetsERA", "hasHedgeFunds", "hasLiquidityFunds",
"hasFundsOther", "hasPEFunds", "hasPFMasterFund", "hasRealEstateFunds",
"hasSecuritizedFunds", "hasVCFunds", "cityChiefComplianceOfficer",
"countryChiefComplianceOfficer", "emailChiefComplianceOfficer",
"faxChiefComplianceOfficer", "nameChiefComplianceOfficer",
"titleOtherChiefComplianceOfficer", "zipcodeChiefComplianceOfficer",
"stateChiefComplianceOfficer", "addressStreet1ChiefComplianceOfficer",
"addresssStreet2ChiefComplianceOfficer", "phoneChiefComplianceOfficer",
"isRelatedPersonControlledManager", "countDisclosuresFelonyPleaConviction",
"countDisclosuresFelonyCharge", "countDisclosuresMisdemeanorPleaConviction",
"countDisclosuresMisdemeanorCharge", "countDisclosuresSEC_CFTCFalseStatementOmission",
"countDisclosuresSEC_CFTCStatuteViolation", "countDisclosuresSEC_CFTCAuthorizationAction",
"countDisclosuresSEC_CFTCOrderAgainst", "countDisclosuresSEC_CFCPenaltyCeaseDesist",
"countDisclosuresFederalStateForeignFalseStatement", "countDisclosuresFederalStateForeignInvestmentViolation",
"countDisclosuresFederalStateForeignBusinessRevokeSuspended", "countDisclosuresFederalStateForeignOrderAgainst",
"countDisclosuresFederalStateForeignLicenseRevoked", "countDisclosuresSelfRegulatedBodyFalseStatement",
"countDisclosuresSelfRegulatedBodyRuleViolation", "countDisclosuresSelfRegulatedBodyBusinessRevokeSuspension",
"countDisclosuresSelfRegulatedBodyActivityBan", "countDisclosuresAttorneyAccountantFederalContractorPriorBanRevoke",
"countDisclosuresSubjectToRegulatoryProceeding",
"countDisclosuresDomesticForeignCourtEnjoinedInvestmentActivity",
"countDisclosuresDomesticForeignCourtGuiltyStatuteViolation",
"countDisclosuresDomesticForeignCourtDismissedActionSettlementPursuant",
"countDisclosuresDomesticForeignCourtSubjectToProceeding",
"countAffiliatesBusinessDevelopment", "countControlPersonsPublicCompany",
"countAffiliatesInvestmentAdvisors", "countAffiliatesInvestmentAdvisorsBusinesDevelopmentTotal", "countPrivateFunds7B1",
"countPrivateFunds7B2", "idEINCompanyCOO",
"dateADVFilingFirast",
"cityBooks",
"countryBooks", "zipcodeBooks",
"stateBooks", "addressStreet1Books",
"addressStreet2Books",
"isMailOfficeAddressResidence",
"isMainOfficeAddressResidence", "nameEntityCOOCompensator",
"isSharedOffice", "hasSharedSupervisedPersons", "citySoleProprietor",
"countrySoleProprietor", "zipcodeSoleProprietor", "stateSoleProprietor",
"addressStreet1SoleProprietor", "addressStreet2SoleProprietor",
"amountAUMTotal", "countCRDs",
"countRecordsLocations", "countCIKs",
"countHedgeFunds", "countLiquidityFunds",
"countFundsOther", "countPEFunds", "countRealEstateFunds",
"countSecuritizedFunds", "countVentureFunds",
"countWebsites", "isUnderCommonControl",
"hasOverAssets5M", "hasControlOtherAdvisorOverAssets25M", "hasControlOtherAdvisorOverAssets5M", "isControlledByAdvisorOverAssets25M", "isControlledByAdvisorOverAssets5M", "hasLegalStatusChanged", "dateLegalStatusChange",
"countParallelAssets", "countRIC_BIC",
"countWrapFeePrograms",
"pctManagedAccountExchangeTradedSecuritiesYearEnd",
"pctManagedAccountExchangeTradedSecuritiesYearMid", "pctManagedAccountNonExchangeTradedSecuritiesYearEnd",
"pctManagedAccountNonExchangeTradedSecuritiesYearMid",
"pctManagedAccountBondsGovernmentAgencyYearEnd",
"pctManagedAccountBondsGovernmentAgencyYearMid",
"pctManagedAccountBondsStateLocalYearEnd",
"pctManagedAccountBondsStateLocalYearMid",
"pctManagedAccountSecuritiesIC_BCYearEnd",
"pctManagedAccountSecuritiesIC_BCYearMid",
"pctManagedAccountBondsSovereignYearEnd",
"pctManagedAccountBondsSovereignYearMid",
"pctManagedAccountBondsCorporateInvestmentGradeYearEnd",
"pctManagedAccountBondsCorporateInvestmentGradeYearMid",
"pctManagedAccountBondsCorporateNonInvestmentGradeYearEnd",
"pctManagedAccountBondsCorporateNonInvestmentGradeYearMid",
"pctManagedAccountDerivativesYearEnd",
"pctManagedAccountDerivativesYearMid",
"pctManagedAccountSecuritiesPooledInvestmentYearEnd",
"pctManagedAccountSecuritiesPooledInvestmentYearMid",
"pctManagedAccountCashYearEnd",
"pctManagedAccountCashYearMid",
"descriptionManagedAccountOther",
"pctManagedAccountOtherYearEnd",
"pctManagedAccountOtherYearMid",
"pctManagedAccountExchangeTradedSecuritiesYearEndFinal",
"pctManagedAccountNonExchangeTradedSecuritiesYearEndFinal",
"pctManagedAccountBondsGovernmentAgencyYearEndFinal",
"pctManagedAccountBondsStateLocalYearEndFinal",
"pctManagedAccountSecuritiesIC_BCYearEndFinal",
"pctManagedAccountBondsSovereignYearEndFinal",
"pctManagedAccountBondsCorporateInvestmentGradeYearEndFinal",
"pctManagedAccountBondsCorporateNonInvestmentGradeYearEndFinal",
"pctManagedAccountDerivativesYearEndFinal",
"pctManagedAccountSecuritiesPooledInvestmentYearEndFinal",
"pctManagedAccountCashYearEnd",
"descriptionManagedAccountOtherFinal",
"pctManagedAccountOtherYearEndFinal",
"amountAUMRegulatory10_149PercentNotionalYearMid",
"amountAUMRegulatory10LessPercentNotionalYearMid",
"amountAUMRegulatory150OverPercentNotionalYearMid",
"amountBorrowings10_149PercentNotionalYearMid",
"amountBorrowings10LessPercentNotionalYearMid",
"amountBorrowings150OverPercentNotionalYearMid",
"pctDerivativeInterestRate10_149PercentNotionalYearMid",
"pctDerivativeInterestRateDerivative10LessPercentNotionalYearMid",
"pctDerivativeInterestRateDerivative150OverPercentNotionalYearMid",
"pctDerivativeFOREXD10_149PercentNotionalYearMid",
"pctDerivativeFOREXD10LessPercentNotionalYearMid",
"pctDerivativeFOREXD150OverPercentNotionalYearMid",
"pctDerivativeCredit10_149PercentNotionalYearMid",
"pctDerivativeCredit10LessPercentNotionalYearMid",
"pctDerivativeCredit150OverPercentNotionalYearMid",
"pctDerivativeEquity10_149PercentNotionalYearMid",
"pctDerivativeEquity10LessPercentNotionalYearMid",
"pctDerivativeEquity150OverPercentNotionalYearMid",
"pctDerivativeCommodity10_149PercentNotionalYearMid",
"pctDerivativeCommodity10LessPercentNotionalYearMid", "pctDerivativeCommodity150OverPercentNotionalYearMid",
"pctDerivativeOther10_149PercentNotionalYearMid", "pctDerivativeOther10LessPercentNotionalYearMid",
"pctDerivativeOther150OverPercentNotionalYearMid",
"amountAUMRegulatory10_149PercentNotionalYearEnd",
"amountAUMRegulatory10LessPercentNotionalYearEnd",
"amountAUMRegulatory150OverPercentNotionalYearEnd",
"amountBorrowings10_149PercentNotionalYearEnd",
"amountBorrowings10LessPercentNotionalYearEnd",
"amountBorrowings150OverPercentNotionalYearEnd",
"pctDerivativeInterestRate10_149PercentNotionalYearEnd",
"pctDerivativeInterestRateExposure10LessPercentNotionalYearEnd", "pctDerivativeInterestRate150OverPercentNotionalYearEnd",
"pctDerivativeFOREXDExposure10_149PercentNotionalYearEnd", "pctDerivativeFOREXDExposure10LessPercentNotionalYearEnd",
"pctDerivativeFOREXD150OverPercentNotionalYearEnd",
"pctDerivativeCreditExposure10_149PercentNotionalYearEnd",
"pctDerivativeCreditExposure10LessPercentNotionalYearEnd",
"pctDerivativeCredit150OverPercentNotionalYearEnd",
"pctDerivativeEquityExposure10_149PercentNotionalYearEnd", "pctDerivativeEquityExposure10LessPercentNotionalYearEnd",
"pctDerivativeEquity150OverPercentNotionalYearEnd",
"pctDerivativeCommodityExposure10_149PercentNotionalYearEnd",
"pctDerivativeCommodity10LessPercentNotionalYearEnd",
"pctDerivativeCommodity150OverPercentNotionalYearEnd",
"pctDerivativeOther10_149PercentNotionalYearEnd", "pctDerivativeOther10LessPercentNotional",
"pctDerivativeOther150OverPercentNotionalYearEnd",
"amountBorrowings10_149PercentNotionalYearEndSubAdviser",
"amountAUMRegulatory10LessPercentNotionalYearSubAdviser", "amountAUMRegulatory150OverPercentNotionalYearEndSubAdviser",
"amountBorrowings10_149PercentNotionalYearEndSubAdviser",
"amountBorrowings10LessPercentNotionalYearEndSubAdviser",
"amountBorrowings150OverPercentNotionalYearEndSubAdviser",
"amountCustodiansHoldingOver10PCTAssetsManaged",
"countCustodiansHoldingOver10PCTAssetsManaged",
"hasSeperateAccounts", "hasSeperateAccountsWithBorrowingsDerivatives", "hasSeperateAccountCustodians", "has5K4", "hasUnEqualAccountingOpinion", "nameFirmAcquired",
"idCRDFirmsAcquired", "idSECFirmsAcquired", "amountCustodyAUM",
"countFirmsAcquired", "countRelyingAdvisors",
"hasUmbrellaRegistration"
)
)
sec_name_df
}
.parse_adv_excel_data <-
function(file_path = "/Users/alexbresler/Desktop/adv_data/ia080116.xlsx") {
file_path %>%
readxl::read_excel()
}
.parse_adv_csv <-
function(file_path = "/Users/alexbresler/Desktop/adv_data/IA FOIA Download 7-30-10.CSV") {
file_path %>%
read_csv()
}
.parse_adv_txt_data <-
function(file_path = "/Users/alexbresler/Desktop/adv_data/5010912_10044_00050000_00050000.txt") {
file_path %>%
read_delim(delim = '|', col_names = T) %>%
dplyr::select(-dplyr::matches("X26"))
data
}
.parse_sec_adv_data_url <-
function(url = 'https://www.sec.gov/foia/iareports/ia090116.zip',
return_message = TRUE) {
options(scipen = 999999)
base_url <- url %>% basename()
# date_data <-
# base_url %>%
# str_split("\\-|\\_|\\.zip") %>% flatten_chr() %>% .[[1]] %>%
# str_replace_all("ia", "") %>% lubridate::mdy()
is_exempt <-
url %>% str_detect("exempt")
tmp <-
tempfile()
url %>%
curl_download(url = ., tmp)
con <-
unzip(tmp)
is_excel <-
con %>% stringr::str_detect("XLS|xls|xlsx|XLSX") %>% sum(na.rm = TRUE) > 0
if (is_excel) {
adv_data <-
con[[1]] %>% .parse_adv_excel_data()
}
is_csv <-
con %>% str_detect("csv|CSV") %>% sum(na.rm = TRUE) > 0
if (is_csv) {
adv_data <-
con %>%
.parse_adv_csv() %>%
suppressWarnings()
}
is_txt <-
con %>% str_detect("txt|TXT") %>% sum(na.rm = TRUE) > 0
if (is_txt) {
adv_data <-
con %>%
.parse_adv_txt_data()
}
con %>%
unlink()
actual_names <-
.assign_sec_names(data = adv_data)
if (adv_data %>% tibble::has_name("5H__1")) {
adv_data <-
adv_data %>%
dplyr::rename(`5I(1)` = `5H__1`)
}
adv_data <-
adv_data %>%
purrr::set_names(actual_names
)
column_ids <-
tibble(nameActual = names(adv_data)) %>%
mutate(idRow = 1:n()) %>%
group_by(nameActual) %>%
filter(idRow == min(idRow)) %>%
pull(idRow)
adv_data <-
adv_data[, column_ids] %>%
as_tibble() %>%
select(idCRD, dplyr::matches("^nameEntityManager"), everything())
has_columns <-
(
adv_data %>%
dplyr::select(-dplyr::matches("country")) %>%
dplyr::select(dplyr::matches("^count[A-Z]")) %>% ncol() > 0
) &
(
adv_data %>%
dplyr::select(-dplyr::matches("country")) %>%
dplyr::select(dplyr::matches("^count[A-Z]")) %>%
future_map(class) %>%
as_tibble() %>%
gather(column, class) %>%
dplyr::filter(class == 'character') %>% nrow() > 0
)
if (has_columns) {
change_to_range_cols <-
adv_data %>%
dplyr::select(-dplyr::matches("country")) %>%
dplyr::select(dplyr::matches("^count[A-Z]")) %>%
future_map(class) %>%
as_tibble() %>%
gather(column, class) %>%
dplyr::filter(class == 'character') %>%
.$column
for (x in seq_along(change_to_range_cols)) {
name_loc <-
change_to_range_cols[x] %>% grep(names(adv_data)) %>% min
names(adv_data)[name_loc] <-
names(adv_data)[name_loc] %>% str_replace("count", 'range')
}
}
if (adv_data %>% dplyr::select(dplyr::matches("^has[A-Z]|^is[A-Z]")) %>% names() %>% length() > 0) {
adv_data <-
adv_data %>%
mutate_at(.vars =
adv_data %>% dplyr::select(dplyr::matches("^has[A-Z]|^is[A-Z]")) %>% names(),
.funs = str_trim) %>%
mutate_at(.vars =
adv_data %>% dplyr::select(dplyr::matches("^has[A-Z]|^is[A-Z]")) %>% names(),
funs(if_else(. == "Y", TRUE, FALSE))) %>%
suppressWarnings()
}
if (adv_data %>% dplyr::select(dplyr::matches("^url[M]")) %>% names() %>% length() > 0) {
adv_data <-
adv_data %>%
mutate_at(.vars =
adv_data %>% dplyr::select(dplyr::matches("^url[M]")) %>% names(),
.funs = str_to_lower) %>%
suppressWarnings()
}
if (adv_data %>% dplyr::select(dplyr::matches("^status[SEC]")) %>% dplyr::select(-dplyr::matches("date")) %>% names() %>% length() > 0) {
adv_data <-
adv_data %>%
mutate_at(
.vars =
adv_data %>% dplyr::select(dplyr::matches("^status[SEC]")) %>% dplyr::select(-dplyr::matches("date")) %>% names(),
funs(. %>% stringr::str_to_upper())
) %>%
suppressWarnings()
}
whack_date <-
adv_data %>%
dplyr::select(dplyr::matches("^date")) %>%
keep(is.character) %>%
names() %>% length() > 0
if (whack_date) {
char_col <-
adv_data %>%
dplyr::select(dplyr::matches("^date")) %>%
keep(is.character) %>%
names
adv_data[, char_col] <-
adv_data[, char_col] %>%
pull(1) %>%
lubridate::mdy()
}
adv_data <-
adv_data %>%
mutate_at(.vars =
adv_data %>% dplyr::select(
dplyr::matches(
"^type[A-Z]|^range[A-Z]|^address[A-Z]|^city[A-Z]|^zip[A-Z]|^fax[A-Z]|^name[A-Z]|^state[A-Z]|^status[A-Z]|monthYearLastSurpriseAudit|^date[A-Z]"
)
) %>% names(),
.funs = as.character) %>%
mutate(idCRD = idCRD %>% as.integer())
if ('idLEI' %in% names(adv_data)) {
if (adv_data$idLEI %>% class() == 'numeric') {
adv_data <-
adv_data %>%
mutate(idLEI = idLEI %>% as.character() %>% str_replace_all("N/A", NA))
}
}
if (adv_data %>% tibble::has_name("countClientsFinancialPlanningOver500Rounded")) {
if (adv_data$countClientsFinancialPlanningOver500Rounded %>% class() == "logical") {
adv_data <-
adv_data %>% dplyr::select(-countClientsFinancialPlanningOver500Rounded)
}
}
if (adv_data %>% tibble::has_name("idCIK")) {
if (adv_data$idCIK %>% class() == "logical") {
adv_data <- adv_data %>% dplyr::select(-idCIK)
}
}
if ('rangeClientsFinancialPlanning' %in% names(adv_data)) {
if (adv_data$rangeClientsFinancialPlanning %>% class() == 'numeric') {
adv_data <-
adv_data %>%
mutate(rangeClientsFinancialPlanning = rangeClientsFinancialPlanning %>% as.character())
}
}
if (return_message) {
glue::glue("Parsed {url}") %>% cat(fill = T)
}
adv_data <-
adv_data %>%
mutate(isExempt = is_exempt) %>%
dplyr::select(isExempt, everything())
to_upper_names <- adv_data %>% dplyr::select(
dplyr::matches("^country|^name|^city|^state|^range[A-Z]|^type[A-Z]")
) %>% names()
adv_data <-
adv_data %>%
mutate_at(to_upper_names,
funs(. %>% stringr::str_to_upper())) %>%
mutate(urlZip = url)
adv_data
}
.parse_adv_urls <- function(urls = 'https://www.sec.gov/foia/iareports/ia090116.zip', return_message = TRUE) {
df <-
tibble()
success <- function(res) {
parse_sec_adv_data_url_safe <-
purrr::possibly(.parse_sec_adv_data_url, tibble())
page_url <- res$url
glue::glue("parsing {page_url}") %>% cat(fill = T)
data <-
page_url %>%
parse_sec_adv_data_url_safe(return_message = return_message)
df <<-
df %>%
bind_rows(data)
}
failure <- function(msg) {
tibble()
}
urls %>%
future_map(function(x) {
curl_fetch_multi(url = x, success, failure)
})
multi_run()
df
}
#' ADV managers periods data
#'
#' This function returns monthly summary
#' information for every ADV filing manager
#' from 2006 onwards.
#'
#' @param periods dates in year-month form
#' @param all_periods include all periods
#' @param is_exempt exempt, non-exempt filers
#' @param nest_data return a nested data frame
#' @param return_message return a message after parsing data
#' @import dplyr stringr lubridate readr readxl rvest purrr httr tidyr tibble glue
#' @importFrom curl curl_download
#' @return where \code{nest_data} is \code{TRUE} a nested tibble by period and type of filer,
#' where \code{nest_data} is \code{FALSE} a tibble
#' @export
#' @family IAPD
#' @family ADV
#' @family entity search
#' @family fund data
#' @examples
#' \dontrun{
#' adv_managers_periods_summaries(periods = c("2006-06", "2016-12", "2017-01"), all_periods = FALSE, is_exempt = c(FALSE,TRUE), only_most_recent = FALSE, nest_data = FALSE)
#'
#' adv_managers_periods_summaries(only_most_recent = TRUE)
#' }
adv_managers_periods_summaries <-
function(periods = c("2018-06"),
all_periods = FALSE,
only_most_recent = FALSE,
include_exempt = TRUE,
nest_data = FALSE,
return_message = TRUE) {
if (!'sec_adv_url_df' %>% exists()) {
sec_adv_url_df <-
adv_period_urls()
assign(x = 'sec_url_df', eval(sec_adv_url_df), envir = .GlobalEnv)
}
df_urls <- sec_adv_url_df
if (!include_exempt) {
df_urls <-
df_urls %>%
filter(!isExempt)
}
if (periods %>% length() > 0) {
urls <-
df_urls %>%
filter(periodData %in% periods) %>%
pull(urlZip)
}
if (all_periods) {
urls <-
df_urls %>%
pull(urlZip)
}
if (only_most_recent) {
urls <-
df_urls %>%
filter(dateData == max(dateData)) %>%
pull(urlZip)
}
.parse_adv_urls_safe <-
purrr::possibly(.parse_adv_urls, tibble())
all_adv_data <-
.parse_adv_urls(urls = urls, return_message = return_message)
if (all_adv_data %>% nrow() == 0) {
return(all_adv_data)
}
all_adv_data <-
all_adv_data %>%
dplyr::select(-isExempt) %>%
left_join(df_urls) %>%
suppressMessages()
all_adv_data <-
all_adv_data %>%
select(-one_of(c("periodReport", "typeReport"))) %>%
select(one_of(
c(
"isExempt",
"periodData",
"yearData",
"quarterData",
"dateData"
)
), everything())
all_adv_data <-
all_adv_data %>%
mutate_at(.vars =
all_adv_data %>% dplyr::select(dplyr::matches("^date[A-Z]")) %>% names(),
funs(. %>% lubridate::ymd())) %>%
mutate_at(
.vars =
all_adv_data %>% dplyr::select(
dplyr::matches(
"^name[E]|^address|^city|^status|^state|^type|^country[A-Z]"
)
) %>% dplyr::select(-dplyr::matches("stateEntityOrganized")) %>% names(),
.funs =
str_to_upper
)
all_adv_data <-
all_adv_data %>%
mutate_at(
.vars =
all_adv_data %>% dplyr::select(dplyr::matches("^count[A-Z]"), -dplyr::matches("country")) %>% names(),
.funs =
funs(. %>% formattable::comma(digits = 0))
)
has_amounts <-
names(all_adv_data) %>% str_count('^amount') %>% sum() > 0
if (has_amounts) {
all_adv_data <-
all_adv_data %>%
mutate_at(
.vars =
all_adv_data %>% dplyr::select(dplyr::matches("^amount[A-Z]")) %>% names(),
.funs =
funs(. %>% as.numeric %>% formattable::currency())
) %>%
arrange(dateData, desc(amountAUMTotal)) %>%
suppressWarnings()
}
all_adv_data <-
all_adv_data %>%
dplyr::rename(nameEntityManagerBusiness = nameEntityManager) %>%
mutate(nameEntityManager = nameEntityManagerLegal) %>%
dplyr::select(
dateData:idSEC,
nameEntityManager,
nameEntityManagerLegal,
nameEntityManagerBusiness,
everything()
) %>%
suppressMessages()
all_adv_data <-
all_adv_data %>%
mutate_at(
.vars =
all_adv_data %>% dplyr::select(dplyr::matches(
"^address|^country[A-Z]|^city^state"
)) %>% names(),
funs(. %>% stringr::str_to_upper())
)
has_office_sum <-
names(all_adv_data) %>% str_count('^addressStreet2OfficePrimary') %>% sum() > 0
if (has_office_sum) {
addressOfficePrimary <-
all_adv_data %>%
tidyr::replace_na(list(
addressStreet2OfficePrimary = '',
stateOfficePrimary = ''
)) %>%
mutate(
addressOfficePrimary =
addressStreet1OfficePrimary %>% paste0(
' ',
addressStreet2OfficePrimary,
' ',
cityOfficePrimary,
', ',
stateOfficePrimary,
', ',
countryOfficePrimary
) %>% str_trim()
) %>%
.$addressOfficePrimary
all_adv_data <-
all_adv_data %>%
mutate(addressOfficePrimary) %>%
dplyr::select(
dateData:typeRegulationSEC,
nameEntityManager,
nameEntityManagerLegal,
nameEntityManagerBusiness,
addressOfficePrimary,
everything()
)
}
if (nest_data) {
all_adv_data <-
all_adv_data %>%
nest(-c(dateData, isExempt), .key = dataADV)
}
gc()
all_adv_data
}
#' ADV managers most recent summary data
#'
#' This function returns abbreviated ADV data for
#' all filing managers for the most recent period.
#'
#' For multiple periods and all information see \code{\link{adv_managers_periods_summaries}}
#'
#' @param file_directory directory you want to save your data into, if none specified a temporary file will be created
#' @param folder_name older you want the data to be downloaded into
#' @param remove_files remove the files from the folders
#' @param empty_trash empty the trash after being read into R
#' @param return_message return a message after parsing data
#' @return a data frame
#' @export
#' @family IAPD
#' @family ADV
#' @family entity search
#' @family fund data
#' @examples
#' \dontrun{
#' adv_managers_current_period_summary(select_names = c("dateDataADV", "isExempt", "idRegionSEC", "idCRD", "idSEC", "typeRegulationSEC", "nameEntityManager", "nameEntityManagerLegal", 'addressOfficePrimary', "addressStreet1OfficePrimary", "addressStreet2OfficePrimary", "cityOfficePrimary", "stateOfficePrimary", "countryOfficePrimary", "zipOfficePrimary", "phoneOfficePrimary", "statusSEC", "dateStatusSEC", "dateADVLatest", "urlManager", "isForeignRegisteredEntity", "stateDateJurisdictionNotice", "idCIK", "hasAUMGreater1B", "idLEI", "hasAUMGreater100M", "typeEntity", "countryEntityOrganized", "countEmployeesTotal", "countEmployeesInvestmentAdvisory", "amountAUMTotal", "amountAUMDiscretionary", "amountAUMNonDiscretionary", "countAccountsDiscretionary", "countAccountsNonDiscretionary", "countAccountsTotal", "isManagerSecuritiesPortfolio", "hasFeeAUM", "hasFeeHourlyCharge", "hasFeeSubscription", "hasFeeFixed", "hasFeeCommission", "hasFeePerformance", "hasFeeOther", "typeFeeOther", "isBrokerDealer", "isBrokerDealerRepresentative", "isCommodityPoolOperator", "isFuturesMerchant", "isRealEstateBrokerDealerAgent", "isInsuranceBrokerAgent", "isBank", "isTrustCompany", "isRegisteredMunicipalAdviser", "isRegisteredSecuritySwapDealer", "isRegistredSecuritySwapParticipant", "isAccountingFirm", "isLawFirm", "isOtherFinancialProductSalesperson", "typeOtherFinancialProductSalesperson", "countEmployeesBrokerDealer", "countEmployeesStateRegisteredInvestmentAdviser", "countEmployeesStateRegisteredInvestmentAdviserMultipleEntities", "countEmployeesLicensedInsuranceAgents", "countEmployeesSolicitAdvisoryClients", "hasFelonyPleaConviction", "hasFelonyCharge", "hasMisdemeanorPleaConviction"))
#' }
adv_managers_current_period_summary <-
function(select_names = c(
"dateDataADV",
"isExempt",
"idRegionSEC",
"idCRD",
"idSEC",
"typeRegulationSEC",
"nameEntityManager",
"nameEntityManagerLegal",
'addressOfficePrimary',
"addressStreet1OfficePrimary",
"addressStreet2OfficePrimary",
"cityOfficePrimary",
"stateOfficePrimary",
"countryOfficePrimary",
"zipOfficePrimary",
"phoneOfficePrimary",
"statusSEC",
"dateStatusSEC",
"dateADVLatest",
"urlManager",
"isForeignRegisteredEntity",
"stateDateJurisdictionNotice",
"idCIK",
"hasAUMGreater1B",
"idLEI",
"hasAUMGreater100M",
"typeEntity",
"countryEntityOrganized",
"countEmployeesTotal",
"countEmployeesInvestmentAdvisory",
"amountAUMTotal",
"amountAUMDiscretionary",
"amountAUMNonDiscretionary",
"countAccountsDiscretionary",
"countAccountsNonDiscretionary",
"countAccountsTotal",
"isManagerSecuritiesPortfolio",
"hasFeeAUM",
"hasFeeHourlyCharge",
"hasFeeSubscription",
"hasFeeFixed",
"hasFeeCommission",
"hasFeePerformance",
"hasFeeOther",
"typeFeeOther",
"isBrokerDealer",
"isBrokerDealerRepresentative",
"isCommodityPoolOperator",
"isFuturesMerchant",
"isRealEstateBrokerDealerAgent",
"isInsuranceBrokerAgent",
"isBank",
"isTrustCompany",
"isRegisteredMunicipalAdviser",
"isRegisteredSecuritySwapDealer",
"isRegistredSecuritySwapParticipant",
"isAccountingFirm",
"isLawFirm",
"isOtherFinancialProductSalesperson",
"typeOtherFinancialProductSalesperson",
"countEmployeesBrokerDealer",
"countEmployeesStateRegisteredInvestmentAdviser",
"countEmployeesStateRegisteredInvestmentAdviserMultipleEntities",
"countEmployeesLicensedInsuranceAgents",
"countEmployeesSolicitAdvisoryClients",
"hasFelonyPleaConviction",
"hasFelonyCharge",
"hasMisdemeanorPleaConviction"
),
return_message = TRUE) {
adv_managers_periods_summaries_safe <-
purrr::possibly(adv_managers_periods_summaries, tibble())
all_data <-
adv_managers_periods_summaries(only_most_recent = TRUE,
include_exempt = c(TRUE, FALSE))
all_data <-
all_data %>%
dplyr::select(one_of(select_names))
return(all_data)
}
# mung --------------------------------------------------------------------
#' Extract fee references
#'
#' @param data A \code{tibble} with the OCR'd brochur data
#' @param word_threshhold
#' @param print_sentences if \code{TRUE} prints fee reference data
#'
#' @return
#' @export
#' @importFrom tidytext unnest_tokens
#' @import dplyr stringr purrr
#'
#' @examples
extract_fee_references <- function(data, word_threshhold = 5,
print_sentences = TRUE) {
data <-
data %>%
filter(!nameAuthor %>% is.na())
data <-
data %>%
dplyr::select(idCRD, nameEntityManager, textBrochure) %>%
tidytext::unnest_tokens(sentence, textBrochure, token = "sentences") %>% # tokenize to sentences
mutate(idSentence = 1:n()) %>% # create sentence IDs to check accuracy later
mutate(
hasMGMTFeeReference = sentence %>% str_detect('[0-99]%') # add column for possible fee reference
)
possible_fees <-
data %>%
dplyr::filter(hasMGMTFeeReference == T) %>% # filter down to possible sentences
dplyr::select(idCRD, nameEntityManager, sentence, idSentence) %>%
tidytext::unnest_tokens(word, sentence, token = 'words') %>% # tokenize to words
dplyr::filter(word %>% str_detect("^[0-9]")) %>% # look for numbers 1-9
mutate(word = word %>% as.numeric()) %>% # convert number to numeric
dplyr::filter(word <= word_threshhold) # look for numbers <=
if (print_sentences) {
possible_fees$idSentence %>%
unique() %>%
map_chr(function(x) {
setence_df <-
data %>%
dplyr::filter(idSentence == x)
fee_text <-
setence_df %>%
.$sentence %>% paste0('\n', ., '\n')
setence_df$nameEntityManager %>% paste0('Manager: ',., '\n', fee_text)
}) %>%
paste0(collapse = '\n') %>%
cat(fill = T)
}
possible_fees %>%
left_join(
data
) %>%
suppressMessages()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.