# Read one FARA PDF and return its metadata together with its text layer.
#
# Arguments:
#   url            - location of the PDF; it is downloaded once to a temporary
#                    file that is removed when the function exits.
#   return_message - if TRUE, emits a progress message before reading.
#
# Returns the flattened `pdftools::pdf_info()` record with two added columns:
#   text_pdf          - the retained text lines joined with " | "
#   url_document_fara - the `url` that was read
#
# A line is dropped when it is empty after whitespace squishing, contains
# "Received by NSD/FARA", or contains any digit.
.ocr_fara_url <-
  function(url = "https://efile.fara.gov/docs/2244-Informational-Materials-20200410-14.pdf",
           return_message = TRUE) {
    if (return_message) {
      glue("OCR'ing {url}") %>% message()
    }

    # Fetch the document a single time; both pdftools calls read the local copy.
    pdf_path <- tempfile(fileext = ".pdf")
    on.exit(unlink(pdf_path), add = TRUE)
    curl_download(url, pdf_path)

    data <- pdftools::pdf_info(pdf_path) %>% flatten_df()

    lines <-
      pdftools::pdf_text(pdf_path) %>%
      str_split(pattern = "\n") %>%
      flatten_chr() %>%
      str_squish()

    is_noise <-
      lines == "" | str_detect(lines, "Received by NSD/FARA|[0-9]")

    text <- str_c(lines[!is_noise], collapse = " | ")

    data %>%
      mutate(text_pdf = text,
             url_document_fara = url)
  }
#' Read text from FARA PDFs
#'
#' Reads every PDF in \code{urls} and returns its metadata and text.
#' A url that cannot be read is skipped and contributes no rows.
#'
#' @param urls character vector of FARA PDF urls
#' @param seperate_text if \code{TRUE} splits \code{text_pdf} on \code{|} so
#'   that every retained line of text gets its own row
#' @param return_message if \code{TRUE} emits a message for each url
#'
#' @return a \code{tibble} with the PDF metadata, \code{text_pdf} and
#'   \code{url_document_fara}; empty strings are converted to \code{NA}
#' @export
#'
#' @examples
#' \dontrun{
#' ocr_fara_urls(urls = "https://efile.fara.gov/docs/2244-Informational-Materials-20200410-14.pdf")
#' }
ocr_fara_urls <-
  function(urls,
           seperate_text = FALSE,
           return_message = TRUE) {
    # A failing url yields an empty tibble instead of aborting the whole run.
    .ocr_fara_url_safe <- possibly(.ocr_fara_url, tibble())

    data <-
      urls %>%
      future_map_dfr(function(x) {
        .ocr_fara_url_safe(url = x, return_message = return_message)
      })

    data <-
      data %>%
      mutate_if(is.character, function(x) {
        case_when(x == "" ~ NA_character_,
                  TRUE ~ x)
      }) %>%
      .remove_na()

    # `text_pdf` does not exist when no url could be read (the result then has
    # no columns at all), so only split the text when the column is present.
    if (seperate_text && "text_pdf" %in% names(data)) {
      data <-
        data %>%
        separate_rows(text_pdf, sep = "\\|") %>%
        mutate_if(is.character, str_trim)
    }

    data
  }
# dictionaries ------------------------------------------------------------
# Lookup table that maps the column headers found in the FARA bulk files
# (`nameFARA`) to the standardised column names used by this file
# (`nameActual`). Returns a two-column tibble with one row per raw header.
.dictionary_fara_names <-
  function() {
    # "raw FARA header" = "standardised name"
    lookup <- c(
      "Short Form Termination Date" = "dateShortFormTermination",
      "Short Form Date" = "dateShortForm",
      "Short Form Last Name" = "nameLastShortForm",
      "Short Form First Name" = "nameFirstShortForm",
      "Registration Number" = "idFARA",
      "Registration Date" = "dateRegistration",
      "Registrant Name" = "nameRegistrant",
      "Address 1" = "addressStreet1Registrant",
      "Address 2" = "addressStreet2Registrant",
      "City" = "cityRegistrant",
      "State" = "stateRegistrant",
      "Zip" = "zipcodeRegistrant",
      "Termination Date" = "dateTermination",
      "Name" = "nameRegistrant",
      "Business Name" = "nameRegistrantEntity",
      "Foreign Principal Termination Date" = "dateTerminationForeignPrincipal",
      "Foreign Principal" = "nameForeignPrincipal",
      "Foreign Principal Registration Date" = "dateRegistrationForeignPrincipal",
      "Country/Location Represented" = "countryForeignPrincipal",
      "Registrant Date" = "dateRegistrant",
      "Foreign_principal" = "nameForeignPrincipal",
      "FP_registration_date" = "dateRegistrationForeignPrincipal",
      "Country_location_represented" = "countryForeignPrincipal",
      "Registration_number" = "idFARA",
      "Registration_date" = "dateRegistration",
      "Registrant_name" = "nameRegistrant",
      "Address_1" = "addressStreet1Registrant",
      "Address_2" = "addressStreet2Registrant",
      "FP_termination_date" = "dateTerminationForeignPrincipal",
      "Registration_Number" = "idFARA",
      "Registration_Date" = "dateRegistration",
      "Termination_Date" = "dateTermination",
      "Business_Name" = "nameRegistrantEntity",
      "CDATE_STAMPED" = "dateDocumentFARA",
      "REGISTRANT_NAME" = "nameRegistrant",
      "REGISTRATION_NUMBER" = "idFARA",
      "DOCUMENT_TYPE" = "typeDocument",
      "URL" = "urlDocumentFARA",
      "SHORT_FORM_NAME" = "nameShortForm",
      "FOREIGN_PRINCIPAL_NAME" = "nameForeignPrincipal",
      "FOREIGN_PRINCIPAL_COUNTRY" = "countryForeignPrincipal",
      "Short_form_last_name" = "nameLastShortForm",
      "Short_form_first_name" = "nameFirstShortForm",
      "Short_form_date" = "dateShortForm",
      "Short_form_termination_date" = "dateShortFormTermination"
    )

    tibble(nameFARA = names(lookup),
           nameActual = unname(lookup))
  }
# Rename the columns of `data` from raw FARA headers to standardised names.
#
# Every column name is looked up in `.dictionary_fara_names()`. A name without
# a dictionary entry is kept unchanged and reported with a "Missing <name>"
# message. Returns `data` with the new names applied.
.munge_fara_names <-
  function(data) {
    lookup <- .dictionary_fara_names()
    raw_names <- names(data)

    # Row of the dictionary for each column; NA where the header is unknown.
    position <- match(raw_names, lookup$nameFARA)
    unknown <- is.na(position)

    for (name in raw_names[unknown]) {
      message(glue::glue("Missing {name}"))
    }

    renamed <- lookup$nameActual[position]
    renamed[unknown] <- raw_names[unknown]

    set_names(data, renamed)
  }
# parse -------------------------------------------------------------------
# Download one FARA bulk XML archive and parse it into a tibble.
#
# Arguments:
#   url            - location of the zipped XML file.
#   return_message - if TRUE, emits a message for every parsed node and a
#                    summary of the number of rows found.
#
# The archive is downloaded and extracted inside the session's temporary
# directory and both are removed on exit, including when an error occurs, so
# nothing is written to or deleted from the working directory.
#
# Returns the munged data with `typeFile` (the archive name without
# ".xml.zip") as first column and `urlFARA` holding `url`.
.parse_fara_xml <-
  function(url = "https://efile.fara.gov/bulk/zip/FARA_All_ForeignPrincipals.xml.zip",
           return_message = TRUE) {
    zip_path <- tempfile(fileext = ".zip")
    extract_dir <- tempfile(pattern = "fara_xml_")
    on.exit(unlink(c(zip_path, extract_dir), recursive = TRUE, force = TRUE),
            add = TRUE)

    curl_download(url, zip_path)
    unz_files <- unzip(zip_path, exdir = extract_dir)

    typeFile <-
      url %>%
      str_split("/") %>%
      flatten_chr() %>%
      keep(function(x) {
        x %>% str_detect("\\.xml")
      }) %>%
      str_remove_all("\\.xml\\.zip")

    doc <- read_xml(unz_files) %>% xml_contents()

    # seq_along() keeps the loop empty when the document has no nodes.
    data <-
      seq_along(doc) %>%
      map_dfr(function(x) {
        if (return_message) {
          glue("Parsing node {x}") %>% message()
        }
        doc[[x]] %>% as_list() %>% flatten_df()
      })

    # Drop the reference to the parsed nodes before munging.
    rm(doc)
    gc()

    data <-
      data %>%
      .munge_fara_names() %>%
      .munge_data(clean_address = FALSE) %>%
      mutate(urlFARA = url,
             typeFile = typeFile) %>%
      select(typeFile, everything())

    if (return_message) {
      items <- nrow(data) %>% comma(digits = 0)
      file_name <-
        typeFile %>% str_replace_all("\\_", " ") %>% str_trim()
      glue("Found {items} items under {file_name}") %>% message()
    }

    data
  }
# Download one FARA bulk CSV archive and read it into a tibble.
#
# Arguments:
#   url            - location of the zipped CSV file.
#   return_message - if TRUE, emits a summary of the number of rows found.
#
# The archive is downloaded and extracted inside the session's temporary
# directory and both are removed on exit, including when an error occurs, so
# nothing is written to or deleted from the working directory.
#
# Returns the munged data with `typeFile` (the archive name without
# ".csv.zip") as first column and `urlFARA` holding `url`.
.parse_fara_csv <-
  function(url = "https://efile.fara.gov/bulk/zip/FARA_All_Registrants.csv.zip",
           return_message = TRUE) {
    zip_path <- tempfile(fileext = ".zip")
    extract_dir <- tempfile(pattern = "fara_csv_")
    on.exit(unlink(c(zip_path, extract_dir), recursive = TRUE, force = TRUE),
            add = TRUE)

    curl_download(url, zip_path)
    unz_files <- unzip(zip_path, exdir = extract_dir)

    data <-
      unz_files %>%
      fread(verbose = FALSE, showProgress = FALSE) %>%
      as_tibble()

    typeFile <-
      url %>%
      str_split("/") %>%
      flatten_chr() %>%
      keep(function(x) {
        x %>% str_detect("\\.csv")
      }) %>%
      str_remove_all("\\.csv\\.zip")

    data <-
      data %>%
      .munge_fara_names() %>%
      .munge_data(clean_address = FALSE) %>%
      mutate(urlFARA = url,
             typeFile = typeFile) %>%
      select(typeFile, everything())

    if (return_message) {
      items <- nrow(data) %>% comma(digits = 0)
      file_name <-
        typeFile %>% str_replace_all("\\_", " ") %>% str_trim()
      glue("Found {items} items under {file_name}") %>% message()
    }

    data
  }
# Memoised versions of the two bulk-file parsers: a repeated call with the
# same arguments reuses the earlier result instead of running the parser
# (and its download) again. The exported functions below call these wrappers.
.parse_fara_csv_m <- memoise::memoise(.parse_fara_csv)
.parse_fara_xml_m <- memoise::memoise(.parse_fara_xml)
# functions ---------------------------------------------------------------
#' FARA registered entities
#'
#' Acquires all FARA registered representatives.
#' This includes information about the agents including
#' their location, periods of active work
#'
#'
#' @param method method of bulk download; any value other than
#'   \code{"xml"} (case-insensitive) uses the csv file \itemize{
#' \item xml - default and returns all historic matches
#' \item csv - returns only a portion of registrants
#' }
#' @param only_active if \code{TRUE} returns only active registrants
#' @param return_message if \code{TRUE} returns message
#'
#' @return a \code{tibble} of registrants with the added columns
#'   \code{isActiveEntityFARA} (no termination date),
#'   \code{countDaysActiveEntityFARA} and \code{locationRegistrant}
#' @export
#'
#' @examples
#' \dontrun{
#' fara_registrants(method = "xml")
#' }
fara_registrants <-
  function(method = "xml",
           only_active = FALSE,
           return_message = TRUE) {
    # `head` followed by `tail` when both are present, `head` alone when
    # `tail` is missing, NA when `head` is missing.
    join_optional <- function(head, tail, sep) {
      case_when(
        !is.na(head) & is.na(tail) ~ head,
        !is.na(head) & !is.na(tail) ~ str_c(head, tail, sep = sep),
        TRUE ~ NA_character_
      )
    }

    if (str_to_lower(method) == "xml") {
      data <-
        .parse_fara_xml_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_Registrants.xml.zip",
                          return_message = return_message)
    } else {
      # Forward the caller's choice instead of always printing the message.
      data <-
        .parse_fara_csv_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_Registrants.csv.zip",
                          return_message = return_message)
    }

    address_columns <- data %>% select(matches("address")) %>% names()

    data <-
      data %>%
      mutate_if(is.character, str_trim) %>%
      mutate_if(is.character, function(x) {
        if_else(x == "", NA_character_, x)
      }) %>%
      # Hyphens are stripped from every address column.
      mutate_at(address_columns, function(x) {
        x %>% str_remove_all("-") %>% str_trim()
      }) %>%
      # Where the registration date is later than the termination date the
      # two dates are swapped.
      mutate(
        dateRegistrationClean = dateRegistration,
        dateTerminationClean = dateTermination,
        dateRegistration = case_when(
          is.na(dateRegistration) ~ dateRegistration,
          dateRegistrationClean > dateTerminationClean ~ dateTerminationClean,
          TRUE ~ dateRegistrationClean
        ),
        dateTermination = case_when(
          is.na(dateTermination) ~ dateTermination,
          dateTerminationClean < dateRegistrationClean ~ dateRegistrationClean,
          TRUE ~ dateTerminationClean
        )
      ) %>%
      select(-c(dateRegistrationClean, dateTerminationClean)) %>%
      mutate(
        isActiveEntityFARA = is.na(dateTermination),
        countDaysActiveEntityFARA = case_when(
          isActiveEntityFARA ~ (Sys.Date() - dateRegistration) %>% as.integer(),
          TRUE ~ (dateTermination - dateRegistration) %>% as.integer()
        ),
        # street [+ street 2] [+ city[, state]] [+ zip]
        addressStreet = join_optional(addressStreet1Registrant,
                                      addressStreet2Registrant, " "),
        cityState = join_optional(cityRegistrant, stateRegistrant, ", "),
        addressCity = join_optional(addressStreet, cityState, " "),
        locationRegistrant = join_optional(addressCity, zipcodeRegistrant, " ")
      ) %>%
      select(-c(addressCity, cityState, addressStreet)) %>%
      select(
        idFARA,
        isActiveEntityFARA,
        countDaysActiveEntityFARA,
        dateRegistration,
        dateTermination,
        nameRegistrant,
        nameRegistrantDetail,
        nameRegistrantEntity,
        locationRegistrant,
        everything()
      )

    if (only_active) {
      data <-
        data %>%
        filter(isActiveEntityFARA)
    }

    data
  }
#' FARA Foreign Principals
#'
#' Acquires all FARA foreign principals. This includes
#' information about the foreign principal, their FARA agent and
#' period of registration.
#'
#' @param method method of bulk download; any value other than
#'   \code{"xml"} (case-insensitive) uses the csv file \itemize{
#' \item xml - default and returns all historic matches
#' \item csv - returns only a portion of foreign principals
#' }
#' @param only_active if \code{TRUE} returns only active foreign principals
#' @param return_message if \code{TRUE} returns a message
#' @return \code{tibble} of foreign principals with the added columns
#'   \code{isActiveForeignPrincipal} (no termination date),
#'   \code{countDaysActiveForeignPrincipalFARA} and
#'   \code{locationForeignPrincipal}
#' @export
#'
#' @examples
#' \dontrun{
#' fara_principals(method = "xml")
#' }
fara_principals <-
  function(method = "xml",
           only_active = FALSE,
           return_message = TRUE) {
    # `head` followed by `tail` when both are present, `head` alone when
    # `tail` is missing, NA when `head` is missing.
    join_optional <- function(head, tail, sep) {
      case_when(
        !is.na(head) & is.na(tail) ~ head,
        !is.na(head) & !is.na(tail) ~ str_c(head, tail, sep = sep),
        TRUE ~ NA_character_
      )
    }

    if (str_to_lower(method) == "xml") {
      data <-
        .parse_fara_xml_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_ForeignPrincipals.xml.zip",
                          return_message = return_message)
    } else {
      data <-
        .parse_fara_csv_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_ForeignPrincipals.csv.zip",
                          return_message = return_message)
    }

    data <-
      data %>%
      # Where the registration date is later than the termination date the
      # two dates are swapped.
      mutate(
        dateRegistrationClean = dateRegistrationForeignPrincipal,
        dateTerminationClean = dateTerminationForeignPrincipal,
        dateRegistrationForeignPrincipal = case_when(
          is.na(dateRegistrationForeignPrincipal) ~ dateRegistrationForeignPrincipal,
          dateRegistrationClean > dateTerminationClean ~ dateTerminationClean,
          TRUE ~ dateRegistrationClean
        ),
        dateTerminationForeignPrincipal = case_when(
          is.na(dateTerminationForeignPrincipal) ~ dateTerminationForeignPrincipal,
          dateTerminationClean < dateRegistrationClean ~ dateRegistrationClean,
          TRUE ~ dateTerminationClean
        )
      ) %>%
      select(-c(dateRegistrationClean, dateTerminationClean)) %>%
      mutate(
        isActiveForeignPrincipal = is.na(dateTerminationForeignPrincipal),
        countDaysActiveForeignPrincipalFARA = case_when(
          # Active rows have no termination date, so their age is counted
          # from the registration date up to today.
          isActiveForeignPrincipal ~
            (Sys.Date() - dateRegistrationForeignPrincipal) %>% as.integer(),
          TRUE ~ (
            dateTerminationForeignPrincipal - dateRegistrationForeignPrincipal
          ) %>% as.integer()
        )
      )

    address_columns <- data %>% select(matches("address")) %>% names()

    data <-
      data %>%
      mutate_if(is.character, str_trim) %>%
      mutate_if(is.character, function(x) {
        if_else(x == "", NA_character_, x)
      }) %>%
      # Hyphens are stripped from every address column.
      mutate_at(address_columns, function(x) {
        x %>% str_remove_all("-") %>% str_trim()
      }) %>%
      mutate(
        # street [+ street 2] [+ city[, state]] [+ zip]
        addressStreet = join_optional(addressStreet1Registrant,
                                      addressStreet2Registrant, " "),
        cityState = join_optional(cityRegistrant, stateRegistrant, ", "),
        addressCity = join_optional(addressStreet, cityState, " "),
        locationRegistrant = join_optional(addressCity, zipcodeRegistrant, " ")
      ) %>%
      select(-c(addressCity, cityState, addressStreet))

    # Every "...Registrant..." column except the four listed here is renamed
    # to "...ForeignPrincipal...".
    registrant_columns <-
      data %>%
      select(-one_of(
        c(
          "dateRegistrant",
          "nameRegistrant",
          "nameRegistrantDetail",
          "typeRegistrant"
        )
      )) %>%
      select(matches("Registrant")) %>%
      names()

    to_rename <- names(data) %in% registrant_columns
    names(data)[to_rename] <-
      names(data)[to_rename] %>%
      str_replace_all("Registrant", "ForeignPrincipal")

    data <-
      data %>%
      select(
        typeFile,
        isActiveForeignPrincipal,
        dateRegistrationForeignPrincipal,
        countryForeignPrincipal,
        nameForeignPrincipal,
        nameRegistrant,
        countDaysActiveForeignPrincipalFARA,
        locationForeignPrincipal,
        everything()
      )

    if (only_active) {
      data <-
        data %>%
        filter(isActiveForeignPrincipal)
    }

    data
  }
#' FARA filed documents
#'
#' Acquires data for all available FARA documents.
#' This includes information about the foreign principal,
#' registrant and FARA document type.
#'
#' @param method method of bulk download; any value other than
#'   \code{"xml"} (case-insensitive) uses the csv file \itemize{
#' \item xml - default and returns all historic matches
#' \item csv - returns only a portion of registrants' documents
#' }
#' @param only_active if \code{TRUE} returns only documents available online
#' @param snake_names if \code{TRUE} passes the result through
#'   \code{clean_names()} before returning it
#' @param return_message if \code{TRUE} returns a message
#'
#' @return a \code{tibble} of documents, newest first, with the added column
#'   \code{isDocumentAvailableOnline}; \code{urlDocumentFARA} is \code{NA}
#'   for documents that are not available online
#' @export
#'
#' @examples
#' \dontrun{
#' fara_documents(method = "xml")
#' }
fara_documents <-
  function(method = "xml",
           only_active = FALSE,
           snake_names = TRUE,
           return_message = TRUE) {
    # typeDocument value -> the cased slug that replaces its lower-case form
    # inside the document url.
    url_slugs <- c(
      "AMENDMENT" = "Amendment",
      "SHORT-FORM" = "Short-Form",
      "EXHIBIT AB" = "Exhibit-AB",
      "SUPPLEMENTAL STATEMENT" = "Supplemental-Statement",
      "EXHIBIT C" = "Exhibit-C",
      "REGISTRATION STATEMENT" = "Registration-Statement",
      "INFORMATIONAL MATERIALS" = "Informational-Materials",
      "EXHIBIT D" = "Exhibit-D",
      "CONFLICT PROVISION" = "Conflict-Provision",
      "DISSEMINATION REPORT" = "Dissemination-Report"
    )

    # Rewrites "<head><slug><tail>.pdf" as "<head><Slug><TAIL>.pdf" for the
    # document types in `url_slugs`. A url is returned untouched when its
    # type is not listed or the slug does not occur in it, so no url is ever
    # replaced by NA or gets the text "NA" pasted onto it.
    restore_url_case <- function(url, type) {
      for (type_name in names(url_slugs)) {
        cased <- url_slugs[[type_name]]
        lower <- tolower(cased)

        rows <- which(!is.na(type) & type == type_name & !is.na(url))
        if (length(rows) == 0) {
          next
        }

        start <- as.integer(regexpr(lower, url[rows], fixed = TRUE))
        found <- start > 0
        rows <- rows[found]
        start <- start[found]
        if (length(rows) == 0) {
          next
        }

        head <- substr(url[rows], 1, start - 1)
        tail <- substr(url[rows], start + nchar(lower), nchar(url[rows]))
        tail <- tail %>% str_remove_all("\\.pdf") %>% str_to_upper()
        url[rows] <- str_c(head, cased, tail, ".pdf")
      }
      url
    }

    if (str_to_lower(method) == "xml") {
      data <-
        .parse_fara_xml_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_RegistrantDocs.xml.zip",
                          return_message = return_message)
    } else {
      data <-
        .parse_fara_csv_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_RegistrantDocs.csv.zip",
                          return_message = return_message)
    }

    data <-
      data %>%
      # A missing url counts as "not online" so the row survives the split
      # into online and offline documents below.
      mutate(
        isDocumentAvailableOnline =
          !is.na(urlDocumentFARA) &
          urlDocumentFARA != "available-fara-public-office"
      ) %>%
      select(typeFile, isDocumentAvailableOnline, everything())

    data <-
      data %>%
      filter(isDocumentAvailableOnline) %>%
      mutate(urlDocumentFARA = restore_url_case(urlDocumentFARA, typeDocument)) %>%
      bind_rows(data %>% filter(!isDocumentAvailableOnline)) %>%
      arrange(dateDocumentFARA) %>%
      mutate(
        urlDocumentFARA = case_when(
          urlDocumentFARA == "available-fara-public-office" ~ NA_character_,
          TRUE ~ urlDocumentFARA
        )
      )

    data <-
      data %>%
      mutate(
        nameShortForm = nameShortForm %>% str_remove_all(", RT|\\, DIRECTOR|\\:") %>%
          str_replace_all("\\, JR", "\\ JR") %>%
          str_replace_all("\\, SR", "\\ SR") %>%
          str_replace_all(", III", "\\ III")
      )

    # One row per distinct short-form name with its cleaned version.
    df_names <-
      data %>%
      filter(!is.na(nameShortForm)) %>%
      distinct(nameShortForm) %>%
      mutate(nameShortFormClean = rm_bracket(nameShortForm))

    # Both halves of the split test the same column, so every name lands in
    # exactly one of them and the join below cannot duplicate rows.
    has_comma <- df_names$nameShortFormClean %>% str_detect("\\,")
    has_comma[is.na(has_comma)] <- FALSE

    # "LAST, FIRST" becomes "FIRST LAST"; names without a comma stay as is.
    # Only the two name parts are trimmed: `nameShortForm` is the join key and
    # must stay identical to the value in `data`.
    df_names <-
      df_names[has_comma, ] %>%
      separate(
        nameShortFormClean,
        into = c("nameLast", "nameFirst"),
        sep = "\\,",
        extra = "merge"
      ) %>%
      mutate_at(c("nameLast", "nameFirst"), str_trim) %>%
      unite(nameShortFormClean, nameFirst, nameLast, sep = " ") %>%
      bind_rows(df_names[!has_comma, ])

    data <-
      data %>%
      left_join(df_names, by = "nameShortForm") %>%
      select(-nameShortForm) %>%
      rename(nameShortForm = nameShortFormClean) %>%
      # Put the replaced column back at its original position.
      select(one_of(names(data)), everything()) %>%
      select(
        typeFile:dateDocumentFARA,
        typeDocument,
        idFARA,
        nameRegistrant,
        nameRegistrantDetail,
        nameShortForm,
        countryForeignPrincipal,
        nameForeignPrincipal,
        nameForeignPrincipalDetail,
        typeForeignPrincipal,
        everything()
      ) %>%
      arrange(desc(dateDocumentFARA))

    if (only_active) {
      data <-
        data %>%
        filter(isDocumentAvailableOnline)
    }

    if (snake_names) {
      data <-
        data %>%
        clean_names()
    }

    data
  }
#' FARA short-form data
#'
#' Acquires all FARA short form filer data.
#' This includes information about FARA registrants and
#' their employees.
#'
#' @param method method of bulk download; any value other than
#'   \code{"xml"} (case-insensitive) uses the csv file \itemize{
#' \item xml
#' \item csv
#' }
#' @param only_active if \code{TRUE} returns only short forms without a
#'   termination date
#' @param return_message if \code{TRUE} returns a message
#'
#' @return \code{tibble} of short forms with the added columns
#'   \code{isActiveEntityFARA} (no short-form termination date),
#'   \code{countDaysActiveEntityFARA}, \code{locationRegistrant} and
#'   \code{nameShortForm} (first and last name joined by a space)
#'
#' @export
#'
#' @examples
#' \dontrun{
#' fara_short_forms()
#' }
fara_short_forms <-
  function(method = "xml",
           only_active = FALSE,
           return_message = TRUE) {
    # `head` followed by `tail` when both are present, `head` alone when
    # `tail` is missing, NA when `head` is missing.
    join_optional <- function(head, tail, sep) {
      case_when(
        !is.na(head) & is.na(tail) ~ head,
        !is.na(head) & !is.na(tail) ~ str_c(head, tail, sep = sep),
        TRUE ~ NA_character_
      )
    }

    if (str_to_lower(method) == "xml") {
      data <-
        .parse_fara_xml_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_ShortForms.xml.zip",
                          return_message = return_message)
    } else {
      data <-
        .parse_fara_csv_m(url = "https://efile.fara.gov/bulk/zip/FARA_All_ShortForms.csv.zip",
                          return_message = return_message)
    }

    address_columns <- data %>% select(matches("address")) %>% names()

    data <-
      data %>%
      mutate_if(is.character, str_trim) %>%
      mutate_if(is.character, function(x) {
        if_else(x == "", NA_character_, x)
      }) %>%
      # Hyphens are stripped from every address column.
      mutate_at(address_columns, function(x) {
        x %>% str_remove_all("-") %>% str_trim()
      }) %>%
      mutate(
        isActiveEntityFARA = is.na(dateShortFormTermination),
        # NOTE(review): the day count starts at dateRegistration rather than
        # dateShortForm - confirm this is the intended start date.
        countDaysActiveEntityFARA = case_when(
          isActiveEntityFARA ~ (Sys.Date() - dateRegistration) %>% as.integer(),
          TRUE ~ (dateShortFormTermination - dateRegistration) %>% as.integer()
        ),
        # street [+ street 2] [+ city[, state]] [+ zip]
        addressStreet = join_optional(addressStreet1Registrant,
                                      addressStreet2Registrant, " "),
        cityState = join_optional(cityRegistrant, stateRegistrant, ", "),
        addressCity = join_optional(addressStreet, cityState, " "),
        locationRegistrant = join_optional(addressCity, zipcodeRegistrant, " ")
      ) %>%
      select(-c(addressCity, cityState, addressStreet)) %>%
      unite(
        nameShortForm,
        nameFirstShortForm,
        nameLastShortForm,
        sep = " ",
        remove = FALSE
      ) %>%
      select(
        idFARA,
        isActiveEntityFARA,
        countDaysActiveEntityFARA,
        nameShortForm,
        nameRegistrant,
        matches("date"),
        locationRegistrant,
        everything()
      )

    # `only_active` was accepted but never applied; filter as the sibling
    # functions in this file do.
    if (only_active) {
      data <-
        data %>%
        filter(isActiveEntityFARA)
    }

    data
  }
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.