Nothing
#' @title derivativeTransactionsScrape
#'
#' @description The function takes the dataframe returned by
#' \code{\link{secUrlDownload}} and scrapes and compiles a dataframe of all
#' the derivative transactions that meet the keyword criteria. If no criteria
#' are set, all transactions from the dataframe returned by
#' \code{\link{secUrlDownload}} are scraped. Note that the logic for the
#' keywords is "OR." Thus if a user sets multiple footnoteKeywords and a
#' keyword for issuerKeywords, if a transaction has only one of the
#' footnoteKeywords and none of the issuerKeywords, the transaction will
#' still be scraped.
#'
#' @details Each filing is downloaded to a temporary file that is removed when
#' the function exits. If a download fails, the function waits 15 minutes and
#' retries once; a filing that still cannot be downloaded, or that contains no
#' document of the requested form type, is skipped with a warning.
#'
#' @param index A local object that was returned by utilizing the
#' \code{\link{secUrlDownload}}. It contains the URLs necessary to grab each
#' text version of the filed Form 4 or 5. The URLs are read from its third
#' column.
#' @param form Define if the form is a Form 4 or Form 5. Must match the type
#' used in \code{\link{secUrlDownload}}.
#' @param name Specify your name. This is required by the SEC.
#' @param email Specify your email. This is required by the SEC.
#' @param transactionType The character vector containing any specified
#' transaction codes. Values must be capitalized to match against SEC values.
#' Detailed information about the transaction type can be found here:
#' \url{https://www.sec.gov/files/forms-3-4-5.pdf}. Default value is NA.
#' @param footnoteKeywords The character vector containing any specified key
#' words to be searched in the form's footnotes. Default is NA.
#' @param issuerKeywords The character vector containing any specified key
#' words to be searched within the issuer block. Criteria can include the
#' firm's CIK or name. Values must be capitalized to match against SEC values.
#' Default value is NA.
#' @param issuerTradingSymbol The character vector containing any specified
#' stock tickers. Values must be capitalized to match against SEC values.
#' Default value is NA.
#' @param rptOwnerKeywords The character vector containing any specified key
#' words to be searched within the rptOwner block. Values must be capitalized
#' to match against SEC values. Additionally, the format is LAST NAME, FIRST
#' NAME. The criteria can include the individual's CIK, address, or name.
#' Default value is NA.
#' @return A dataframe containing the scraped information where
#' each row represents a derivative transaction. The transaction observations
#' can be grouped by form through the URL variable within the dataframe. For
#' filings with several reporting owners and several transactions the URL is
#' stored in the manyPeopleManyTransactions column instead and the reporting
#' owner columns are left empty to be coded by hand.
#' @examples
#' \donttest{
#' dat <- derivativeTransactionsScrape(index = tempIndex, form = 4,
#' name = "Your Name", email = "yourEmail@@yourEmail.com",
#' issuerKeywords = c("AMAZON", "MICROSOFT"))
#' }
#' @importFrom magrittr "%>%"
#' @importFrom utils "read.table"
#' @export
derivativeTransactionsScrape <- function(index, form, name, email, transactionType = NA, footnoteKeywords = NA, issuerKeywords = NA, issuerTradingSymbol = NA, rptOwnerKeywords = NA) {
  # ---- constants --------------------------------------------------------
  # Output columns: 1-12 filing/issuer/owner info, 13-28 transaction info,
  # 29-58 footnotes, 59 URL, 60 URL for many-owner/many-transaction filings,
  # 61 free-text note.
  columnNames <- c(
    "periodOfReport", "issuerCik", "issuerName", "rptOwnerCik",
    "rptOwnerName", "rptOwnerState", "rptOwnerZipCode", "isDirector",
    "isOfficer", "isTenPercentOwner", "isOther", "officerTitle",
    "securityTitle", "transactionDate", "transactionFormType",
    "equitySwapInvolved", "transactionShares", "transactionPricePerShare",
    "underlyingSecurityShares", "underlyingSecurityTitle", "expirationDate",
    "exerciseDate", "conversionOrExercisePrice",
    "transactionAcquiredDisposedCode", "sharesOwnedFollowingTransaction",
    "directOrIndirectOwnership", "transactionCode", "natureOfOwnership",
    paste0("footnote", 1:30),
    "URL", "manyPeopleManyTransactions", "Notes"
  )
  footnoteColumns <- 29:58
  urlColumn <- 59
  manyUrlColumn <- 60
  notesColumn <- 61
  transactionPattern <- "<derivativeTransaction>.*?(</derivativeTransaction>)"
  ownerPattern <- "<reportingOwner>.*?(</reportingOwner>)"
  # Relies on the whitespace collapsing done while cleaning the filing.
  transactionMarker <- "derivativeTransaction> <securityTitle>"

  # ---- core helpers -----------------------------------------------------
  # For every column index in `columns`, take the first <name>...</name>
  # element found in `filing`, strip the tags and store the content in
  # dat[counter, column]. Elements that are absent yield NA.
  transactionScrape <- function(filing, counter, dat, columns) {
    for (columnNumber in columns) {
      openTag <- paste0("<", columnNames[columnNumber], ">")
      closeTag <- paste0("</", columnNames[columnNumber], ">")
      element <- stringr::str_extract(filing, pattern = paste0(openTag, ".*?(", closeTag, ")"))
      dat[counter, columnNumber] <- gsub(closeTag, "", gsub(openTag, "", element))
    }
    dat
  }

  # For footnote ids F1..F30: when `referencingText` cites the id, copy the
  # text of that footnote (first match in `footnoteSource`) into the
  # corresponding footnote column of row `counter`.
  footnoteFill <- function(dat, counter, referencingText, footnoteSource) {
    for (footnoteNumber in seq_along(footnoteColumns)) {
      citation <- paste0('footnoteId id="F', footnoteNumber, '"')
      if (stringr::str_count(referencingText, citation) > 0) {
        openTag <- paste0('<footnote id="F', footnoteNumber, '">')
        element <- stringr::str_extract(footnoteSource, pattern = paste0(openTag, ".*?(</footnote>)"))
        dat[counter, footnoteColumns[footnoteNumber]] <- gsub("</footnote>", "", gsub(openTag, "", element))
      }
    }
    dat
  }

  # Footnotes cited by the first derivative transaction still present in
  # `filing`, looked up in the filing's <footnotes> block.
  footnoteScrape <- function(filing, counter, dat) {
    transactionSection <- stringr::str_extract(filing, pattern = transactionPattern)
    footnoteSection <- stringr::str_extract(filing, pattern = "<footnotes>.*?(</footnotes>)")
    footnoteFill(dat, counter, transactionSection, footnoteSection)
  }

  # TRUE when every keyword argument contains an NA, i.e. the user supplied
  # no criteria and every transaction should be scraped. Loop-invariant.
  noCriteria <- any(is.na(footnoteKeywords)) && any(is.na(issuerKeywords)) &&
    any(is.na(issuerTradingSymbol)) && any(is.na(rptOwnerKeywords)) &&
    any(is.na(transactionType))

  # Transaction-level filter (second filter). The package filter is always
  # evaluated first, exactly as before, and then combined with noCriteria.
  meetsCriteria <- function(filing) {
    keyTransactionCount <- transactionFilterDerivative(filing, footnoteKeywords, issuerKeywords, issuerTradingSymbol, rptOwnerKeywords, transactionType)
    keyTransactionCount > 0 || noCriteria
  }

  # The SEC requires a User-Agent identifying the requester.
  requestHeaders <- c("User-Agent" = paste(name, email, options("HTTPUserAgent"), sep = " "))

  # Scratch file for the downloaded filing. A temporary file is used so the
  # user's working directory is never written to; it is removed on exit.
  prefiling <- tempfile(pattern = "prefiling", fileext = ".txt")
  on.exit(unlink(prefiling), add = TRUE)

  fetchFiling <- function(url) {
    try(utils::download.file(url, prefiling, method = "auto", quiet = TRUE, cacheOK = FALSE,
                             headers = requestHeaders))
  }

  # ---- set-up -----------------------------------------------------------
  form <- as.character(form)
  # Preallocate the result: it is cheaper to fill a matrix-shaped data frame
  # than to append. Six transactions per filing is the expected maximum when
  # no keywords are given; rows beyond that are added on assignment.
  dat <- as.data.frame(matrix(NA, ncol = length(columnNames), nrow = nrow(index) * 6))
  colnames(dat) <- columnNames

  counter <- 0
  statusCounter <- 0
  totalRecords <- nrow(index) # minimize computation

  # ---- main loop: one iteration per filing URL ---------------------------
  for (i in index[, 3]) {
    filing_error <- fetchFiling(i)
    # A failure is most likely the SEC blocking the IP address: wait 15
    # minutes and retry once.
    if (inherits(filing_error, "try-error")) {
      Sys.sleep(900)
      filing_error <- fetchFiling(i)
    }
    # Track progress through the index of filings
    statusCounter <- statusCounter + 1
    if (statusCounter %% 100 == 0) {
      print(paste0(round((statusCounter / totalRecords) * 100, digits = 2), "% Complete"))
    }
    # Throttle requests to the SEC server
    Sys.sleep(.05)
    # Never parse the scratch file after a failed download: it would hold the
    # previous filing (or not exist at all).
    if (inherits(filing_error, "try-error")) {
      warning("Skipping filing that could not be downloaded: ", i, call. = FALSE)
      next
    }

    # Collapse the document to one string, keep the document of the requested
    # form type, and strip headers, HTML tables, entities and the
    # non-derivative / holding sections so only derivative transactions
    # (plus issuer, owner and footnote blocks) remain.
    filing <- readLines(prefiling) %>%
      stringr::str_c(collapse = " ") %>%
      stringr::str_extract(pattern = paste0("(?s)(?m)<TYPE>", form, ".*?(</TEXT>)")) %>%
      stringr::str_replace(pattern = "((?i)<TYPE>).*?(?=<)", replacement = "") %>%
      stringr::str_replace(pattern = "((?i)<SEQUENCE>).*?(?=<)", replacement = "") %>%
      stringr::str_replace(pattern = "((?i)<FILENAME>).*?(?=<)", replacement = "") %>%
      stringr::str_replace(pattern = "((?i)<DESCRIPTION>).*?(?=<)", replacement = "") %>%
      stringr::str_replace(pattern = "(?s)(?i)<head>.*?</head>", replacement = "") %>%
      stringr::str_replace(pattern = "(?s)(?i)<(table).*?(</table>)", replacement = "") %>%
      stringr::str_replace(pattern = "(?s)(?i)<(nonDerivativeTable).*?(</nonDerivativeTable>)", replacement = "") %>%
      stringr::str_replace_all(pattern = "&(.{2,6});", replacement = " ") %>%
      stringr::str_replace_all(pattern = "(?s) +", replacement = " ") %>%
      stringr::str_replace_all(pattern = "<value>", replacement = "") %>%
      stringr::str_replace_all(pattern = "</value>", replacement = "") %>%
      stringr::str_replace_all(pattern = "<nonDerivativeHolding>.*?(</nonDerivativeHolding>)", replacement = "") %>%
      stringr::str_replace_all(pattern = "<derivativeHolding>.*?(</derivativeHolding>)", replacement = "")

    # No document of the requested form type: the counts below would be NA
    # and abort the whole run, so skip this filing instead.
    if (length(filing) != 1 || is.na(filing)) {
      warning("Skipping filing without a <TYPE>", form, " document: ", i, call. = FALSE)
      next
    }

    # First filter (form level): only filings with at least one keyword hit,
    # or every filing when no criteria were supplied, are examined further.
    keyCount <- formFilterDerivativeTransactions(filing, footnoteKeywords, issuerKeywords, issuerTradingSymbol, rptOwnerKeywords, transactionType)
    if (keyCount > 0 || noCriteria) {
      # Number of derivative transactions and reporting owners decides which
      # of the four extraction strategies applies.
      filingCount <- stringr::str_count(filing, transactionMarker)
      peopleCount <- stringr::str_count(filing, "<reportingOwner>")

      if (filingCount == 1 && peopleCount == 1) {
        # One owner, one transaction. Columns 60 and 61 stay NA.
        if (meetsCriteria(filing)) {
          counter <- counter + 1
          dat <- transactionScrape(filing, counter, dat, 1:28)
          dat <- footnoteScrape(filing, counter, dat)
          dat[counter, urlColumn] <- i
        }
      } else if (filingCount > 1 && peopleCount == 1) {
        # One owner, several transactions. Each handled transaction is deleted
        # from the text so the next one becomes the first match.
        while (stringr::str_count(filing, transactionMarker) >= 1) {
          if (meetsCriteria(filing)) {
            counter <- counter + 1
            dat <- transactionScrape(filing, counter, dat, 1:28)
            dat <- footnoteScrape(filing, counter, dat)
            dat[counter, urlColumn] <- i
          }
          filing <- stringr::str_replace(filing, pattern = transactionPattern, replacement = "")
        }
      } else if (filingCount == 1 && peopleCount > 1) {
        # Several owners, one transaction: one row per owner. Each handled
        # owner block is deleted so the next owner becomes the first match.
        while (stringr::str_count(filing, "<reportingOwner>") >= 1) {
          if (meetsCriteria(filing)) {
            counter <- counter + 1
            dat <- transactionScrape(filing, counter, dat, 1:28)
            # Footnotes cited inside the current owner's block ...
            ownerSection <- stringr::str_extract(filing, pattern = ownerPattern)
            dat <- footnoteFill(dat, counter, ownerSection, filing)
            # ... and footnotes cited by the transaction itself.
            dat <- footnoteScrape(filing, counter, dat)
            dat[counter, urlColumn] <- i
            dat[counter, notesColumn] <- "The transaction values in this observation is an aggregate amount that is shared by the other observations that share the same URL"
          }
          filing <- stringr::str_replace(filing, pattern = ownerPattern, replacement = "")
        }
      } else if (filingCount > 1 && peopleCount > 1) {
        # Several owners, several transactions: transactions cannot be tied to
        # an owner automatically, so owner columns 4-12 are left empty for
        # hand coding and the URL goes to the manyPeopleManyTransactions
        # column.
        while (stringr::str_count(filing, transactionMarker) >= 1) {
          if (meetsCriteria(filing)) {
            counter <- counter + 1
            dat <- transactionScrape(filing, counter, dat, 1:3)
            dat <- transactionScrape(filing, counter, dat, 13:28)
            dat <- footnoteScrape(filing, counter, dat)
            dat[counter, manyUrlColumn] <- i
            dat[counter, notesColumn] <- "This transaction may not be a valid to key word conditions based upon the structure of many reporting owners. This transaction must be checked by hand."
          }
          filing <- stringr::str_replace(filing, pattern = transactionPattern, replacement = "")
        }
      }
    }
  }

  # Drop the preallocated rows that were never filled. delete.na() is defined
  # elsewhere in the package; presumably it keeps rows with at most the given
  # number of NA cells -- confirm against its definition.
  dat2 <- delete.na(dat, ncol(dat) - 1)
  # remove footnote citations within columns with variables
  dat2[, 4:28] <- lapply(dat2[, 4:28], function(x) {
    as.character(gsub('<footnoteId id="F([1-9]|[1-9][0-9])"/>', "", x))
  })
  dat2
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.