R/utils.R

Defines functions process_file_func url_file_func line_separator_func

line_separator_func <- function(CR, LF, text)
{
  if (!is.na(CR)) {
    if (substr(text, CR + 1, CR + 1) == "\n") {
      line_separator <- "\r\n"
    } else {
      line_separator <- "\r"
    }
  } else {
    if (!is.na(LF)) {
      line_separator <- "\n"
    } else {
      stop("Undefined line separator")
    }
  }
}

url_file_func <- function(YEAR_,
                          QUARTER_)
{
  #Validating inputs to the function
  YEAR_ <- as.integer(YEAR_)
  QUARTER_ <- as.integer(QUARTER_)

  if (is.na(YEAR_) |
      is.na(QUARTER_))
    stop(
      "Error: Please supply integer values for YEAR_ and QUARTER_ starting in 2004 Q1. Example: SEC_13F_list(2004, 1)"
    )

  if (YEAR_ < 2004)
    stop(
      "Error: SEC_13F_list function only works with SEC list files starting at Q1 2004. Example: SEC_13F_list(2004, 1)"
    )
  if (QUARTER_ > 4 | QUARTER_ < 1)
    stop("Error: Please, supply integer number for QUARTER_ in range between 1 and 4")

  if (YEAR_ == 2004 & QUARTER_ == 1)
  {
    file_name <- "13f-list.pdf"
    url_file <-
      paste0("https://www.sec.gov/divisions/investment/", file_name)
  }
  else if (YEAR_ < 2021 | (YEAR_ == 2021 & QUARTER_ <= 1))
  {
    file_name <- paste0('13flist', YEAR_, 'q', QUARTER_, '.pdf')
    url_file <-
      paste0("https://www.sec.gov/divisions/investment/13f/",
             file_name)
  } else {
    file_name <- paste0('13flist', YEAR_, 'q', QUARTER_, '.pdf')
    url_file <-
      paste0("https://www.sec.gov/files/investment/",
             file_name)
  }
  return(url_file)
}

process_file_func <- function(text, YEAR_, QUARTER_) {

  CUSIP_start <- NULL
  ISSUER_NAME_start <- NULL
  ISSUER_DESCRIPTION_start <- NULL
  STATUS_start <- NULL
  PDF_STRING <- NULL
  CUSIP_end <- NULL
  HAS_LISTED_OPTION_start <- NULL
  HAS_LISTED_OPTION_end <- NULL
  ISSUER_NAME_end <- NULL
  ISSUER_DESCRIPTION_end <- NULL
  STATUS_end <- NULL
  CUSIP <- NULL
  HAS_LISTED_OPTION <- NULL
  ISSUER_NAME <- NULL
  ISSUER_DESCRIPTION <- NULL
  STATUS <- NULL

  pages <- length(text)

  table_start <- match(1, regexpr("Run Date:", text))

  text2 <- readLines(con = textConnection(text))
  table_start_2 <- match(1, regexpr("Run Date:", text2))
  pages_2 <- length(text2)
  text2 <- text2[table_start_2:pages_2] |>
    as.data.frame(stringsAsFactors = FALSE)
  names(text2) <- "PDF_STRING"

  text2$CUSIP_start <- regexpr("CUSIP", text2$PDF_STRING)
  text2$ISSUER_NAME_start <- regexpr("ISSUER NAME", text2$PDF_STRING)
  text2$ISSUER_DESCRIPTION_start  <-  regexpr("ISSUER DESCRIPTION", text2$PDF_STRING)
  text2$STATUS_start <- regexpr("STATUS", text2$PDF_STRING)

  text2$CUSIP_start <- replace(text2$CUSIP_start, which(text2$CUSIP_start == -1), NA)
  text2$ISSUER_NAME_start <- replace(text2$ISSUER_NAME_start, which(text2$ISSUER_NAME_start == -1), NA)
  text2$ISSUER_DESCRIPTION_start  <- replace(text2$ISSUER_DESCRIPTION_start, which(text2$ISSUER_DESCRIPTION_start == -1), NA)
  text2$STATUS_start <- replace(text2$STATUS_start, which(text2$STATUS_start == -1), NA)

  text2 <- subset(text2,
                  !grepl("Run Date", PDF_STRING) &
                    !grepl("Run Time", PDF_STRING) &
                    !grepl("Total C", PDF_STRING) &
                    PDF_STRING != "" &
                    !grepl("\f", PDF_STRING)
                  )

  text2 <- text2 |>
    tidyr::fill(
      CUSIP_start,
      ISSUER_NAME_start,
      ISSUER_DESCRIPTION_start,
      STATUS_start,
      .direction = "down"
    )

  text2$CUSIP_end <- text2$CUSIP_start + 11 - 1
  text2$HAS_LISTED_OPTION_start <-  text2$CUSIP_end + 1
  text2$HAS_LISTED_OPTION_end <-  text2$ISSUER_NAME_start - 1
  text2$ISSUER_NAME_end <- text2$ISSUER_DESCRIPTION_start -1
  text2$ISSUER_DESCRIPTION_end  <- text2$STATUS_start - 1
  text2$STATUS_end <- nchar(text2$PDF_STRING)
  text2$STATUS_end <- max(text2$STATUS_end)

  text2$CUSIP<- substr(text2$PDF_STRING, text2$CUSIP_start, text2$CUSIP_end)
  text2$HAS_LISTED_OPTION <- trimws(substr(text2$PDF_STRING, text2$HAS_LISTED_OPTION_start, text2$HAS_LISTED_OPTION_end), "both")
  text2$ISSUER_NAME <- trimws(substr(text2$PDF_STRING, text2$ISSUER_NAME_start, text2$ISSUER_NAME_end),
        "both")
  text2$ISSUER_DESCRIPTION <- trimws(
        substr(
          text2$PDF_STRING,
          text2$ISSUER_DESCRIPTION_start,
          text2$ISSUER_DESCRIPTION_end
        ),
        "both")
  text2$STATUS <- trimws(substr(text2$PDF_STRING, text2$STATUS_start, text2$STATUS_end), "both")
  text2$CUSIP <- gsub(" ", "", text2$CUSIP)
      #handling of edge cases for cut-offs
  # fix ISSUER_DESCRIPTION before STATUS!
  text2$ISSUER_DESCRIPTION = ifelse(
    text2$STATUS == "DDED" | text2$STATUS == "ELETED",
    trimws(substr(
      text2$ISSUER_DESCRIPTION,
      1,
      nchar(text2$ISSUER_DESCRIPTION) - 1
    ),
    "right"),
    text2$ISSUER_DESCRIPTION
  )
  #Then you can fix STATUS
  text2$STATUS = ifelse(text2$STATUS == "DDED", "ADDED", text2$STATUS)
  text2$STATUS = ifelse(text2$STATUS == "ELETED", "DELETED", text2$STATUS)

  text2$ISSUER_DESCRIPTION = ifelse(
    text2$STATUS == "D",
        paste0(text2$ISSUER_DESCRIPTION, "D"),
    text2$ISSUER_DESCRIPTION
      )

  text2$STATUS = ifelse(text2$STATUS == "D", "", text2$STATUS)

  text2$ISSUER_DESCRIPTION = ifelse(
    text2$STATUS == "D   ADDED",
        trimws(paste0(
          text2$ISSUER_DESCRIPTION, substr(text2$STATUS, 1, 4)
        ), "both"),
    text2$ISSUER_DESCRIPTION
      )
  text2$STATUS = ifelse(text2$STATUS == "D   ADDED",
                      substr(text2$STATUS, 5, 9),
                      text2$STATUS)
#2023 Q3 report
  text2$ISSUER_DESCRIPTION = ifelse(
    text2$STATUS == "RG   ADDED",
    trimws(paste0(
      text2$ISSUER_DESCRIPTION, substr(text2$STATUS, 1, 5)
    ), "both"),
    text2$ISSUER_DESCRIPTION
  )
  text2$STATUS = ifelse(text2$STATUS == "RG   ADDED",
                        substr(text2$STATUS, 6, 10),
                        text2$STATUS)
#2023 Q4 report 025072158
  text2$ISSUER_DESCRIPTION = ifelse(
    text2$STATUS == "RG",
    trimws(paste0(
      text2$ISSUER_DESCRIPTION, substr(text2$STATUS, 1, 2)
    ), "both"),
    text2$ISSUER_DESCRIPTION
  )
  text2$STATUS = ifelse(text2$STATUS == "RG",
                        "",
                        text2$STATUS)

  text2 <- subset(text2, !grepl("CUSIP", text2$CUSIP))

  text2$YEAR <- YEAR_
  text2$QUARTER <- QUARTER_
  text2 <- text2[-c(1:11)]

  return(text2)
}
yanlesin/SEC13Flist documentation built on Jan. 21, 2025, 5:23 p.m.