# R/getSentiment.R
# In edgar: Tool for the U.S. SEC EDGAR Retrieval and Parsing of Corporate Filings

# Documented in getSentiment

#' Provides sentiment measures of EDGAR filings
#'
#' \code{getSentiment} computes sentiment measures of EDGAR filings
#'
#' getSentiment function takes CIK(s), form type(s), and year(s) as input parameters.  
#' The function first imports available downloaded filings in the local working directory 
#' 'edgar_Filings' created by \link[edgar]{getFilings} function; otherwise, 
#' it automatically downloads the filings which have not already been downloaded.
#' It then reads, cleans, and computes sentiment measures for these filings. 
#' The function returns a dataframe with filing information and sentiment measures.
#' User must follow the US SEC's fair access policy, i.e. download only what you 
#' need and limit your request rates, see www.sec.gov/os/accessing-edgar-data.
#' 
#' @usage getSentiment(cik.no, form.type, filing.year, useragent = "")
#' 
#' @param cik.no vector of CIK number of firms in integer format. Suppress leading 
#' zeroes from CIKs. Keep cik.no = 'ALL' if needs to download for all CIKs.
#'  
#' @param form.type character vector containing form type to be downloaded. 
#' form.type = 'ALL' if need to download all forms.  
#'
#' @param filing.year vector of four digit numeric year
#' 
#' @param useragent Should be in the form of "YourName Contact@domain.com"
#'  
#' @return Function returns dataframe containing CIK number, company name, 
#' date of filing, accession number, and various sentiment measures. 
#' This function takes the help of Loughran-McDonald (L&M) sentiment 
#' dictionaries (https://sraf.nd.edu/loughranmcdonald-master-dictionary/) to 
#' compute sentiment measures of an EDGAR filing. Following are the 
#' definitions of the text characteristics and the sentiment measures:
#' 
#' file.size = The filing size of a complete filing on the EDGAR server in 
#' kilobyte (KB).
#' 
#' word.count = The total number of words in a filing text, excluding HTML 
#' tags and exhibits text.
#' 
#' unique.word.count = The total number of unique words in a filing text, 
#' excluding HTML tags and exhibits text.
#' 
#' stopword.count = The total number of stop words in a filing text, 
#' excluding exhibits text.
#' 
#' char.count = The total number of characters in a filing text, excluding 
#' HTML tags and exhibits text.
#' 
#' complex.word.count = The total number of complex words in the filing text. 
#' A word is identified as a complex word when vowels (a, e, i, o, u, y) 
#' occur three or more times in it.
#' 
#' lm.dictionary.count = The number of words in the filing text that occur 
#' in the Loughran-McDonald (LM) master dictionary.
#' 
#' lm.negative.count = The number of LM financial-negative words in the 
#' document.
#' 
#' lm.positive.count = The number of LM financial-positive words in the 
#' document.
#' 
#' lm.strong.modal.count = The number of LM financial-strong modal words 
#' in the document.
#' 
#' lm.moderate.modal.count = The number of LM financial-moderate Modal 
#' words in the document.
#' 
#' lm.weak.modal.count = The number of LM financial-weak modal words in 
#' the document.
#' 
#' lm.uncertainty.count = The number of LM financial-uncertainty words 
#' in the document.
#' 
#' lm.litigious.count = The number of LM financial-litigious words in 
#' the document.
#' 
#' hv.negative.count = The number of words in the document that occur in 
#' the 'Harvard General Inquirer' Negative word list, as defined by LM. 
#' 
#' @examples
#' \dontrun{
#' 
#' senti.df <- getSentiment(cik.no = c('1000180', '38079'), 
#'                          form.type = '10-K', filing.year = 2006, useragent) 
#'                          
#' ## Returns dataframe with sentiment measures of firms with CIKs 
#' ## 1000180 and 38079 filed in year 2006 for form type '10-K'.
#' 
#' senti.df <- getSentiment(cik.no = '38079', form.type = c('10-K', '10-Q'), 
#'                          filing.year = c(2005, 2006), useragent)
#'}

getSentiment <- function(cik.no, form.type, filing.year, useragent = "") {

    # Filing index for the requested CIK(s), form type(s) and year(s), all
    # four quarters. A NULL result means there is nothing to analyse.
    output <- getFilings(cik.no, form.type, filing.year, quarter = c(1, 2, 3, 4),
                         downl.permit = "y", useragent)

    if (is.null(output)) {
        return()
    }

    cat("Computing sentiment measures...\n")

    # Columns appended to the filing index, in output order. They start as NA
    # and are filled in filing by filing below.
    measure.names <- c("file.size", "word.count", "unique.word.count",
                       "stopword.count", "char.count", "complex.word.count",
                       "lm.dictionary.count", "lm.negative.count",
                       "lm.positive.count", "lm.strong.modal.count",
                       "lm.moderate.modal.count", "lm.weak.modal.count",
                       "lm.uncertainty.count", "lm.litigious.count",
                       "hv.negative.count")
    n.filings <- nrow(output)
    for (measure in measure.names) {
        # rep() rather than a bare NA so a zero-row index is handled too.
        output[[measure]] <- rep(NA, n.filings)
    }

    # Load the Loughran & McDonald master dictionary shipped with the package
    # and split it into the word lists used for the sentiment counts.
    LoadLMDictionary <- function() {
        dict.path <- system.file("data/LMMasterDictionary.rda", package = "edgar")
        if (!nzchar(dict.path)) {
            stop("LMMasterDictionary.rda not found in the 'edgar' package.",
                 call. = FALSE)
        }
        load(dict.path)  # creates `LMMasterDictionary` in this function's frame

        dict.words <- as.character(LMMasterDictionary$word)

        # Words whose flag is TRUE; rows with an NA flag or NA word are dropped.
        PickWords <- function(flagged) {
            picked <- dict.words[which(flagged)]
            picked[!is.na(picked)]
        }

        list(master = unique(dict.words),
             uncertainty = PickWords(LMMasterDictionary$uncertainty != 0),
             negative = PickWords(LMMasterDictionary$negative != 0),
             positive = PickWords(LMMasterDictionary$positive != 0),
             litigious = PickWords(LMMasterDictionary$litigious != 0),
             strong.modal = PickWords(LMMasterDictionary$modal == 1),
             moderate.modal = PickWords(LMMasterDictionary$modal == 2),
             weak.modal = PickWords(LMMasterDictionary$modal == 3),
             harvard.iv = PickWords(LMMasterDictionary$harvard_iv != 0))
    }

    # Reduce the raw lines of a filing to the text of its first
    # <DOCUMENT>...</DOCUMENT> section, stripped of markup when it is HTML/XML.
    ExtractFilingText <- function(filing.text) {
        doc.start <- grep("<DOCUMENT>", filing.text, ignore.case = TRUE)[1]
        doc.end <- grep("</DOCUMENT>", filing.text, ignore.case = TRUE)[1]

        # If either tag is missing, consider the full file.
        if (!is.na(doc.start) && !is.na(doc.end)) {
            filing.text <- filing.text[doc.start:doc.end]
        }

        # Decide whether the filing is in XBRL/HTML or the old plain-text format.
        is.markup <- any(grepl("<xml>|<type>xml|<html>|10k.htm|<XBRL>",
                               filing.text, ignore.case = TRUE))
        if (!is.markup) {
            return(filing.text)
        }

        doc <- XML::htmlParse(filing.text, asText = TRUE,
                              useInternalNodes = TRUE, addFinalizer = FALSE)

        # The document is parsed without a finalizer, so it must be released
        # explicitly; otherwise every HTML filing in the loop leaks memory.
        node.text <- tryCatch(
            XML::xpathSApply(doc, "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]",
                             XML::xmlValue),
            finally = XML::free(doc)
        )

        iconv(node.text, "latin1", "ASCII", sub = " ")
    }

    # Clean the filing text and return its lower-case words of 3+ characters.
    TokenizeText <- function(f.text) {
        f.text <- gsub("\\n|\\t|,", " ", f.text)
        f.text <- paste(f.text, collapse = " ")
        f.text <- gsub("/s/", "", f.text, fixed = TRUE)
        # Drop the possessive but keep the separating space, so that
        # "company's profits" does not collapse into "companyprofits".
        f.text <- gsub("'s ", " ", f.text)
        f.text <- gsub("[[:punct:]]", "", f.text, perl = TRUE)
        f.text <- gsub("[[:digit:]]", "", f.text, perl = TRUE)
        # `sub` keeps unconvertible bytes from turning the whole text into NA.
        f.text <- iconv(f.text, from = "UTF-8", to = "ASCII//TRANSLIT", sub = " ")
        f.text <- tolower(f.text)
        f.text <- gsub("\\s{2,}", " ", f.text)

        tokens <- unlist(strsplit(f.text, " "))
        tokens[nchar(tokens) >= 3]
    }

    if (n.filings > 0L) {

        # Loop-invariant lookups, computed once.
        lm.dict <- LoadLMDictionary()
        stop.words <- tm::stopwords("en")

        progress.bar <- utils::txtProgressBar(min = 0, max = n.filings, style = 3)
        # Close the bar even if reading or parsing a filing fails.
        on.exit(close(progress.bar), add = TRUE)

        for (i in seq_len(n.filings)) {

            f.type <- gsub("/", "", output$form.type[i])

            dest.filename <- paste0("edgar_Filings/Form ", f.type,
                                    "/", output$cik[i], "/", output$cik[i], "_", f.type, "_",
                                    output$date.filed[i], "_", output$accession.number[i], ".txt")

            # Read filing
            filing.text <- readLines(dest.filename)

            words <- TokenizeText(ExtractFilingText(filing.text))

            # Number of filing words that belong to a given word list.
            CountIn <- function(vocabulary) {
                sum(words %in% vocabulary)
            }

            # A word is "complex" when it has three or more vowels (y included).
            vowel.counts <- lengths(regmatches(words,
                                               gregexpr("[aeiouy]", words,
                                                        ignore.case = TRUE)))

            measures <- list(
                # File size in kilobytes (KB) of the complete filing on disk.
                file.size = round(file.info(dest.filename)$size / 1024, 0),
                word.count = length(words),
                unique.word.count = length(unique(trimws(words, "both"))),
                stopword.count = CountIn(stop.words),
                char.count = sum(nchar(words)),
                complex.word.count = sum(vowel.counts >= 3),
                # Loughran-McDonald and Harvard-IV word counts.
                lm.dictionary.count = CountIn(lm.dict$master),
                lm.negative.count = CountIn(lm.dict$negative),
                lm.positive.count = CountIn(lm.dict$positive),
                lm.strong.modal.count = CountIn(lm.dict$strong.modal),
                lm.moderate.modal.count = CountIn(lm.dict$moderate.modal),
                lm.weak.modal.count = CountIn(lm.dict$weak.modal),
                lm.uncertainty.count = CountIn(lm.dict$uncertainty),
                lm.litigious.count = CountIn(lm.dict$litigious),
                hv.negative.count = CountIn(lm.dict$harvard.iv)
            )

            for (measure in measure.names) {
                output[[measure]][i] <- measures[[measure]]
            }

            # update progress bar
            utils::setTxtProgressBar(progress.bar, i)
        }
    }

    # Drop bookkeeping columns that are not part of the returned data frame.
    output$status <- NULL
    output$quarter <- NULL
    output$filing.year <- NULL

    ## convert dates into R dates
    output$date.filed <- as.Date(as.character(output$date.filed), "%Y-%m-%d")

    return(output)
}

# `LMMasterDictionary` is created by load() inside getSentiment(), so it is
# declared here to keep R CMD check from flagging it as an undefined global.
# The call is namespace-qualified so it does not depend on a NAMESPACE import.
utils::globalVariables("LMMasterDictionary")