R/upsentiment.R

#' Run SQL query on Redshift READ
#' @description
#' Run SQL query on Redshift READ
#' @param sql
#' The SQL query to run
#' @return
#' The result of the SQL query
#' @export
#'
sql_query <- function(sql) {
  redshift <- RJDBC::JDBC( "org.postgresql.Driver", Sys.getenv("REDSHIFT_DRIVER_ADDRESS"))
  con <- DBI::dbConnect(redshift, Sys.getenv("REDSHIFT_URL"))
  
  result <- DBI::dbFetch(DBI::dbSendQuery(con, sql) )

  return(result)
}


#' Pull nugget ID and nugget content for recently published nuggets
#' @description
#' Pull nugget ID and nugget content for recently published nuggets
#' @param date1
#' Beginning of date range for recently published nuggets
#' @param date2
#' End of date range for recently published nuggets
#' @return
#' A dataframe containing the nugget_id, nugget_content (both raw and cleaned)
#' @export
#'
pull_nugget_content <- function(date1, date2) {
  que <- paste("SELECT nuggets._id AS nugget_id, public_nuggets.published_at, nuggets.content AS nugget_content
                FROM traffic
                LEFT JOIN nuggets ON traffic.host_nugget_id = nuggets._id
                LEFT JOIN public_nuggets ON public_nuggets.nugget_id = nuggets._id
                WHERE public_nuggets.published_at > '", date1, "' AND public_nuggets.published_at < '", date2, "'
                GROUP BY 1,2,3
                ORDER BY 1 
                LIMIT 5000", sep = "")

  nugget_content <- sql_query(que)
  
  nugget_content <- cbind(nugget_content, nugget_content_clean = strip_html(html_string = nugget_content$nugget_content)) 
  
  nugget_content$nugget_content_clean <- as.character(nugget_content$nugget_content_clean)
  
  return(nugget_content)
}


#' Strip HTML tags from string
#' @description
#' Strip HTML tags from string
#' @param html_string
#' Original string containing HTML tags
#' @return
#' A string with HTML tags removed
#' @export
#'
strip_html <- function(html_string) {
  html_string <- gsub("<.*?>", " ", html_string)
  html_string <- gsub("\r", " ", html_string)
  html_string <- gsub("\n", " ", html_string)
  html_string <- gsub('\"', " ", html_string)
  html_string <- gsub('—&nbsp;', " ", html_string)
  html_string <- gsub("[^[:graph:]]", " ", html_string) 
  return(html_string)
}


#' Code sentiment of nugget text via NRC
#' @description
#' Code sentiment of nugget text via NRC
#' @param nugget_content
#' A data frame containing one nugget per row, with nugget_id and nugget content (raw and cleaned)
#' @return
#' A dataframe containing one row per nugget, with nugget_id and sentiment score for each
#' @export
#' 
code_nugget_sentiment <- function(nugget_content) {
  df <- data.frame(nugget_id = NA, published_at = NA, anger_nrc = NA, anticipation_nrc = NA, disgust_nrc = NA, fear_nrc = NA, joy_nrc = NA, sadness_nrc = NA, surprise_nrc = NA, trust_nrc = NA, negative_nrc = NA, positive_nrc = NA)
  
  for (i in 1:nrow(nugget_content)) {
    thetext <- paste(nugget_content$nugget_content_clean[i], " x") # This handles errors caused by rare blank nugget texts
    text_sentiment_vector <- as.numeric(colMeans(syuzhet::get_nrc_sentiment(syuzhet::get_sentences(thetext))))
  
    df <- rbind(df, c(nugget_id = nugget_content[i,1], nugget_id = nugget_content[i,2], as.numeric(as.character(text_sentiment_vector))))
  }
  
  df <- df[-1,] 
  return(df)
}


#' Get SQL for sentiment scores
#' @description
#' Get SQL for sentiment scores
#' @param nugget_text_sentiment
#' A data frame containing one nugget per row, with all NRC sentiment scores
#' @return
#' Nothing
#' @export
#' 
get_sql_for_sentiment_scores <- function(nugget_text_sentiment) {
  values <- NULL
  
  for (i in 1:nrow(nugget_text_sentiment)) {
    values[i] <- paste("('", nugget_text_sentiment$nugget_id[i], "', '", nugget_text_sentiment$published_at[i], "', '", nugget_text_sentiment$anger_nrc[i], "', '", nugget_text_sentiment$anticipation_nrc[i], "', '", nugget_text_sentiment$disgust_nrc[i], "', '", nugget_text_sentiment$fear_nrc[i], "', '", nugget_text_sentiment$joy_nrc[i], "', '", nugget_text_sentiment$sadness_nrc[i], "', '", nugget_text_sentiment$surprise_nrc[i], "', '", nugget_text_sentiment$trust_nrc[i], "', '", nugget_text_sentiment$negative_nrc[i], "', '", nugget_text_sentiment$positive_nrc[i], "')", sep="")
    
    if (i == 1) { allvalues <- values[i] } else {
      allvalues <- paste(allvalues, values[i], sep = ", ")
    }
  }
  
  sql <- paste("INSERT INTO sentiment (nugget_id, published_at, anger_nrc, anticipation_nrc, disgust_nrc, fear_nrc, joy_nrc, sadness_nrc, surprise_nrc, trust_nrc, negative_nrc, positive_nrc) VALUES ", allvalues, " ;", sep = "")
  return (sql)
}


#' Get latest date from table
#' @description
#' Get latest date from table
#' @param table
#' Name of table to query
#' @param datefield
#' Name of date field to query
#' @return
#' Date of latest row in sentiment table
#' 
get_latest_date <- function(table = 'sentiment', datefield = 'published_at') {
  sql <- paste( "select max(", datefield, ") from ", table, " limit 1;", sep = "")
  
  latest_date <- as.character( sql_query(sql) )
  
  return (latest_date)
}


#' Analyze New Nugget Sentiment
#' @description
#' Pulls content of recently published nuggets, analyzes sentiment, and saves scores to Redshift READ
#' @return
#' Does not return anything.
#' @export
#' 
analyze_new_nugget_sentiment <- function() {
  nugget_content <- pull_nugget_content(date1 = get_latest_date(), date2 = Sys.Date())
  
  if (nrow(nugget_content) > 0) {
    nugget_text_sentiment <- code_nugget_sentiment(nugget_content)
    
    sql <- get_sql_for_sentiment_scores(nugget_text_sentiment)
    
    try(sql_query(sql), silent=TRUE) # Throws an error, but we can ignore it, hence using try().
    
    print(paste("Inserted ", nrow(nugget_content), " rows.", sep = ""))
  }
  else { print("No new nuggets to code") }
}
seanwojcik/upsentiment documentation built on May 29, 2019, 4:56 p.m.