#' Run SQL query on Redshift READ
#' @description
#' Run SQL query on Redshift READ
#' @param sql
#' The SQL query to run
#' @return
#' The result of the SQL query
#' @export
#'
sql_query <- function(sql) {
redshift <- RJDBC::JDBC( "org.postgresql.Driver", Sys.getenv("REDSHIFT_DRIVER_ADDRESS"))
con <- DBI::dbConnect(redshift, Sys.getenv("REDSHIFT_URL"))
result <- DBI::dbFetch(DBI::dbSendQuery(con, sql) )
return(result)
}
#' Pull nugget ID and nugget content for recently published nuggets
#' @description
#' Pull nugget ID and nugget content for recently published nuggets
#' @param date1
#' Beginning of date range for recently published nuggets
#' @param date2
#' End of date range for recently published nuggets
#' @return
#' A dataframe containing the nugget_id, nugget_content (both raw and cleaned)
#' @export
#'
pull_nugget_content <- function(date1, date2) {
que <- paste("SELECT nuggets._id AS nugget_id, public_nuggets.published_at, nuggets.content AS nugget_content
FROM traffic
LEFT JOIN nuggets ON traffic.host_nugget_id = nuggets._id
LEFT JOIN public_nuggets ON public_nuggets.nugget_id = nuggets._id
WHERE public_nuggets.published_at > '", date1, "' AND public_nuggets.published_at < '", date2, "'
GROUP BY 1,2,3
ORDER BY 1
LIMIT 5000", sep = "")
nugget_content <- sql_query(que)
nugget_content <- cbind(nugget_content, nugget_content_clean = strip_html(html_string = nugget_content$nugget_content))
nugget_content$nugget_content_clean <- as.character(nugget_content$nugget_content_clean)
return(nugget_content)
}
#' Strip HTML tags from string
#' @description
#' Strip HTML tags from string
#' @param html_string
#' Original string containing HTML tags
#' @return
#' A string with HTML tags removed
#' @export
#'
strip_html <- function(html_string) {
html_string <- gsub("<.*?>", " ", html_string)
html_string <- gsub("\r", " ", html_string)
html_string <- gsub("\n", " ", html_string)
html_string <- gsub('\"', " ", html_string)
html_string <- gsub('— ', " ", html_string)
html_string <- gsub("[^[:graph:]]", " ", html_string)
return(html_string)
}
#' Code sentiment of nugget text via NRC
#' @description
#' Code sentiment of nugget text via NRC
#' @param nugget_content
#' A data frame containing one nugget per row, with nugget_id and nugget content (raw and cleaned)
#' @return
#' A dataframe containing one row per nugget, with nugget_id and sentiment score for each
#' @export
#'
code_nugget_sentiment <- function(nugget_content) {
df <- data.frame(nugget_id = NA, published_at = NA, anger_nrc = NA, anticipation_nrc = NA, disgust_nrc = NA, fear_nrc = NA, joy_nrc = NA, sadness_nrc = NA, surprise_nrc = NA, trust_nrc = NA, negative_nrc = NA, positive_nrc = NA)
for (i in 1:nrow(nugget_content)) {
thetext <- paste(nugget_content$nugget_content_clean[i], " x") # This handles errors caused by rare blank nugget texts
text_sentiment_vector <- as.numeric(colMeans(syuzhet::get_nrc_sentiment(syuzhet::get_sentences(thetext))))
df <- rbind(df, c(nugget_id = nugget_content[i,1], nugget_id = nugget_content[i,2], as.numeric(as.character(text_sentiment_vector))))
}
df <- df[-1,]
return(df)
}
#' Get SQL for sentiment scores
#' @description
#' Get SQL for sentiment scores
#' @param nugget_text_sentiment
#' A data frame containing one nugget per row, with all NRC sentiment scores
#' @return
#' Nothing
#' @export
#'
get_sql_for_sentiment_scores <- function(nugget_text_sentiment) {
values <- NULL
for (i in 1:nrow(nugget_text_sentiment)) {
values[i] <- paste("('", nugget_text_sentiment$nugget_id[i], "', '", nugget_text_sentiment$published_at[i], "', '", nugget_text_sentiment$anger_nrc[i], "', '", nugget_text_sentiment$anticipation_nrc[i], "', '", nugget_text_sentiment$disgust_nrc[i], "', '", nugget_text_sentiment$fear_nrc[i], "', '", nugget_text_sentiment$joy_nrc[i], "', '", nugget_text_sentiment$sadness_nrc[i], "', '", nugget_text_sentiment$surprise_nrc[i], "', '", nugget_text_sentiment$trust_nrc[i], "', '", nugget_text_sentiment$negative_nrc[i], "', '", nugget_text_sentiment$positive_nrc[i], "')", sep="")
if (i == 1) { allvalues <- values[i] } else {
allvalues <- paste(allvalues, values[i], sep = ", ")
}
}
sql <- paste("INSERT INTO sentiment (nugget_id, published_at, anger_nrc, anticipation_nrc, disgust_nrc, fear_nrc, joy_nrc, sadness_nrc, surprise_nrc, trust_nrc, negative_nrc, positive_nrc) VALUES ", allvalues, " ;", sep = "")
return (sql)
}
#' Get latest date from table
#' @description
#' Get latest date from table
#' @param table
#' Name of table to query
#' @param datefield
#' Name of date field to query
#' @return
#' Date of latest row in sentiment table
#'
get_latest_date <- function(table = 'sentiment', datefield = 'published_at') {
sql <- paste( "select max(", datefield, ") from ", table, " limit 1;", sep = "")
latest_date <- as.character( sql_query(sql) )
return (latest_date)
}
#' Analyze New Nugget Sentiment
#' @description
#' Pulls content of recently published nuggets, analyzes sentiment, and saves scores to Redshift READ
#' @return
#' Does not return anything.
#' @export
#'
analyze_new_nugget_sentiment <- function() {
nugget_content <- pull_nugget_content(date1 = get_latest_date(), date2 = Sys.Date())
if (nrow(nugget_content) > 0) {
nugget_text_sentiment <- code_nugget_sentiment(nugget_content)
sql <- get_sql_for_sentiment_scores(nugget_text_sentiment)
try(sql_query(sql), silent=TRUE) # Throws an error, but we can ignore it, hence using try().
print(paste("Inserted ", nrow(nugget_content), " rows.", sep = ""))
}
else { print("No new nuggets to code") }
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.