# R/Sentiment.R
#
# Defines functions: sentiment_dist_plot, get_batch_sentiment, api_errors, api_results, post_to_api, transform_data, transform_dataframe
#
# Documented in: get_batch_sentiment, sentiment_dist_plot

# Split a data frame into consecutive batches of at most `batch` rows; the
# Microsoft API allows 100 requests per collection, so large inputs must be
# sent in pieces.
#
# Batches are assigned by row position rather than by row name, so data frames
# whose row names are non-sequential (e.g. after filtering) or non-numeric are
# still split into evenly sized chunks and no row is dropped.
#
# dataset: data frame to split.
# batch:   maximum number of rows per batch (a single positive number).
# Returns a list of data frames named "0", "1", ... in batch order.
transform_dataframe <- function(dataset, batch) {
  stopifnot(
    is.numeric(batch),
    length(batch) == 1L,
    !is.na(batch),
    batch > 0
  )
  batch_index <- (seq_len(nrow(dataset)) - 1L) %/% batch
  split(dataset, batch_index)
}

# Serialise one batch as the JSON body of the POST request: the rows are
# wrapped under a top-level `documents` key and length-one values are written
# as scalars instead of one-element arrays.
#
# dataset: the batch to serialise.
# Returns the JSON produced by jsonlite::toJSON().
transform_data <- function(dataset) {
  request_body <- list(documents = dataset)
  jsonlite::toJSON(request_body, auto_unbox = TRUE)
}

# Send one JSON batch to the Text Analytics v2.1 sentiment endpoint of the
# given region and return the parsed response body.
#
# data_for_api: request body (JSON) for the batch.
# auth_key:     subscription key, sent in the Ocp-Apim-Subscription-Key header.
# api_region:   region used as the host prefix of the endpoint URL.
# Returns whatever httr::content() extracts from the response.
post_to_api <- function(data_for_api, auth_key, api_region) {
  endpoint <- paste0(
    "https://",
    toString(api_region),
    ".api.cognitive.microsoft.com/text/analytics/v2.1/sentiment"
  )
  request_headers <- c(
    "Content-Type" = "application/json",
    "Ocp-Apim-Subscription-Key" = toString(auth_key)
  )
  response <- httr::POST(
    endpoint,
    body = data_for_api,
    httr::add_headers(.headers = request_headers)
  )
  httr::content(response)
}

# Convert the `documents` section of a parsed API response into a data frame.
#
# api_output: parsed response (a list); each element of
#             `api_output$documents` is expected to carry an `id` and a
#             `score` entry.
# Returns a data frame with character columns 'Id' and 'Sentiment Score', one
# row per document (zero rows when the response holds no documents).
api_results <- function(api_output) {
  documents <- api_output$documents

  # Fields are looked up by name so the result does not depend on the order in
  # which they appear in the response; unnamed entries fall back to position.
  extract_field <- function(entry, field, position) {
    value <- if (field %in% names(entry)) {
      entry[[field]]
    } else if (length(entry) >= position) {
      entry[[position]]
    } else {
      NULL
    }
    if (length(value) == 0L) NA_character_ else as.character(value[[1L]])
  }

  api_returned_data <- data.frame(
    vapply(documents, extract_field, character(1),
           field = "id", position = 1L, USE.NAMES = FALSE),
    vapply(documents, extract_field, character(1),
           field = "score", position = 2L, USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
  names(api_returned_data) <- c("Id", "Sentiment Score")
  api_returned_data
}

# Convert the `errors` section of a parsed API response into a data frame.
#
# api_output: parsed response (a list); each element of `api_output$errors`
#             is expected to carry an `id` and a `message` entry.
# Returns a data frame with character columns 'Id' and 'Error Message', one
# row per reported error (zero rows when the response holds no errors).
api_errors <- function(api_output) {
  errors <- api_output$errors

  # Fields are looked up by name so the result does not depend on the order in
  # which they appear in the response; unnamed entries fall back to position.
  extract_field <- function(entry, field, position) {
    value <- if (field %in% names(entry)) {
      entry[[field]]
    } else if (length(entry) >= position) {
      entry[[position]]
    } else {
      NULL
    }
    if (length(value) == 0L) NA_character_ else as.character(value[[1L]])
  }

  api_error_data <- data.frame(
    vapply(errors, extract_field, character(1),
           field = "id", position = 1L, USE.NAMES = FALSE),
    vapply(errors, extract_field, character(1),
           field = "message", position = 2L, USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
  names(api_error_data) <- c("Id", "Error Message")
  api_error_data
}

# Score the sentiment of every row of `data` with the Text Analytics v2.1
# sentiment endpoint, sending the rows in batches and collecting the results.
#
# data:       data frame containing at least the columns 'language' and 'text'.
# auth_key:   subscription key sent with every request.
# api_region: region used to build the endpoint URL.
# batch_size: maximum number of rows sent per request (default 100).
#
# Returns `data` with an added 'Id' column and a numeric 'Sentiment Score'
# column (NA for rows that could not be scored). Returns NA when the input
# does not have the required columns or when no row could be scored.
# Side effects: progress/error text is printed to the console, and errors
# reported by the API are written to 'Error_Log.csv' in the working directory.
get_batch_sentiment <- function(data, auth_key, api_region, batch_size = 100) {
  # Reject input that lacks the required columns.
  if (!all(c("language", "text") %in% colnames(data))) {
    cat("Error: Dataframe not passed in correct format. Read documentation for correct format.")
    return(NA)
  }

  # Sequential row names keep every batch at `batch_size` rows even when
  # `data` was filtered or reordered before being passed in.
  rownames(data) <- NULL
  # Each document needs an id for the request; seq_len() also copes with a
  # zero-row input, where 1:length(...) would not.
  data$Id <- seq_len(nrow(data))

  split_data <- transform_dataframe(data, batch_size)
  # One slot per batch, filled as responses arrive and bound together once at
  # the end instead of growing a data frame inside the loop.
  scored_batches <- vector("list", length(split_data))
  error_batches <- vector("list", length(split_data))
  failure_message <- ""

  for (batch_number in seq_along(split_data)) {
    each_data <- split_data[[batch_number]]
    json_data <- transform_data(each_data)

    # The response is taken from tryCatch() itself, so a failed request yields
    # NULL instead of leaving the previous batch's response in place.
    hitting_api <- tryCatch(
      post_to_api(json_data, auth_key, api_region),
      error = function(cond) {
        cat("Error: Link is Invalid. Make sure valid region is passed to function.\nNote: API wrapper uses version 2.1 for sentiment analysis, therefore parameters passed should be valid for this version.")
        cat("\n\nHere's original Error Messge:\n")
        message(conditionMessage(cond))
        NULL
      }
    )
    if (is.null(hitting_api)) {
      next
    }
    if (!is.list(hitting_api)) {
      warning("Unexpected API response for a batch; batch skipped.",
              call. = FALSE)
      next
    }

    rows_sent <- nrow(each_data)
    rows_scored <- length(hitting_api$documents)

    # Keep whatever was scored, as long as the response is not larger than
    # the request it answers.
    if (rows_scored > 0 && rows_scored <= rows_sent) {
      scored_batches[batch_number] <- list(api_results(hitting_api))
    }

    # A count mismatch means part (or all) of the batch failed: collect the
    # per-document errors and print any general message once per distinct text.
    if (rows_scored != rows_sent) {
      if (length(hitting_api$errors) > 0) {
        error_batches[batch_number] <- list(api_errors(hitting_api))
      }
      if (length(hitting_api$message) > 0) {
        api_message <- unlist(hitting_api$message)
        if (!identical(api_message, failure_message)) {
          failure_message <- api_message
          cat("Addtional Message Returned:\n", api_message)
        }
      }
    }
  }

  error_data <- do.call(rbind, Filter(Negate(is.null), error_batches))
  if (!is.null(error_data) && nrow(error_data) > 0) {
    utils::write.csv(error_data, "Error_Log.csv")
    cat("Check Error_Log.csv file for generated errors.")
  }

  temp_data <- do.call(rbind, Filter(Negate(is.null), scored_batches))
  if (is.null(temp_data) || nrow(temp_data) == 0) {
    return(NA)
  }

  # Left join so rows without a score are kept with NA.
  data_with_sentiment <- merge(data, temp_data,
                               by.x = "Id", by.y = "Id", all.x = TRUE)
  data_with_sentiment$`Sentiment Score` <-
    as.numeric(data_with_sentiment$`Sentiment Score`)
  data_with_sentiment
}

# Plot the distribution of retrieved sentiment scores, grouped into Negative,
# Neutral and Positive sentiment.
#
# data:            data frame with a 'Sentiment Score' column.
# negative_cutoff: scores below this value are labelled "Negative".
# positive_start:  scores at or above this value are labelled "Positive";
#                  scores in between are "Neutral".
# graph_alpha:     alpha passed to the density/boxplot layer.
# graph_type:      'density' or 'boxplot'.
#
# Returns a ggplot object, or a character string starting with "Error:" when
# the input is not usable.
sentiment_dist_plot <- function(data, negative_cutoff = 0.35, positive_start = 0.65, graph_alpha = 0.5, graph_type = 'density') {
  range_error <- "Error: Either non numeric input for sentiment score cutoff or negative score cutoff is greater than positive score cutoff"

  if (!("Sentiment Score" %in% colnames(data))) {
    return("Error: 'Sentiment Score' column not provided in data.")
  }

  # Check the type first so the range comparisons below never compare strings.
  cutoffs_are_numbers <- is.numeric(negative_cutoff) &&
    is.numeric(positive_start) &&
    length(negative_cutoff) == 1L && length(positive_start) == 1L &&
    !is.na(negative_cutoff) && !is.na(positive_start)
  if (!cutoffs_are_numbers) {
    return(range_error)
  }
  if (positive_start > 1) {
    return("Error: Sentiment Cutoff values cannot be greater than 1.0")
  }
  # A negative cutoff would make the interval breaks unsorted, which
  # findInterval() rejects with an error.
  if (negative_cutoff < 0) {
    return("Error: Sentiment Cutoff values cannot be less than 0")
  }
  if (negative_cutoff > positive_start) {
    return(range_error)
  }

  known_type <- is.character(graph_type) && length(graph_type) == 1L &&
    graph_type %in% c("density", "boxplot")
  if (!known_type) {
    return("Error: Invalid Graph Type Passed")
  }

  # Going through character keeps factor columns from turning into level codes.
  score <- as.numeric(as.character(data$`Sentiment Score`))
  bucket <- findInterval(score, c(0, negative_cutoff, positive_start, 1.01))
  # Scores outside [0, 1.01) get no label instead of breaking the lookup
  # (index 0 would silently shorten the result).
  bucket[which(bucket < 1L | bucket > 3L)] <- NA
  data$Sentiment <- as.factor(c("Negative", "Neutral", "Positive")[bucket])
  data$Score <- score

  # Shared look for both plot types: centred bold title, no grid or
  # background, black axis lines.
  plot_theme <- ggplot2::theme(
    plot.title = ggplot2::element_text(hjust = 0.5, size = 12, face = "bold"),
    panel.grid.major = ggplot2::element_blank(),
    panel.grid.minor = ggplot2::element_blank(),
    panel.background = ggplot2::element_blank(),
    axis.line = ggplot2::element_line(colour = "black")
  )

  if (graph_type == "density") {
    ggplot2::ggplot(data, ggplot2::aes(x = Score, fill = Sentiment)) +
      ggplot2::geom_density(alpha = graph_alpha) +
      ggplot2::ggtitle("Sentiment Distribution Plot") +
      plot_theme
  } else {
    ggplot2::ggplot(data, ggplot2::aes(y = Score, x = Sentiment, fill = Sentiment)) +
      ggplot2::geom_boxplot(alpha = graph_alpha) +
      ggplot2::ggtitle("Sentiment Boxplot") +
      plot_theme
  }
}
# Ritayu09/Test_R_Package documentation built on June 6, 2020, 2:29 a.m.