## Code to prepare the `sentiment140_train` and `sentiment140_test` datasets
library(readr)
library(stringr)
library(stringi)
library(dplyr)
#' Filter, clean and sub-sample a Sentiment140 tweet table
#'
#' Steps, in order:
#' 1. keep only tweets whose text is marked as pure ASCII (used as a cheap
#'    "English only" filter);
#' 2. scrub the text: drop HTML entities, punctuation and stand-alone
#'    symbols, then trim and collapse white space;
#' 3. drop tweets of 20 characters or fewer and tweets with polarity 2;
#' 4. recode polarity 4 as "Positive" and everything else as "Negative";
#' 5. draw a seeded random sample of at most `data_size` rows.
#'
#' @param data A data frame with a character column `text` and a column
#'   `polarity`.
#' @param data_size Maximum number of rows to return. It is capped at the
#'   number of rows that survive filtering.
#' @return The filtered data frame with cleaned `text`, character `polarity`
#'   and at most `data_size` rows, in sampled order.
filter_and_sub_sample_data <- function(data, data_size = 50000) {
  stopifnot(
    is.numeric(data_size),
    length(data_size) == 1L,
    data_size >= 0
  )

  # Verbs are namespace-qualified so the function does not silently pick up
  # stats::filter() when dplyr is not attached.
  data <- dplyr::filter(
    data,
    stringi::stri_enc_mark(.data$text) == "ASCII"
  )

  # Every replacement is applied to ALL matches; str_replace() would only
  # touch the first match in each tweet.
  data <- dplyr::mutate(
    data,
    # HTML entities such as &amp; or &quot; (must run before punctuation is
    # stripped, otherwise "&" and ";" are gone and "amp" stays behind).
    text = stringr::str_replace_all(.data$text, "&\\w+;", ""),
    # NOTE(review): the previous pattern here was the negated class
    # "[^:|[:punct:]+]", which deleted the first NON-punctuation character
    # of each tweet. This assumes the intent was to strip punctuation and
    # "|" -- confirm against the expected dataset. ICU's [:punct:] does not
    # cover symbols like "|", hence the explicit entry.
    text = stringr::str_replace_all(.data$text, "[[:punct:]\\|]+", ""),
    # Single non-alphanumeric characters (other than "+") standing alone
    # between spaces. Look-arounds keep the spaces out of the match so that
    # consecutive stand-alone symbols are all removed.
    text = stringr::str_replace_all(
      .data$text, "(?<= )[^[:alnum:]+](?= )", ""
    ),
    # Trim both ends and collapse every internal run of white space; done
    # last because the removals above can leave new gaps.
    text = stringr::str_squish(.data$text)
  )

  data <- dplyr::filter(
    data,
    nchar(.data$text) > 20,
    .data$polarity != 2
  )
  data <- dplyr::mutate(
    data,
    polarity = ifelse(.data$polarity == 4, "Positive", "Negative")
  )

  # Fixed seed keeps the sub-sample reproducible between runs; note that this
  # resets the session's global RNG state.
  set.seed(314159)
  dplyr::sample_n(data, min(data_size, nrow(data)))
}
#' Training split: read the raw archive, filter and sub-sample it, then save
#' the result with usethis::use_data().
#'
sentiment140_train <- filter_and_sub_sample_data(
  readr::read_csv("./data-raw/sentiment140_train_pos_tagged.csv.bz2")
)
usethis::use_data(sentiment140_train, overwrite = TRUE)
#'
#' Test split: read the raw archive, filter and sub-sample it, then save the
#' result with usethis::use_data().
#'
sentiment140_test <- filter_and_sub_sample_data(
  readr::read_csv("./data-raw/sentiment140_test_pos_tagged.csv.bz2")
)
usethis::use_data(sentiment140_test, overwrite = TRUE)
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.