knitr::opts_chunk$set(echo = TRUE) pacman::p_load(tidyverse) pacman::p_load(textcat) pacman::p_load(stringr) pacman::p_load(text2vec) pacman::p_load(parallel) pacman::p_load(foreach) pacman::p_load(doMC) pacman::p_load(glmnet) pacman::p_load(here) source(here('R', 'stop_words.R')) source(here('R', 'cleaning.R')) source(here('R', 'text_model.R'))
reviews <- readr::read_csv(here('data', 'reviews_with_language_textcat.csv')) %>% filter(language == 'english') %>% select(-language) %>% inner_join(listings_with_id, by = c('listing_id' = 'id')) reviews
I ran this code to generate the language labels:
reviews <- reviews %>% mutate(language = textcat(comments))
# lower-case words prep_func <- tolower # tokenize text regex_pattern <- "(?u)\\b\\w\\w+\\b" token_func <- function(doc) { str_match_all(doc, regex_pattern) } # remove numers remove_numbers <- function(words) { non_numbers <- which(!str_detect(words, "\\d+")) words[non_numbers] } raw_documents <- reviews$comments %>% # lower-case prep_func %>% # tokenize by word token_func %>% # remove numbers lapply(remove_numbers) %>% as.matrix()
tfidf_features <- function(X_train, X_test = NULL) { # iterator over training documents it_train <- itoken(X_train, progressbar = FALSE) # vocabulary is created from training only vocab = create_vocabulary(it_train, ngram = c(1L, 2L), stopwords = stop_words) # prune vocabulary vocab = prune_vocabulary(vocab, doc_proportion_max = 0.8, doc_proportion_min = 0.001) # creates a document-term matrix vectorizer = vocab_vectorizer(vocab) # instantiate tfidf model tfidf = TfIdf$new() # fit model to train data and transform train data with fitted model tfidf_train <- create_dtm(it_train, vectorizer) %>% fit_transform(tfidf) if (!is.null(X_test)) { # iterator over test documents it_test <- itoken(X_test, progressbar = FALSE) # transform the test documents tfidf_test <- create_dtm(it_test, vectorizer) %>% transform(tfidf) } else { tfidf_test <- NULL } list(train=tfidf_train, test=tfidf_test) }
normit <- function(x) { x / max(abs(x)) } # tf-idf feature importance. Calculated as the sum of tf-idf weights # per term normalized by the maximum tf-idf weight. dtm <- tfidf_features(raw_documents)$train
tfidf_norm <- max(abs(colSums(dtm))) norm_tfidf <- colSums(dtm) / tfidf_norm tibble(term = names(norm_tfidf), importance = norm_tfidf) %>% mutate(term = reorder(term, importance, abs)) %>% arrange(desc(importance)) %>% top_n(40, importance) %>% ggplot(aes(x = importance, y = term)) + geom_segment(aes(x = 0, y = term, xend = importance, yend = term), color = "grey50") + geom_point() + loyalr::theme_pub()
N_FOLDS <- 5 doMC::registerDoMC(cores = parallel::detectCores()) glmnet_est <- cv.glmnet(x = dtm, y = reviews$nb, family = 'multinomial', alpha = 1, type.measure = "deviance", nfolds = N_FOLDS, thresh = 1e-3, maxit = 1e3)
plot(glmnet_est)
text_model <- init_text_model(glmnet_est, levels(reviews$nb)) saveRDS(text_model, here('data', 'text_model.rds'))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.