Computational text analysis for Twitter data

This package supports text analysis of Twitter data, including preprocessing, topic modelling, semantic network analysis, and sentiment analysis. Its functions cover both the analysis itself and graphical visualization of the results.

library(devtools)
install_github("BeaJJ/ComTxt")

library(ComTxt)
library(dplyr)
library(quanteda)
library(tidyr)
library(scales)
library(igraph)
library(topicmodels)
library(maps)
library(purrr)
library(mallet)
library(stopwords)
library(ggplot2)

Preprocessing

1. Geo preprocessing with a Premium account

Replace "example_df" with the name of your own data. The multi_name argument lists other names that users might write for the same country. For example, I found that users mainly wrote "Croatia", but some also wrote "republic of croatia". You can check the details in the help page of the geo_preprocess function.

## List every alternative spelling of your country in multi_name;
## check the help page below for the expected value of multi_name.
?geo_preprocess
example_df <- geo_preprocess(example_df, location_country = "spain" , ## other examples: "finland", "croatia"
                         multi_name = c("españa","spagna")) ## other examples: c("Suomi"), c("republic of croatia")

2. Geo information description

## Describe the geo information in example_df with geo_map() and geo_table();
## see ?geo_map and ?geo_table for the exact output of each.
geo_map(example_df, location_country = "Spain")
geo_table(example_df)

3. text preprocessing

Please check the help page of the function. You can choose ud_lang and the stopwords according to your language. Select your own language for preprocessing.

## List the language codes present in the data, then keep one subset per
## language of interest (here "es" and "ca").
unique(example_df$lang) 
example_lang.1 <- example_df[example_df$lang == "es", ] 
example_lang.2 <- example_df[example_df$lang == "ca", ]

?twitter_preprocess
## Preprocess: lemmatization, stopwords, URLs, emoticons
## Stopwords for the first language ("es")
stop_words <- c(stopwords::stopwords("es", source = "stopwords-iso"))
df_lang.1 <- twitter_preprocess(example_lang.1, ud_lang = "spanish", stop_words)

## Stopwords for the second language ("ca"); stop_words is overwritten here
stop_words <- c(stopwords::stopwords("ca", source = "stopwords-iso"))
df_lang.2 <- twitter_preprocess(example_lang.2, ud_lang = "catalan", stop_words)

## Stack the per-language results row-wise into df_pre, which the later
## description and topic-modelling calls take as input
df_pre <- rbind(df_lang.1, df_lang.2)

Twitter Description

1. Key words Description

## Keyword network and table for the preprocessed data.
## NOTE(review): keyword_remove presumably excludes that keyword from the
## output -- confirm in the function help.
keyword_network(df_pre, keyword_remove = "cultura", top_n = 40)
keyword_table(df_pre, keyword_remove= "cultura", top_n = 10)
## Select your own keyword as the hub to zoom in on
keyword_network_hub(df_pre,hub="libro", top_n = 40 )
shiny_keyword(df_pre)

2. Hashtags Description

## Hashtag network and table for the preprocessed data.
## NOTE(review): hashtag_remove presumably excludes that hashtag from the
## output -- confirm in the function help.
hashtags_network(df_pre, hashtag_remove = "#cultura", top_n = 40)
hashtags_table(df_pre, hashtag_remove = "#cultura", top_n = 10)
## Other example hashtags: #cine #historia #literatura #música
## Select your own key hashtag from the table to use as the hub
hashtags_network_hub(df_pre, hub = "#arte", top_n = 10)

3. User Description

Top n users based on the number of followers, friends, and tweets.

## User and mention summaries; top_n sets how many entries are requested.
user_table(df_pre, top_n = 10 )
mention_table(df_pre, top_n = 10)
mention_network(df_pre, top_n = 20)

## NOTE(review): presumably opens an interactive Shiny view of the mentions --
## confirm in ?shiny_mention.
shiny_mention(df_pre)

Mallet Topic Modeling

1. Topic number K

n_topic is the number of topics (K) that you select.

## Topic modeling------------------------------------------------------
## Run topic_number() on the preprocessed data to help choose the topic
## number K; see the help page for how to read its output.
?topic_number
topic_number(df_pre)

2. Topic modelling

### Mallet topic modeling; n_topic is the topic number K you selected
### (7 in this example)
?twitter_topic
df_topics <- twitter_topic(df_pre, n_topic = 7)

3. Topic modelling visualization

### Data frame of the words in each topic
?topic_getwords
topic_getwords(df_topics, n_words = 100)

### Draw a word cloud for each topic, e.g. 3*3 = 9 spots (depending on your number of topics)
?topic_wordcloud
topic_wordcloud(df_topics, maximum_n = 200, cloud_scale = 2.5)

### Draw the top n words in each topic
?topic_wordplot 
topic_wordplot(df_topics,  num_words = 10)

### Draw the topic radar map based on the year distribution
?topic_radarmap
topic_radarmap(df_pre, df_topics)

### NOTE: the argument order here (df_topics first, then df_pre) is the
### reverse of topic_radarmap() above.
?shiny_topic
shiny_topic(df_topics, df_pre)


BeaJJ/ComTxt documentation built on Dec. 17, 2021, 10:46 a.m.