# knitr chunk setup: show the R code itself in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Package dependencies (text mining, tokenization, and plotting).
library(tm)        # removeWords() and corpus utilities
library(NLP)
library(jiebaR)    # worker()/segment() tokenizer
library(plyr)
library(readxl)    # read_excel() for the sentiment input
library(HMM)
#library(xlsx)
library(Rwordseg)
library(dplyr)
library(tidytext)  # unnest_tokens(), stop_words, get_sentiments()
library(ggplot2)
library(stringr)
library(igraph)
library(ggraph)    # ggraph()/geom_edge_link() network plot
library(tidyr)     # separate()
# ---- Word frequency ----
# Load the merged tweet file, tokenize, clean, and plot the most frequent
# words. To run this on another file, change the path in read.csv() and the
# column extracted below.
file_1 <- read.csv(
  "~/Downloads/Rpkg/nlph/data/merge.csv",
  header = TRUE, sep = ",", quote = "\"",
  encoding = "UTF-8",
  # was stringsAsFactors = TRUE followed by as.character(); read text directly
  stringsAsFactors = FALSE
)
txtFile <- file_1$text

# Tokenize with a default jiebaR worker.
engin1 <- worker()
segWords <- segment(txtFile, engin1)

# Remove English stop words.
mystopwords <- readLines("~/Downloads/Rpkg/nlph/data/en-stopword.txt")
segWords <- removeWords(segWords, mystopwords)

# Strip digits and punctuation; cleaning is complete after this step.
segWords <- gsub('\\d|\\.|,|\\!|:|;|\\?', "", segWords)

# Word frequency table, sorted in decreasing order.
sre <- sort(table(segWords), decreasing = TRUE)
# Write the combined frequency table to wordFreq.csv if needed:
#write.csv(sre,file = "~/Downloads/Rpkg/nlph/data/wordFreq2.csv",row.names=F)

sre.df <- data.frame(sre)
head(sre.df[sre.df$Freq > 500, ])

sredf.df <- data.frame(
  stringsAsFactors = FALSE,
  word = sre.df$segWords,
  freq = sre.df$Freq
)
# Row 1 is the blank token left behind by the gsub() cleaning ("1" blank);
# keep the next 29 most frequent words for the plot.
sredf.df <- sredf.df[2:30, ]

ggplot(sredf.df, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  labs(title = "Twitter ", x = NULL, y = "Frequency") +
  coord_flip()
# ---- Bigram network ----
# Note: to build the network for another file, change the path in read.csv()
# and the text column selected below.
file_1 <- read.csv(
  "~/Downloads/Rpkg/nlph/data/merge.csv",
  header = TRUE, sep = ",", quote = "\"",
  encoding = "UTF-8",
  stringsAsFactors = FALSE
)

# Select the text column by NAME rather than by magic position
# (was file_1[7], which silently breaks if the column order changes).
fi_1 <- file_1["text"]

austen_bigrams <- fi_1 %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  # Drop stop words and URL fragments. str_detect(.., "http") already
  # matches "https" too, so the separate "https" filters were redundant.
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word,
    !str_detect(word1, 'http'),
    !str_detect(word1, 'www.ndtv.com'),
    !str_detect(word2, 'http')
  ) %>%
  count(word1, word2, sort = TRUE)

# Persist the combined bigram counts to n-gram.csv.
write.csv(austen_bigrams,
          file = "~/Downloads/Rpkg/nlph/data/n-gram-new.csv",
          row.names = F)

# Plot. (The original re-read the CSV it had just written; the counts are
# already in memory, so the round-trip was redundant.)
set.seed(2016)
austen_bigrams_p <- austen_bigrams[austen_bigrams$n > 100, ]
head(austen_bigrams)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(austen_bigrams_p, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 0.5, hjust = 0.5) +
  theme_void()
# ---- Sentiment score per user ----
# Input: emo_merge.xlsx, which (per the original note) was reduced by hand to
# just the USERNAME and TEXT columns of the source data.
xle <- read_excel("~/Downloads/Rpkg/nlph/data/emo_merge.xlsx")

# Build a tidy one-token-per-row frame from the TEXT column.
emo <- xle %>%
  unnest_tokens(word, TEXT)

afin <- emo %>%
  # Join each token with its AFINN sentiment value (tokens without a score
  # are dropped by the inner join).
  inner_join(get_sentiments("afinn")) %>%
  # Group by user.
  group_by(USERNAME) %>%
  # Per-user score: sum of AFINN values / number of scored tokens * 100.
  # BUG FIX: the original divided by nrow(emo) — the token count of the
  # WHOLE corpus — which contradicts the stated intent ("divide by the
  # number of words that have a sentiment score") and dilutes every
  # user's score by the same corpus-wide constant.
  summarise(sentiment = sum(value) / n() * 100)
#write.csv(afin,file = '~/Downloads/Rpkg/nlph/data/percent-emo2.csv',row.names=F)

afin.df <- data.frame(
  stringsAsFactors = FALSE,
  word = afin$USERNAME,
  freq = afin$sentiment
)

ggplot(afin.df, aes(x = reorder(word, freq), y = freq)) +
  geom_col() +
  labs(title = "Twitter ", x = NULL, y = "Sentiment") +
  coord_flip()
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.