#===============================================================================
# File Name : 02.keywords-extraction.R
# Date : 31st Oct 2021
# Author : David Yen-Cheih Liao
# Purpose : extract keywords feature and generate dictionary object
# Required Dataset : conll.csv, incident-group.csv
# Output Data : kyw_object.RData; redgaurds_wfm.RData; dict.RData
# redgaurds_dfm.RData
#===============================================================================
timer_task02 <- system.time({
# REQUIRED PACKAGES
#===============================================================================
# if (!require("pacman")) install.packages("pacman")
# pacman::p_load(
# tidyverse, lubridate, dplyr, purrr, tibble, # Tidyverse
# tidyr, readxl, data.table, # Data Pre-processings
# parallel, future, furrr, future.apply, # Parallel Toolkit
# doParallel, foreach, doFuture,
# quanteda, tmcn, austin, udpipe, textrank, # NLP toolkit
# emIRT # Generalized Wordfish
# )
# REQUIRED DATASET
#===============================================================================
# load("data/incident.RData")
# load("data/conll.RData")
data(incident)
data(conll)
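# Expected structure (inferred from the code below): `conll` holds one row per
# token with at least `doc_id`, `token`, and `upos` columns (CoNLL-style
# annotations, e.g. from udpipe); `incident` maps documents to historical
# incidents via `incident_index`.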
# EXTRACTING KEYWORD FEATURES
#===============================================================================
# split the dataframe by each incident and save them into list object
incident_list <- split(incident, incident$incident_index)
# re-index the incident index into consecutive numeric order
names(incident_list) <- seq_along(incident_list)
conll$keyword_doc_id <- conll$doc_id
for (i in seq_along(incident_list)){
  # %in% already returns a logical vector, so it can index directly
  conll$keyword_doc_id[conll$keyword_doc_id %in% incident_list[[i]]$incident_index] <- i
}
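# Optional sanity check (illustrative; assumes every doc_id belongs to some
# incident): after the remapping, keyword_doc_id should only take values in
# 1..length(incident_list)
# stopifnot(all(conll$keyword_doc_id %in% seq_along(incident_list)))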
# number of incidents
num <- length(unique(conll$keyword_doc_id))
# instantiate a list to store the data frame output for each incident
kyw_object <- NULL
for (i in 1:num){
  # kyw_object: extract and pair keywords for each historical incident;
  # p = 0.2 keeps the top 20% of relevant words as candidates, ngram_max = 3
  # allows key phrases of up to three tokens, and sep = "" glues tokens back
  # together without a separator (suitable for Chinese text)
  kyw_object[[i]] <- textrank::textrank_keywords(
    conll[conll$keyword_doc_id == i, ]$token,
    relevant = conll[conll$keyword_doc_id == i, ]$upos %in% c("NOUN", "ADJ", "VERB"),
    p = 0.2,
    ngram_max = 3,
    sep = "")
}
# keyw_list: exclude stopwords and single-character keywords
data(STOPWORDS, package = "tmcn")
keyw_list <- NULL
for (j in 1:length(kyw_object)){
  kept <- kyw_object[[j]]$keywords[!kyw_object[[j]]$keywords$keyword %in% STOPWORDS$word &
                                     nchar(kyw_object[[j]]$keywords$keyword) > 1, ][["keyword"]]
  keyw_list[[j]] <- as.data.frame(t(kept))
  colnames(keyw_list[[j]]) <- kept
}
# deduction: remove repetitive terms/phrases generated by TextRank
deduction <- list()
for (g in 1:length(keyw_list)) {
  deduction[[g]] <- keyw_list[[g]][!duplicated(colnames(keyw_list[[g]]))]
}
# dict_list: transpose each data frame into a named list
dict_list <- NULL
for (k in 1:length(deduction)){
  dict_list[k] <- list(as.list(deduction[[k]]))
}
# keyword: coerce every entry to character
keyword <- NULL
for (p in 1:length(dict_list)){
  keyword[[p]] <- purrr::map(dict_list[[p]], as.character)
}
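# Optional inspection (illustrative): textrank_keywords() returns a list whose
# $keywords element is a data frame with columns `keyword`, `ngram`, and
# `freq`, so the first incident's extracted phrases can be eyeballed with:
# head(kyw_object[[1]]$keywords)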
# BUILDING DICTIONARY OBJECT USING QUANTEDA
#===============================================================================
cl <- parallel::makeCluster(parallel::detectCores() - 1)
doParallel::registerDoParallel(cl)
`%dopar%` <- foreach::`%dopar%`
dict <- foreach::foreach(i = 1:length(keyword), .combine = list,
                         .multicombine = TRUE) %dopar% {
  quanteda::dictionary(keyword[[i]])
}
# stop the registered cluster to release the worker processes
parallel::stopCluster(cl)
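# Usage sketch (illustrative, not part of the pipeline): assuming `corp` is a
# quanteda corpus of the same documents, each incident's dictionary in `dict`
# can be applied with tokens_lookup() to count keyword occurrences:
# toks <- quanteda::tokens(corp)
# quanteda::dfm(quanteda::tokens_lookup(toks, dictionary = dict[[1]]))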
# SAVE OUTPUTS
#===============================================================================
# save(kyw_object, file="data/kyw_object.RData")
# save(keyw_list, file="data/keyw_list.RData")
# save(keyword, file="data/keyword.RData")
# save(dict, file="data/dict.RData")
# CLEAN UNUSED OBJECTS TO SAVE MEMORY
#===============================================================================
rm(list = setdiff(ls(), c("kyw_object","conll", "incident", "incident_list",
"dict", "keyword", "keyw_list" )))
})
#====================================END========================================
cat("\n ----------------------------------------- \n",
"Task 02 is done..", "",
"\n", names(timer_task02[1]), ": ", timer_task02[[1]],
"\n", names(timer_task02[2]), " : ", timer_task02[[2]],
"\n", names(timer_task02[3]), " :", timer_task02[[3]],
"\n", "Core used :",parallel::detectCores())