# replication-code/02.keywords-extraction.R

#===============================================================================
# File Names       : 02.keywords-extraction.R 
# Date             : 31st Oct 2021
# Authors          : David Yen-Cheih Liao
# Purpose          : extract keywords feature and generate dictionary object
# Required Dataset : conll.csv, incident-group.csv     
# Output Data      : kyw_object.RData; redgaurds_wfm.RData; dict.RData
#                    redgaurds_dfm.RData
#===============================================================================

timer_task02 <- system.time({
# REQUIRED PACKAGES
#===============================================================================
# if (!require("pacman")) install.packages("pacman")
# pacman::p_load(
#   tidyverse, lubridate, dplyr, purrr, tibble,           # Tidyverse
#   tidyr, readxl, data.table,                            # Data Pre-processings
#   parallel, future, furrr, future.apply,                # Parallel Toolkit
#   doParallel, foreach, doFuture,
#   quanteda, tmcn, austin, udpipe, textrank,tmcn,        # NLP toolkit
#   emIRT                                                 # Generalized Wordfish
# )

# REQUIRED DATASET 
#===============================================================================
# load("data/incident.RData")
# load("data/conll.RData")
# Load the packaged `incident` and CoNLL-annotated `conll` datasets
# into the global workspace (shipped with the replication package).
data("incident")
data("conll")
  
# EXTRACTING KEYWORD FEATURES
#===============================================================================

# split the dataframe by each incident and save them into list object
# Split the incident data frame into a list with one element per incident
incident_list <- split(incident, incident$incident_index)

# Re-index the incidents as consecutive integers (1..n) in split order
names(incident_list) <- seq_along(incident_list)

# keyword_doc_id starts as a copy of doc_id and is then recoded so that
# every document belonging to the i-th incident carries the value i
conll$keyword_doc_id <- conll$doc_id

for (i in seq_along(incident_list)) {
  # `%in%` already yields the logical mask; the original wrapped it in a
  # redundant ifelse(cond, TRUE, FALSE).
  # NOTE(review): values are recoded in place, so if any original
  # incident_index coincides with an already-assigned integer in 1..n a row
  # could be re-matched in a later iteration -- confirm the incident_index
  # values cannot collide with the sequence 1..n.
  conll$keyword_doc_id[conll$keyword_doc_id %in% incident_list[[i]]$incident_index] <- i
}

# Number of distinct incidents after recoding
num <- length(unique(conll$keyword_doc_id))

# instantiate a list to store the data frame from the output of each incidents
# Load the Chinese stopword list once (the original re-ran data() on every
# iteration of the outer loop).
data(STOPWORDS, package = "tmcn")

# kyw_object: TextRank keyword extraction for each historical incident.
# Preallocated; the original grew a NULL object inside the loop and, worse,
# nested the downstream j/g/k/p loops inside this loop so all earlier
# incidents were reprocessed on every iteration (accidental O(n^2)).
# Flattening the loops leaves the final objects identical, because the last
# outer iteration of the original recomputed every element anyway.
kyw_object <- vector("list", num)
for (i in seq_len(num)) {
  inc <- conll[conll$keyword_doc_id == i, ]
  kyw_object[[i]] <- textrank::textrank_keywords(
    inc$token,
    relevant  = inc$upos %in% c("NOUN", "ADJ", "VERB"),
    p         = 0.2,
    ngram_max = 3,
    sep       = ""
  )
}

# keyw_list: one-row data frame of keywords per incident, excluding
# stopwords and single-character terms; column names carry the keywords.
keyw_list <- vector("list", num)
for (j in seq_len(num)) {
  kw    <- kyw_object[[j]]$keywords
  keep  <- !kw$keyword %in% STOPWORDS$word & nchar(kw$keyword) > 1
  terms <- kw[keep, ][["keyword"]]
  keyw_list[[j]] <- as.data.frame(t(terms))
  colnames(keyw_list[[j]]) <- terms
}

# deduction: drop repetitive terms/phrases generated by TextRank
deduction <- vector("list", num)
for (g in seq_len(num)) {
  deduction[[g]] <- keyw_list[[g]][!duplicated(colnames(keyw_list[[g]]))]
}

# dict_list: transpose each deduplicated data frame into a plain list
dict_list <- vector("list", num)
for (k in seq_len(num)) {
  dict_list[k] <- list(as.list(deduction[[k]]))
}

# keyword: coerce every entry to character (keywords may arrive as factors
# or other atomic types from the data-frame round trip)
keyword <- vector("list", num)
for (p in seq_len(num)) {
  keyword[[p]] <- purrr::map(dict_list[[p]], as.character)
}


# BUILDING  DICTIONARY OBJECT USING QUANTEDA
#===============================================================================

# Build one quanteda dictionary per incident in parallel.
# Keep a handle on the cluster so the SAME cluster is stopped afterwards:
# the original called stopCluster(makeCluster(...)), which created a second,
# never-registered cluster and stopped that one, leaking the workers that
# actually ran the foreach loop.
cl <- parallel::makeCluster(parallel::detectCores() - 1)
doParallel::registerDoParallel(cl)
dict <- foreach::foreach(i = seq_along(keyword), .combine = list,
                         .multicombine = TRUE) %dopar% {
  quanteda::dictionary(keyword[[i]])
}
parallel::stopCluster(cl)

# SAVE OUTPUTS
#===============================================================================
# save(kyw_object, file="data/kyw_object.RData")
# save(keyw_list, file="data/keyw_list.RData")
# save(keyword, file="data/keyword.RData")
# save(dict, file="data/dict.RData")

# CLEAN UNUSED OBJECTS TO SAVE MEMORIES
#===============================================================================
# Remove every object except the artefacts needed downstream. The temporary
# `keep` vector is itself swept away by the same rm() call, since it is not
# in the keep list.
keep <- c("kyw_object", "conll", "incident", "incident_list",
          "dict", "keyword", "keyw_list")
rm(list = setdiff(ls(), keep))

})

#====================================END========================================

# Report the timings captured by system.time() plus the core count.
# cat() joins its arguments with a single space, and the varying spacer
# strings (": ", " : ", "  :") deliberately compensate for the different
# widths of the timing names ("user.self", "sys.self", "elapsed") so the
# colons line up in the console output -- do not "normalize" them.
cat("\n ----------------------------------------- \n",
    "Task 02 is done..", "",  
    "\n", names(timer_task02[1]), ": ", timer_task02[[1]], 
    "\n", names(timer_task02[2]), " : ", timer_task02[[2]], 
    "\n", names(timer_task02[3]), "  :", timer_task02[[3]], 
    "\n", "Core used :", parallel::detectCores())
# davidycliao/redguards documentation built on Feb. 28, 2023, 11:30 p.m.