#' Term extraction tool from textual fields of a manuscript
#'
#' It extracts terms from a text field (abstract, title, author's keywords, etc.) of a bibliographic data frame.
#' @param M is a data frame obtained by the converting function \code{\link{convert2df}}.
#' It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
#' @param Field is a character object. It indicates the field tag of textual data:
#' \tabular{lll}{
#' \code{"TI"}\tab \tab Manuscript title\cr
#' \code{"AB"}\tab \tab Manuscript abstract\cr
#' \code{"ID"}\tab \tab Manuscript keywords plus\cr
#' \code{"DE"}\tab \tab Manuscript author's keywords}
#' The default is \code{Field = "TI"}.
#'
#' @param ngrams is an integer between 1 and 4. It indicates the length of the n-grams to extract from texts.
#' An n-gram is a contiguous sequence of n terms. The function can extract n-grams composed of 1, 2, 3 or 4 terms. Default value is \code{ngrams=1}.
#' @param stemming is logical. If TRUE the Porter Stemming algorithm is applied to all extracted terms. The default is \code{stemming = FALSE}.
#' @param language is a character. It is the language of the textual contents ("english", "german", "italian", "french", "spanish"). The default is \code{language="english"}.
#' @param remove.numbers is logical. If TRUE all numbers are deleted from the documents before term extraction. The default is \code{remove.numbers = TRUE}.
#' @param remove.terms is a character vector. It contains a list of additional terms to delete from the documents before term extraction. The default is \code{remove.terms = NULL}.
#' @param keep.terms is a character vector. It contains a list of compound words (i.e., terms formed by two or more words) to keep in their original form during the term extraction process. The default is \code{keep.terms = NULL}.
#' @param synonyms is a character vector. Each element contains a list of synonyms, separated by ";", that will be merged into a single term (the first term contained in the vector element). The default is \code{synonyms = NULL}.
#' @param verbose is logical. If TRUE the function prints the most frequent terms extracted from documents. The default is \code{verbose=TRUE}.
#' @return the bibliometric data frame with a new column containing the terms extracted from the field indicated in the argument \code{Field}.
#'
#'
#' @examples
#' # Example 1: Term extraction from titles
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # vector of compound words
#' keep.terms <- c("co-citation analysis","bibliographic coupling")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "TI", ngrams = 1,
#' remove.numbers=TRUE, remove.terms=NULL, keep.terms=keep.terms, verbose=TRUE)
#'
#' # terms extracted from the first 10 titles
#' scientometrics$TI_TM[1:10]
#'
#'
#' # Example 2: Term extraction from abstracts
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "AB", ngrams = 2,
#' stemming=TRUE,language="english",
#' remove.numbers=TRUE, remove.terms=NULL, keep.terms=NULL, verbose=TRUE)
#'
#' # terms extracted from the first abstract
#' scientometrics$AB_TM[1]
#'
#' # Example 3: Term extraction from keywords with synonyms
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # vector of synonyms
#' synonyms <- c("citation; citation analysis", "h-index; index; impact factor")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "ID", ngrams = 1,
#' synonyms=synonyms, verbose=TRUE)
#'
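#' # Example 4: Bigram extraction from titles with additional terms to remove
#' # (the terms in remove.terms are arbitrary, illustrative choices)
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # additional terms to remove before extraction
#' remove.terms <- c("paper", "approach")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "TI", ngrams = 2,
#'                 remove.terms=remove.terms, verbose=TRUE)
#'
#' # bigrams extracted from the first 10 titles
#' scientometrics$TI_TM[1:10]
#'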
#'
#' @seealso \code{\link{convert2df}} to import and convert a WoS or SCOPUS export file into a bibliographic data frame.
#' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis
#'
#' @export
termExtraction <- function(M, Field="TI", ngrams = 1, stemming=FALSE, language="english",remove.numbers=TRUE, remove.terms=NULL, keep.terms=NULL, synonyms=NULL, verbose=TRUE){
  # ngrams is forced to 1 for keyword fields (ID, DE)
if (Field %in% c("ID","DE")){ngrams <- 1}
# load stopwords
data("stopwords",envir=environment())
data("stop_words", envir=environment(), package = "tidytext")
stop_words <- stop_words %>% as.data.frame()
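  # tidytext's stop_words provides the English list; the bibliometrix
  # `stopwords` dataset provides the Italian, German, French and Spanish lists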
  switch(language,
         english = {stopwords <- stop_words$word},
         italian = {stopwords <- stopwords$it},
         german  = {stopwords <- stopwords$de},
         french  = {stopwords <- stopwords$fr},
         spanish = {stopwords <- stopwords$es}
  )
stopwords <- tolower(stopwords)
# remove all special characters (except "-" becoming "_")
TERMS <- M %>%
select(.data$SR,!!Field)
names(TERMS) <- c("SR","text")
# save original multi-words keywords
if (Field %in% c("ID","DE")){
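    # e.g. "Co-Citation Analysis; Science Mapping" becomes
    # "co-citation_analysis;science_mapping"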
listTerms <- strsplit(TERMS$text,";")
TERMS$text <- unlist(lapply(listTerms, function(l){
l <- tolower(paste(gsub(" ","_",trimES(trimws(l))), sep="", collapse=";"))
}))
} else {
TERMS <- TERMS %>%
mutate(text = tolower(gsub("[^[:alnum:][:blank:]\\-]", "", .data$text)),
text = gsub("-", "_",.data$text))
}
# remove numbers
if (remove.numbers==TRUE){
TERMS <- TERMS %>%
mutate(text = gsub("[[:digit:]]","",.data$text))
}
# keep terms in the vector keep.terms
  if (length(keep.terms) > 0 && is.character(keep.terms)){
keep.terms <- tolower(keep.terms)
if (Field %in% c("DE","ID")){
kt <- gsub(" |-","_",keep.terms)
} else {
kt <- gsub("-","_",keep.terms)
}
    for (i in seq_along(keep.terms)){
TERMS <- TERMS %>%
mutate(text = gsub(keep.terms[i],kt[i],.data$text))
}
}
  # merge synonymous terms listed in the vector synonyms
  if (length(synonyms) > 0 && is.character(synonyms)){
    s <- strsplit(tolower(synonyms),";")
    snew <- paste(" ", trimws(unlist(lapply(s, function(l) l[1]))), " ", sep="")
    sold <- lapply(s, function(l) paste(" ", trimws(l[-1]), " ", sep=""))
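    # e.g. synonyms = "citation; citation analysis" yields snew = " citation "
    # and sold = list(" citation analysis "); padding the text with spaces
    # below makes the replacement match whole terms only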
TERMS <- TERMS %>% mutate(text = paste(" ",.data$text," ", sep=""))
    for (i in seq_along(s)){
TERMS <- TERMS %>%
mutate(
text = str_replace_all(.data$text, paste(sold[[i]], collapse="|",sep=""),snew[i]))
}
}
if (is.null(remove.terms)) remove.terms <- ""
TERMS <- extractNgrams(text=TERMS, Var="text", nword=ngrams,
stopwords=stopwords, custom_stopwords=tolower(remove.terms),
stemming=stemming, language=language)
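  # drop n-grams produced from missing text (all-"NA" tokens), then collapse
  # each document's terms into a single ";"-separated string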
TERMS <- TERMS %>%
dplyr::filter(!(.data$ngram %in% paste(rep("NA",ngrams),sep="",collapse=" "))) %>%
group_by(.data$SR) %>%
summarize(text = paste(.data$ngram, collapse=";"))
if (Field %in% c("ID","DE")){
TERMS <- TERMS %>%
mutate(text = gsub("_", " ", .data$text))
}
# assign the vector to the bibliographic data frame
col_name <- paste(Field,"_TM",sep="")
M <- M[!names(M) %in% col_name]
M <- TERMS %>%
right_join(M, by = "SR")
names(M)[which(names(M) %in% "text")] <- col_name
# display results
if (verbose==TRUE){
s <- tableTag(M,col_name)
    if (length(s) > 25){print(s[1:25])} else {print(s)}
}
class(M) <- c("bibliometrixDB", "data.frame")
row.names(M) <- M$SR
return(M)
}
extractNgrams <- function(text, Var, nword, stopwords, custom_stopwords, stemming, language){
  # text is a data frame containing the corpus, e.g. text = M %>% select(.data$SR, .data$AB)
  # Var is a string indicating the column name, e.g. Var = "AB"
  # nword is an integer indicating the n-gram length to extract, e.g. nword = 2
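  # publisher names and copyright phrases, which frequently appear in
  # WoS/Scopus abstracts, are filtered out along with the standard stopwords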
custom_stopwords <- c(stopwords, custom_stopwords, "elsevier", "springer", "wiley", "mdpi", "emerald")
custom_stopngrams <- c("rights reserved", "john wiley", "john wiley sons", "science bv", "mdpi basel",
"mdpi licensee", "emerald publishing", "taylor francis", "paper proposes",
"we proposes", "paper aims", "articles published", "study aims")
ngram <- NULL
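  # tokenize the text into n-grams, split each n-gram into its component
  # words, and drop any n-gram containing a stopword in any position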
ngrams <- text %>%
drop_na(any_of(Var)) %>%
unnest_tokens(ngram, !!Var, token = "ngrams", n = nword) %>%
separate(.data$ngram, paste("word",1:nword,sep=""), sep = " ") %>%
dplyr::filter(if_all(starts_with("word"), ~ !.x %in% custom_stopwords))
if (isTRUE(stemming)){
ngrams <- ngrams %>%
mutate(across(paste("word",1:nword,sep=""), ~SnowballC::wordStem(.x,language=language)))
}
ngrams <- ngrams %>%
unite(ngram, paste("word",1:nword,sep=""), sep = " ") %>%
dplyr::filter(!.data$ngram %in% custom_stopngrams) %>%
mutate(ngram = toupper(.data$ngram))
return(ngrams)
}
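
# Internal usage sketch for extractNgrams (illustrative only, not run; the
# data frame and its values are hypothetical):
# df <- data.frame(SR = "DOC1", text = "a science mapping approach")
# extractNgrams(df, Var = "text", nword = 2, stopwords = "a",
#               custom_stopwords = "", stemming = FALSE, language = "english")
# # -> one row per document/bigram in column `ngram`, e.g. "SCIENCE MAPPING"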