#' Term extraction tool from textual fields of a manuscript
#'
#' It extracts terms from a text field (abstract, title, author's keywords, etc.) of a bibliographic data frame.
#' @param M is a data frame obtained by the converting function \code{\link{convert2df}}.
#' It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
#' @param Field is a character object. It indicates the field tag of textual data:
#' \tabular{lll}{
#' \code{"TI"}\tab \tab Manuscript title\cr
#' \code{"AB"}\tab \tab Manuscript abstract\cr
#' \code{"ID"}\tab \tab Manuscript keywords plus\cr
#' \code{"DE"}\tab \tab Manuscript author's keywords}
#' The default is \code{Field = "TI"}.
#'
#' @param ngrams is an integer between 1 and 4. It indicates the length of the n-grams to extract from texts.
#' An n-gram is a contiguous sequence of n terms. The function can extract n-grams composed of 1, 2, 3 or 4 terms. Default value is \code{ngrams=1}.
#' @param stemming is logical. If TRUE the Porter Stemming algorithm is applied to all extracted terms. The default is \code{stemming = FALSE}.
#' @param language is a character. It is the language of the textual contents ("english", "german", "italian", "french", "spanish"). The default is \code{language="english"}.
#' @param remove.numbers is logical. If TRUE all numbers are deleted from the documents before term extraction. The default is \code{remove.numbers = TRUE}.
#' @param remove.terms is a character vector. It contains a list of additional terms to delete from the documents before term extraction. The default is \code{remove.terms = NULL}.
#' @param keep.terms is a character vector. It contains a list of compound words (i.e., terms formed by two or more words) to keep in their original form during the term extraction process. The default is \code{keep.terms = NULL}.
#' @param synonyms is a character vector. Each element contains a list of synonyms, separated by ";", that will be merged into a single term (the first term contained in the vector element). The default is \code{synonyms = NULL}.
#' @param verbose is logical. If TRUE the function prints the most frequent terms extracted from documents. The default is \code{verbose=TRUE}.
#' @return the bibliometric data frame with a new column containing the terms extracted from the field indicated in the argument \code{Field}.
#'
#'
#' @examples
#' # Example 1: Term extraction from titles
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # vector of compound words
#' keep.terms <- c("co-citation analysis","bibliographic coupling")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "TI", ngrams = 1,
#' remove.numbers=TRUE, remove.terms=NULL, keep.terms=keep.terms, verbose=TRUE)
#'
#' # terms extracted from the first 10 titles
#' scientometrics$TI_TM[1:10]
#'
#'
#' # Example 2: Term extraction from abstracts
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "AB", ngrams = 2,
#' stemming=TRUE,language="english",
#' remove.numbers=TRUE, remove.terms=NULL, keep.terms=NULL, verbose=TRUE)
#'
#' # terms extracted from the first abstract
#' scientometrics$AB_TM[1]
#'
#' # Example 3: Term extraction from keywords with synonyms
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # vector of synonyms
#' synonyms <- c("citation; citation analysis", "h-index; index; impact factor")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "ID", ngrams = 1,
#' synonyms=synonyms, verbose=TRUE)
#'
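#' # Example 4: Bigram extraction from titles with additional terms to remove
#' # (the terms in remove.terms are arbitrary, illustrative choices)
#'
#' data(scientometrics, package = "bibliometrixData")
#'
#' # additional terms to remove before extraction
#' remove.terms <- c("paper", "approach")
#'
#' # term extraction
#' scientometrics <- termExtraction(scientometrics, Field = "TI", ngrams = 2,
#'                 remove.terms=remove.terms, verbose=TRUE)
#'
#' # bigrams extracted from the first 10 titles
#' scientometrics$TI_TM[1:10]
#'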
#'
#' @seealso \code{\link{convert2df}} to import and convert a WoS or SCOPUS export file into a bibliographic data frame.
#' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis
#'
#' @export
termExtraction <- function(M, Field="TI", ngrams = 1, stemming=FALSE, language="english",remove.numbers=TRUE, remove.terms=NULL, keep.terms=NULL, synonyms=NULL, verbose=TRUE){
  # ngrams is forced to 1 for keyword fields (ID, DE)
if (Field %in% c("ID","DE")){ngrams <- 1}
# load stopwords
data("stopwords",envir=environment())
data("stop_words", envir=environment(), package = "tidytext")
stop_words <- stop_words %>% as.data.frame()
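  # tidytext's stop_words provides the English list; the bibliometrix
  # `stopwords` dataset provides the Italian, German, French and Spanish lists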
  switch(language,
         english = {stopwords <- stop_words$word},
         italian = {stopwords <- stopwords$it},
         german  = {stopwords <- stopwords$de},
         french  = {stopwords <- stopwords$fr},
         spanish = {stopwords <- stopwords$es}
  )
stopwords <- tolower(stopwords)
# remove all special characters (except "-" becoming "_")
TERMS <- M %>%
select(.data$SR,!!Field)
names(TERMS) <- c("SR","text")
# save original multi-words keywords
if (Field %in% c("ID","DE")){
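    # e.g. "Co-Citation Analysis; Science Mapping" becomes
    # "co-citation_analysis;science_mapping"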
listTerms <- strsplit(TERMS$text,";")
TERMS$text <- unlist(lapply(listTerms, function(l){
l <- tolower(paste(gsub(" ","_",trimES(trimws(l))), sep="", collapse=";"))
}))
} else {
TERMS <- TERMS %>%
mutate(text = tolower(gsub("[^[:alnum:][:blank:]\\-]", "", .data$text)),
text = gsub("-", "_",.data$text))
}
# remove numbers
if (remove.numbers==TRUE){
TERMS <- TERMS %>%
mutate(text = gsub("[[:digit:]]","",.data$text))
}
# keep terms in the vector keep.terms
  if (length(keep.terms) > 0 && is.character(keep.terms)){
keep.terms <- tolower(keep.terms)
if (Field %in% c("DE","ID")){
kt <- gsub(" |-","_",keep.terms)
} else {
kt <- gsub("-","_",keep.terms)
}
    for (i in seq_along(keep.terms)){
TERMS <- TERMS %>%
mutate(text = gsub(keep.terms[i],kt[i],.data$text))
}
}
  # merge synonymous terms listed in the vector synonyms
  if (length(synonyms) > 0 && is.character(synonyms)){
    s <- strsplit(tolower(synonyms),";")
    snew <- paste(" ", trimws(unlist(lapply(s, function(l) l[1]))), " ", sep="")
    sold <- lapply(s, function(l) paste(" ", trimws(l[-1]), " ", sep=""))
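    # e.g. synonyms = "citation; citation analysis" yields snew = " citation "
    # and sold = list(" citation analysis "); padding the text with spaces
    # below makes the replacement match whole terms only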
TERMS <- TERMS %>% mutate(text = paste(" ",.data$text," ", sep=""))
    for (i in seq_along(s)){
TERMS <- TERMS %>%
mutate(
text = str_replace_all(.data$text, paste(sold[[i]], collapse="|",sep=""),snew[i]))
}
}
if (is.null(remove.terms)) remove.terms <- ""
TERMS <- extractNgrams(text=TERMS, Var="text", nword=ngrams,
stopwords=stopwords, custom_stopwords=tolower(remove.terms),
stemming=stemming, language=language)
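  # drop n-grams produced from missing text (all-"NA" tokens), then collapse
  # each document's terms into a single ";"-separated string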
TERMS <- TERMS %>%
dplyr::filter(!(.data$ngram %in% paste(rep("NA",ngrams),sep="",collapse=" "))) %>%
group_by(.data$SR) %>%
summarize(text = paste(.data$ngram, collapse=";"))
if (Field %in% c("ID","DE")){
TERMS <- TERMS %>%
mutate(text = gsub("_", " ", .data$text))
}
# assign the vector to the bibliographic data frame
col_name <- paste(Field,"_TM",sep="")
M <- M[!names(M) %in% col_name]
M <- TERMS %>%
right_join(M, by = "SR")
names(M)[which(names(M) %in% "text")] <- col_name
# display results
if (verbose==TRUE){
s <- tableTag(M,col_name)
    if (length(s) > 25){print(s[1:25])} else {print(s)}
}
class(M) <- c("bibliometrixDB", "data.frame")
row.names(M) <- M$SR
return(M)
}
extractNgrams <- function(text, Var, nword, stopwords, custom_stopwords, stemming, language){
  # text is a data frame containing the corpus, e.g. text = M %>% select(.data$SR, .data$AB)
  # Var is a string indicating the column name, e.g. Var = "AB"
  # nword is an integer indicating the n-gram length to extract, e.g. nword = 2
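  # publisher names and copyright phrases, which frequently appear in
  # WoS/Scopus abstracts, are filtered out along with the standard stopwords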
custom_stopwords <- c(stopwords, custom_stopwords, "elsevier", "springer", "wiley", "mdpi", "emerald")
custom_stopngrams <- c("rights reserved", "john wiley", "john wiley sons", "science bv", "mdpi basel",
"mdpi licensee", "emerald publishing", "taylor francis", "paper proposes",
"we proposes", "paper aims", "articles published", "study aims")
ngram <- NULL
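  # tokenize the text into n-grams, split each n-gram into its component
  # words, and drop any n-gram containing a stopword in any position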
ngrams <- text %>%
drop_na(any_of(Var)) %>%
unnest_tokens(ngram, !!Var, token = "ngrams", n = nword) %>%
separate(.data$ngram, paste("word",1:nword,sep=""), sep = " ") %>%
dplyr::filter(if_all(starts_with("word"), ~ !.x %in% custom_stopwords))
if (isTRUE(stemming)){
ngrams <- ngrams %>%
mutate(across(paste("word",1:nword,sep=""), ~SnowballC::wordStem(.x,language=language)))
}
ngrams <- ngrams %>%
unite(ngram, paste("word",1:nword,sep=""), sep = " ") %>%
dplyr::filter(!.data$ngram %in% custom_stopngrams) %>%
mutate(ngram = toupper(.data$ngram))
return(ngrams)
}
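
# Internal usage sketch for extractNgrams (illustrative only, not run; the
# data frame and its values are hypothetical):
# df <- data.frame(SR = "DOC1", text = "a science mapping approach")
# extractNgrams(df, Var = "text", nword = 2, stopwords = "a",
#               custom_stopwords = "", stemming = FALSE, language = "english")
# # -> one row per document/bigram in column `ngram`, e.g. "SCIENCE MAPPING"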