R/keywordGrowth.R

Defines functions trim.years KeywordGrowth

Documented in KeywordGrowth

#' Yearly occurrences of top keywords/terms 
#'
#' It calculates yearly occurrences of top keywords/terms.
#'
#' @param M is a data frame obtained by the converting function \code{\link{convert2df}}.
#'        It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
#' @param Tag is a character object. It indicates one of the keyword field tags of the
#'   standard ISI WoS Field Tag codify (ID or DE) or a field tag created by \code{\link{termExtraction}} function (TI_TM, AB_TM, etc.).
#' @param sep is the field separator character. This character separates strings in each keyword column of the data frame. The default is \code{sep = ";"}.
#' @param top is a numeric. It indicates the number of top keywords to analyze. The default value is 10.
#' @param cdf is a logical. If TRUE, the function calculates the cumulative occurrences distribution. 
#' @param remove.terms is a character vector. It contains a list of additional terms to delete from the documents before term extraction. The default is \code{remove.terms = NULL}.
#' @param synonyms is a character vector. Each element contains a list of synonyms, separated by ";",  that will be merged into a single term (the first word contained in the vector element). The default is \code{synonyms = NULL}.
#' @return an object of class \code{data.frame}
#' @examples
#'
#' data(scientometrics, package = "bibliometrixData")
#' topKW=KeywordGrowth(scientometrics, Tag = "ID", sep = ";", top=5, cdf=TRUE)
#' topKW
#' 
#' # Plotting results
#' \dontrun{
#' install.packages("reshape2")
#' library(reshape2)
#' library(ggplot2)
#' DF=melt(topKW, id='Year')
#' ggplot(DF,aes(Year,value, group=variable, color=variable))+geom_line
#' }
#'
#' @export
KeywordGrowth <- function(M, Tag = "ID", sep = ";", top=10, cdf=TRUE, remove.terms=NULL, synonyms=NULL){
  i<-which(names(M)==Tag)
  PY=as.numeric(M$PY)
  Tab<-(strsplit(as.character(M[,i]),sep))
  Y=rep(PY,lengths(Tab))
  A=data.frame(Tab=unlist(Tab),Y=Y)
  A$Tab=trim.leading(A$Tab)
  A=A[A$Tab!="",]
  A=A[!is.na(A$Y),]
  
  ### remove terms
  terms <- data.frame(Tab=toupper(remove.terms))
  A <- anti_join(A,terms)
  # end of block
  
  ### Merge synonyms in the vector synonyms
  if (length(synonyms)>0 & is.character(synonyms)){
    s <- strsplit(toupper(synonyms),";")
    snew <- trimws(unlist(lapply(s,function(l) l[1])))
    sold <- (lapply(s,function(l) trimws(l[-1])))
    for (i in 1:length(s)){
      A <- A %>% 
        mutate(
          # Tab = str_replace_all(Tab, paste(sold[[i]], collapse="|",sep=""),snew[i])
          #Tab= str_replace_all(Tab, str_replace_all(str_replace_all(paste(sold[[i]], collapse="|",sep=""),"\\(","\\\\("),"\\)","\\\\)"),snew[i]),
          Tab= stringi::stri_replace_all_regex(Tab, stringi::stri_replace_all_regex(stringi::stri_replace_all_regex(paste(sold[[i]], collapse="|",sep=""),"\\(","\\\\("),"\\)","\\\\)"),snew[i])
        )
    }
  }
  # end of block
  
  Ymin=min(A$Y)
  Ymax=max(A$Y)
  Year=Ymin:Ymax
  Tab<-names(sort(table(A$Tab),decreasing=TRUE))[1:top]

  words=matrix(0,length(Year),top+1)
  words=data.frame(words)
  names(words)=c("Year",Tab)
  words[,1]=Year
  for (j in 1:length(Tab)){
    word=(table(A[A$Tab %in% Tab[j],2]))
    words[,j+1]=trim.years(word,Year,cdf)
    
  }
  return(words)
}

trim.years<-function(w,Year,cdf){

  Y=as.numeric(names(w))
  W=matrix(0,length(Year),1)
  
  for (i in 1:length(Year)){
    if (Y[1]==Year[i] & length(Y)>0){W[i,1]=w[1]
    Y=Y[-1]
    w=w[-1]}
  }
  if (isTRUE(cdf)) W=cumsum(W)
  names(W)=Year
  W=data.frame(W)
  return(W)}
massimoaria/bibliometrix documentation built on April 29, 2024, 2:15 p.m.