R/rpys.R

Defines functions refCleaning reduceRefs yearExtract rpys

Documented in rpys

# Register the bare column names that the functions in this file use inside
# dplyr verbs and ggplot2::aes() (e.g. citedYears, citations, Freq, Year), so
# that R CMD check does not flag them as undefined global variables.
utils::globalVariables(c("Year", "diffMedian", "Citations", "citedYears",
                         "Reference", "citingYears", "benchmark", "status", "citations",
                         "Freq", "diffMedian5"))
#' Reference Publication Year Spectroscopy
#'
#' \code{rpys} computes a Reference Publication Year Spectroscopy for detecting
#' the Historical Roots of Research Fields.
#' The method was introduced by Marx et al., 2014.\cr\cr
#'
#' Reference:\cr
#' Marx, W., Bornmann, L., Barth, A., & Leydesdorff, L. (2014).
#' Detecting the historical roots of research fields by reference publication
#' year spectroscopy (RPYS). Journal of the Association for Information Science and Technology,
#' 65(4), 751-764.\cr\cr
#'
#' @param M is a data frame obtained by the converting function
#'   \code{\link{convert2df}}. It is a data matrix with cases corresponding to
#'   articles and variables to Field Tag in the original ISI or SCOPUS file.
#'   The columns \code{CR}, \code{PY} and \code{DB} are required.
#' @param sep is the cited-references separator character. This character separates cited-references in the CR
#' column of the data frame. The default is \code{sep = ";"}.
#' @param timespan is a numeric vector c(min year,max year). The default value is NULL (the entire timespan is considered).
#' @param graph is a logical. If TRUE the function plots the spectroscopy, otherwise the plot is created but not drawn.
#' @return a list with four elements:
#'   \code{spectroscopy} (the ggplot2 plot),
#'   \code{rpysTable} (number of cited references per cited year and its
#'   deviation from the 5-year median),
#'   \code{CR} (the cited references for each cited year with their frequency) and
#'   \code{df} (the reference list with citations recorded year by year).
#'
#' @examples
#'
#'
#' data(scientometrics, package = "bibliometrixData")
#' res <- rpys(scientometrics, sep=";", graph = TRUE)
#'
#' @seealso \code{\link{convert2df}} to import and convert an ISI or SCOPUS
#'   Export file in a data frame.
#' @seealso \code{\link{biblioAnalysis}} to perform a bibliometric analysis.
#' @seealso \code{\link{biblioNetwork}} to compute a bibliographic network.
#' @export
rpys <- function(M, sep = ";", timespan = NULL, graph = TRUE) {

  missingCols <- setdiff(c("CR", "PY", "DB"), names(M))
  if (length(missingCols) > 0) {
    stop("M is missing required column(s): ",
         paste(missingCols, collapse = ", "), call. = FALSE)
  }

  # Silence summarise() grouping messages only while this function runs;
  # the caller's option value is restored on exit.
  oldOptions <- options(dplyr.summarise.inform = FALSE)
  on.exit(options(oldOptions), add = TRUE)

  # A "DOI;" inside a reference would otherwise be split as a separator
  M$CR <- gsub("DOI;", "DOI ", as.character(M$CR))

  # One element per cited reference, keeping track of the citing year.
  # M$CR (not M[, "CR"]) so that tibbles work as well as data frames.
  Fi <- strsplit(M$CR, sep)
  Fi <- lapply(Fi, trim.leading)
  Fi <- lapply(Fi, function(l) l[nchar(l) > 10])
  citingYears <- rep(M$PY, lengths(Fi))
  Fi <- unlist(Fi)

  df <- data.frame(Reference = Fi, citingYears = citingYears) %>%
    mutate(Reference = refCleaning(Reference, db = M$DB[1]))
  df$citedYears <- as.numeric(yearExtract(df$Reference, db = M$DB[1]))

  currentYear <- as.numeric(substr(Sys.Date(), 1, 4))

  # Citations of each reference per (cited year, citing year), compared with
  # the mean number of citations within the same pair of years
  df <- df %>%
    dplyr::filter(!is.na(Reference) & citedYears > 1700 & citedYears <= currentYear) %>%
    group_by(citedYears, citingYears, Reference) %>%
    summarize(citations = n()) %>%
    group_by(citedYears, citingYears) %>%
    mutate(benchmark = mean(citations, na.rm = TRUE),
           status = sign(citations - benchmark)) %>%
    ungroup() %>%
    arrange(citedYears, Reference, citingYears)

  # Total citations of each reference
  CR <- df %>%
    group_by(citedYears, Reference) %>%
    select(-citingYears, -status) %>%
    summarize(Freq = sum(citations))

  # Total cited references per cited year
  RPYS <- CR %>%
    select(-Reference) %>%
    group_by(citedYears) %>%
    summarize(n = sum(Freq, na.rm = TRUE))

  if (nrow(RPYS) == 0) {
    stop("No cited references with a valid publication year were found.",
         call. = FALSE)
  }

  # Fill the gaps so that the series has one row per calendar year.
  # Guarded: with no gaps there is nothing to append.
  yearSeq <- RPYS$citedYears
  missingYears <- setdiff(seq(min(yearSeq), max(yearSeq)), yearSeq)
  if (length(missingYears) > 0) {
    RPYS <- dplyr::bind_rows(
      RPYS,
      data.frame(citedYears = missingYears, n = 0)
    )
  }
  RPYS <- RPYS %>% arrange(citedYears)

  # Trailing 5-year median: the current year and the four preceding ones,
  # with zeros padded before the first year of the series
  padded <- c(rep(0, 4), RPYS$n)
  Median <- vapply(
    seq_len(nrow(RPYS)),
    function(i) median(padded[i:(i + 4)]),
    numeric(1)
  )
  RPYS$diffMedian <- RPYS$n - Median

  if (length(timespan) == 2) {
    RPYS <- RPYS %>%
      dplyr::filter(citedYears >= min(timespan) &
                      citedYears <= max(timespan))
    if (nrow(RPYS) == 0) {
      stop("No cited references fall within the requested timespan.",
           call. = FALSE)
    }
  }
  names(RPYS) <- c("Year", "Citations", "diffMedian5")

  # Only positive deviations from the median are plotted
  RPYS <- RPYS %>%
    mutate(diffMedian = pmax(diffMedian5, 0))

  data("logo", envir = environment())
  logo <- grid::rasterGrob(logo, interpolate = TRUE)

  # Logo box: bottom-left corner, 12.5% of each axis range
  plotted <- c(RPYS$Citations, RPYS$diffMedian)
  x <- c(min(RPYS$Year), min(RPYS$Year) + diff(range(RPYS$Year)) * 0.125) + 1
  y <- c(min(plotted), min(plotted) + diff(range(plotted)) * 0.125) * 1.05

  # About 30 axis breaks; the step must be at least 1 or seq() fails on
  # series shorter than 15 years
  breakStep <- max(1, round(length(RPYS$Year) / 30))

  g <- ggplot(RPYS, aes(x = Year, y = Citations, text = paste("Year: ", Year, "\nN. of References: ", Citations))) +
    geom_line(aes(group = "NA")) +
    geom_line(aes(x = Year, y = diffMedian, color = "firebrick", group = "NA")) +
    labs(x = 'Year'
         , y = 'Cited References'
         , title = "Reference Publication Year Spectroscopy",
         caption = "Number of Cited References (black line) - Deviation from the 5-Year Median (red line)") +
    scale_x_continuous(breaks = (RPYS$Year[seq(1, length(RPYS$Year), by = breakStep)])) +
    theme(text = element_text(color = "#444444"), legend.position = "none"
          , plot.caption = element_text(size = 9, hjust = 0.5,
                                        color = "black", face = "bold")
          , panel.background = element_rect(fill = '#FFFFFF')
          , panel.grid.major = element_line(color = '#EFEFEF')
          , plot.title = element_text(size = 24)
          , axis.title = element_text(size = 14, color = '#555555')
          , axis.title.y = element_text(vjust = 1, angle = 90)
          , axis.title.x = element_text(hjust = 0.95, angle = 0)
          , axis.text.x = element_text(size = 8, angle = 90)
          , axis.line.x = element_line(color = "black", size = 0.5)
          , axis.line.y = element_line(color = "black", size = 0.5)
    ) + annotation_custom(logo, xmin = x[1], xmax = x[2], ymin = y[1], ymax = y[2])

  if (isTRUE(graph)) {
    plot(g)
  }

  # Shorten the references (drop volume / DOI tails) for the output table
  CR$Reference <- reduceRefs(CR$Reference)
  CR <- CR %>%
    rename(Year = citedYears) %>%
    ungroup()

  result <- list(spectroscopy = g,
                 rpysTable = RPYS,
                 CR = CR %>% mutate(Year = as.character(Year)),
                 df = df)
  return(result)
}

# Extract the publication year from each cleaned reference string.
#
# string: character vector of references.
# db:     source database tag; "ISI" references carry the year as a
#         space-delimited 4-digit token, every other database as "(dddd)".
#
# Returns a character vector as long as `string` holding the 4-digit year of
# the first match, or the placeholder "0000" when no year is found (or the
# reference is NA).
yearExtract <- function(string, db) {
  if (db == "ISI") {
    pattern <- " [[:digit:]]{4} "
  } else {
    pattern <- "\\([[:digit:]]{4}\\)"
  }

  pos <- regexpr(pattern, string)
  found <- !is.na(pos) & pos > 0L

  # Start from the placeholder and overwrite only the entries with a match;
  # regmatches() returns one value per matched element, in order.
  y <- rep("0000", length(string))
  # Both patterns are one delimiter + 4 digits + one delimiter, so the year
  # always sits at characters 2-5 of the match.
  y[found] <- substr(regmatches(string, pos), 2, 5)
  return(y)
}

# Shorten reference strings: each element of A is cut just before the first
# occurrence of a volume tag ("V" followed by a digit) and then just before
# the first "DOI " tag. Elements without a tag are returned unchanged.
reduceRefs <- function(A) {
  for (tagPattern in c("*V[0-9]", "*DOI ")) {
    tagStart <- unlist(regexec(tagPattern, A))
    hasTag <- tagStart > -1
    A[hasTag] <- substr(A[hasTag], 1, (tagStart[hasTag] - 1))
  }
  A
}

# Normalise cited-reference strings before year extraction and grouping.
#
# l:  character vector of references.
# db: source database tag; for "ISI" everything after the first ")" is
#     dropped first.
#
# Commas, semicolons and dots are replaced by spaces, the result is passed
# through trimES() and trimws(), and empty strings are removed.
refCleaning <- function(l, db) {
  if (db == "ISI") {
    l <- gsub("\\).*", ")", l)
  }

  # Same punctuation clean-up for every database
  for (punct in c(",", ";", "\\.")) {
    l <- gsub(punct, " ", l)
  }

  cleaned <- trimws(trimES(l))
  keep <- nchar(cleaned) > 0
  cleaned[keep]
}
massimoaria/bibliometrix documentation built on April 29, 2024, 2:15 p.m.