R/metaTagExtraction.R

Defines functions ND removeLastChar lastChar AU_UN AU1_CO AU_CO CR_SO CR_AU SR metaTagExtraction

Documented in metaTagExtraction

#' Meta-Field Tag Extraction
#'
#' It extracts other field tags, different from the standard WoS/SCOPUS codify.
#' @param M is a data frame obtained by the converting function \code{\link{convert2df}}.
#'        It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
#' @param Field is a character object. New tag extracted from aggregated data is specified by this string.
#' Field can be equal to one of these tags:
#' \tabular{lll}{
#' \code{"CR_AU"}\tab   \tab First Author of each cited reference\cr
#' \code{"CR_SO"}\tab   \tab Source of each cited reference\cr
#' \code{"AU_CO"}\tab   \tab Country of affiliation for co-authors \cr
#' \code{"AU1_CO"}\tab   \tab Country of affiliation for the first author\cr
#' \code{"AU_UN"}\tab   \tab University of affiliation for each co-author and the corresponding author (AU1_UN)\cr
#' \code{"SR"}\tab     \tab Short tag of the document (as used in reference lists)}
#'
#' @param sep is the field separator character. This character separates strings in each column of the data frame. The default is \code{sep = ";"}.
#' @param aff.disamb is a logical. If TRUE and Field="AU_UN", then a disambiguation algorithm is used to identify and match scientific affiliations
#' (univ, research centers, etc.). The default is \code{aff.disamb=TRUE}.
#' @return the bibliometric data frame with a new column containing data about new field tag indicated in the argument \code{Field}.
#'
#'
#'
#' @examples
#' # Example 1: First Authors for each cited reference
#'
#' data(scientometrics, package = "bibliometrixData")
#' scientometrics <- metaTagExtraction(scientometrics, Field = "CR_AU", sep = ";")
#' unlist(strsplit(scientometrics$CR_AU[1], ";"))
#'
#'
#' # Example 2: Source for each cited reference
#'
#' data(scientometrics)
#' scientometrics <- metaTagExtraction(scientometrics, Field = "CR_SO", sep = ";")
#' unlist(strsplit(scientometrics$CR_SO[1], ";"))
#'
#' # Example 3: Affiliation country for co-authors
#'
#' data(scientometrics)
#' scientometrics <- metaTagExtraction(scientometrics, Field = "AU_CO", sep = ";")
#' scientometrics$AU_CO[1:10]
#'
#' @seealso \code{\link{convert2df}} for importing and converting bibliographic files into a data frame.
#' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis
#'
#' @export

metaTagExtraction <- function(M, Field = "CR_AU", sep = ";", aff.disamb = TRUE) {
  ### data cleaning
  if ("CR" %in% names(M)) {
    M$CR <- gsub("DOI;", "DOI ", as.character(M$CR))
  }


  ### SR field creation

  if (Field == "SR") {
    M <- SR(M)
  }


  if (Field == "CR_AU") {
    M <- CR_AU(M, sep)
  }

  ### CR_SO field creation

  if (Field == "CR_SO") {
    M <- CR_SO(M, sep)
  }
  ### AU_CO field creation

  if (Field == "AU_CO") {
    M <- AU_CO(M)
  }

  if (Field == "AU1_CO") {
    M <- AU1_CO(M, sep)
  }


  # UNIVERSITY AFFILIATION OF ALL AUTHORS AND CORRESPONDING AUTHOR
  if (Field == "AU_UN") {
    ### with disambiguation
    if (isTRUE(aff.disamb)) {
      M <- AU_UN(M, sep)
    } else {
      ### without disambiguation
      M$AU_UN <- gsub("\\[.*?\\] ", "", M$C1)
      M$AU1_UN <- unlist(lapply(strsplit(M$RP, sep), function(l) {
        l <- l[1]
        return(l)
      }))
      ind <- regexpr("\\),", M$AU1_UN)
      a <- which(ind > -1)
      M$AU1_UN[a] <- trim(substr(M$AU1_UN[a], ind[a] + 2, nchar(M$AU1_UN[a])))
    }
  }

  return(M)
}


### SR field
SR <- function(M) {
  listAU <- strsplit(as.character(M$AU), ";")
  listAU <- lapply(listAU, function(l) trim.leading(l))
  if (M$DB[1] == "scopus") {
    listAU <- lapply(listAU, function(l) {
      l <- trim(l)
      l <- sub(" ", ",", l, fixed = TRUE)
      l <- sub(",,", ",", l, fixed = TRUE)
      l <- gsub(" ", "", l, fixed = TRUE)
    })
  }
  FirstAuthors <- gsub(",", " ", unlist(lapply(listAU, function(l) {
    if (length(l > 0)) {
      l <- l[[1]]
    } else {
      (l <- "NA")
    }
    return(l)
  })))

  if (!is.null(M$J9)) {
    ## replace full title in no iso names
    no_art <- which(is.na(M$J9) & is.na(M$JI))
    M$J9[no_art] <- M$SO[no_art]
    ## repleace NA in J9 with JIO
    ind <- which(is.na(M$J9))
    M$J9[ind] <- trim(gsub("\\.", " ", M$JI[ind]))
    SR <- paste(FirstAuthors, M$PY, M$J9, sep = ", ")
  } else {
    no_art <- which(is.na(M$JI))
    M$JI[no_art] <- M$SO[no_art]
    J9 <- trim(gsub("\\.", " ", M$JI))
    SR <- paste(FirstAuthors, M$PY, J9, sep = ", ")
  }

  M$SR_FULL <- gsub("\\s+", " ", SR)

  ## assign an unique name to each document
  SR <- gsub("\\s+", " ", SR)
  st <- i <- 0
  while (st == 0) {
    ind <- which(duplicated(SR))
    if (length(ind) > 0) {
      i <- i + 1
      SR[ind] <- paste0(SR[ind], "-", letters[i], sep = "")
    } else {
      st <- 1
    }
  }
  M$SR <- SR
  # M$SR<- gsub("\\s+", " ", SR)
  return(M)
}

### CR_AU field
CR_AU <- function(M, sep) {
  FCAU <- list(NULL)
  CCR <- NULL
  size <- dim(M)
  CR <- M$CR
  listCAU <- strsplit(as.character(CR), sep)
  listCAU <- lapply(listCAU, function(l) l <- l[nchar(l) > 10]) ## delete not congruent references

  # vector of cited authors
  for (i in 1:size[1]) {
    FCAU[[i]] <- gsub("[[:punct:]]", "", trim.leading(sub(",.*", "", listCAU[[i]])))
    CCR[i] <- paste(FCAU[[i]], collapse = ";")
  }

  M$CR_AU <- CCR
  return(M)
}

### CR_SO field
CR_SO <- function(M, sep) {
  FCAU <- list(NULL)
  CCR <- NULL
  size <- dim(M)
  CR <- M$CR
  listCAU <- strsplit(as.character(CR), sep)


  # vector of cited Journals
  if (M$DB[1] == "ISI") {
    for (i in 1:size[1]) {
      elem <- strsplit(as.character(listCAU[[i]]), ",")
      ind <- lengths(elem)
      if (max(ind) > 2) {
        elem <- elem[ind > 2]
        FCAU[[i]] <- trim.leading(unlist(lapply(elem, function(l) l[[3]])))
        CCR[i] <- paste(FCAU[[i]], collapse = ";")
      } else {
        CCR[[i]] <- NA
      }
    }
  } else if (M$DB[1] == "SCOPUS") {
    for (i in 1:size[1]) {
      listCAU[[i]] <- gsub(".*?\\) ", "", listCAU[[i]])
      elem <- strsplit(as.character(listCAU[[i]]), ",")
      ind <- lengths(elem)
      CCR[[i]] <- NA
      if (length(ind) > 0) {
        if (max(ind) > 2) {
          elem <- elem[ind > 2]
          FCAU[[i]] <- trim.leading(unlist(lapply(elem, function(l) l[[1]])))
          CCR[i] <- paste(FCAU[[i]], collapse = ";")
        }
      }
    }
  }

  M$CR_SO <- unlist(CCR)
  return(M)
}

### AU_CO field
AU_CO <- function(M) {
  # Countries
  size <- dim(M)[1]
  data("countries", envir = environment())
  countries <- as.character(countries[[1]])
  if (M$DB[1] %in% c("ISI", "PUBMED")) {
    countries <- as.character(sapply(countries, function(s) paste0(s, ".", collapse = "")))
  } else if (M$DB[1] == "SCOPUS") {
    countries <- as.character(sapply(countries, function(s) paste0(s, ";", collapse = "")))
  } else if (M$DB[1] == "DIMENSIONS") {
    countries <- as.character(sapply(countries, function(s) paste0(s, ")", collapse = "")))
  }


  M$AU_CO <- NA
  C1 <- M$C1
  ## must replace all NA before "removing reprint info", or NA_character_ became string "NA"
  if ("RP" %in% names(M)) {
    C1[which(is.na(C1))] <- M$RP[which(is.na(C1))]
  } else {
    M$RP <- NA
  }

  ## remove reprint information from C1
  C1 <- unlist(lapply(C1, function(l) {
    if (!is.na(l)) {
      l <- unlist(strsplit(l, ";"))
      # l=l[regexpr("REPRINT AUTHOR",l)==-1]
      l <- paste0(l, collapse = ";")
    } else {
      "NA"
    }
  }))
  ## above changes all NA to 'NA'

  C1 <- gsub("\\[.*?\\] ", "", C1)
  ## change 'NA' back to NA
  C1[which(C1 == "NA")] <- NA
  if (M$DB[1] == "ISI") {
    C1 <- lastChar(C1, last = ".")
  }
  if (M$DB[1] == "SCOPUS") {
    C1 <- lastChar(C1, last = ";")
  }
  if (M$DB[1] == "DIMENSIONS") {
    C1 <- gsub("\\.)", ")", C1)
  }

  # C1=gsub("[[:punct:][:blank:]]+", " ", C1)
  RP <- M$RP
  # RP[which(is.na(RP))]=M$RRP)
  RP <- paste(RP, ";", sep = "")
  # RP = gsub("[[:punct:][:blank:]]+", " ", RP)
  ## this will make gregexpr(l, RP[i], fixed=TRUE) fail as countries has the format '_COUNTRY_.' or '_COUNTRY_;'

  for (i in 1:size[1]) {
    if (!is.na(C1[i])) {
      ind <- unlist(sapply(countries, function(l) (gregexpr(l, C1[i], fixed = TRUE))))
      if (sum(ind > -1) > 0) {
        M$AU_CO[i] <- paste(unique(names(ind[ind > -1])), collapse = ";")
      }
    }
    if (is.na(M$AU_CO[i])) {
      ind <- unlist(sapply(countries, function(l) (gregexpr(l, RP[i], fixed = TRUE))))
      if (sum(ind > -1) > 0) {
        M$AU_CO[i] <- paste(unique(names(ind[ind > -1])), collapse = ";")
      }
    }
  }

  M$AU_CO <- gsub("[[:digit:]]", "", M$AU_CO)
  M$AU_CO <- gsub("\\(|)", "", M$AU_CO)
  M$AU_CO <- gsub(".", "", M$AU_CO, fixed = TRUE)
  M$AU_CO <- gsub(";;", ";", M$AU_CO, fixed = TRUE)
  M$AU_CO <- gsub("UNITED STATES", "USA", M$AU_CO)
  M$AU_CO <- gsub("RUSSIAN FEDERATION", "RUSSIA", M$AU_CO)
  M$AU_CO <- gsub("TAIWAN", "CHINA", M$AU_CO)
  M$AU_CO <- gsub("ENGLAND", "UNITED KINGDOM", M$AU_CO)
  M$AU_CO <- gsub("SCOTLAND", "UNITED KINGDOM", M$AU_CO)
  M$AU_CO <- gsub("WALES", "UNITED KINGDOM", M$AU_CO)
  M$AU_CO <- gsub("NORTH IRELAND", "UNITED KINGDOM", M$AU_CO)

  if (M$DB[1] == "ISI") {
    M$AU_CO <- removeLastChar(M$AU_CO, last = ".")
  }
  if (M$DB[1] == "SCOPUS") {
    M$AU_CO <- removeLastChar(M$AU_CO, last = ";")
  }

  return(M)
}

### AU1_CO field
AU1_CO <- function(M, sep) {
  if (M$DB[1] == "DIMENSIONS") {
    M2 <- AU_CO(M)
    M$AU1_CO <- unlist(lapply(strsplit(M2$AU_CO, ";"), function(l) l[1]))
  } else {
    size <- dim(M)[1]
    # Countries
    data("countries", envir = environment())
    countries <- as.character(countries[[1]])
    countries <- paste(" ", countries, " ", sep = "")
    M$AU1_CO <- NA
    C1 <- M$C1
    C1[which(!is.na(M$RP))] <- M$RP[which(!is.na(M$RP))]
    ## do this before strsplit(), otherwise entries with multiple (reprint) author would be split between first group of authors
    C1 <- gsub("\\[.*?\\] ", "", C1)
    ## remove string before the first "(REPRINT AUTHOR)", otherwise C1 may get split between first group of authors, thus removing address, forcing it to default to RP.



    C1 <- gsub("^.*?\\(REPRINT\\sAUTHOR\\)", "", C1)
    C1 <- unlist(lapply(strsplit(C1, sep), function(l) l[1]))
    ## remove all characters before the last comma, thus constantly leaving only country, or in the case of US, state + zip_code + country.
    ## this way the need to distinguish Georgia and Georgia, US is eliminated.
    C1 <- gsub("^(.+)?,", "", C1)
    C1 <- gsub("[[:punct:][:blank:]]+", " ", C1)
    C1 <- paste(trim(C1), " ", sep = "")
    if (M$DB[1] != "PUBMED") {
      RP <- M$RP
      # RP[which(is.na(RP))]=M$RRP)
      RP <- paste(RP, ";", sep = "")
      RP <- gsub("[[:punct:][:blank:]]+", " ", RP)
    } else {
      RP <- C1 <- paste(" ", gsub("[[:punct:]]", "", C1), sep = "")
    }

    for (i in 1:size[1]) {
      if (!is.na(C1[i])) {
        ind <- unlist(sapply(countries, function(l) (gregexpr(l, C1[i], fixed = TRUE))))
        if (sum(ind > -1) > 0) {
          M$AU1_CO[i] <- paste(unique(names(ind[ind > -1][1])), collapse = ";")
          # print(i)
          # print(M$AU1_CO[i])
        }
      }
      if (is.na(M$AU1_CO[i])) {
        ind <- unlist(sapply(countries, function(l) (gregexpr(l, RP[i], fixed = TRUE))))
        if (sum(ind > -1) > 0) {
          M$AU1_CO[i] <- paste(unique(names(ind[ind > -1][1])), collapse = ";")
        }
      }
    }
    M$AU1_CO <- trim(gsub("[[:digit:]]", "", M$AU1_CO))
    M$AU1_CO <- gsub("UNITED STATES", "USA", M$AU1_CO)
    M$AU1_CO <- gsub("RUSSIAN FEDERATION", "RUSSIA", M$AU1_CO)
    M$AU1_CO <- gsub("TAIWAN", "CHINA", M$AU1_CO)
    M$AU1_CO <- gsub("ENGLAND", "UNITED KINGDOM", M$AU1_CO)
    M$AU1_CO <- gsub("SCOTLAND", "UNITED KINGDOM", M$AU1_CO)
    M$AU1_CO <- gsub("WALES", "UNITED KINGDOM", M$AU1_CO)
    M$AU1_CO <- gsub("NORTH IRELAND", "UNITED KINGDOM", M$AU1_CO)
    # M$AU1_CO=gsub(".", "", M$AU1_CO, fixed = TRUE)
    # M$AU1_CO=gsub(";;", ";", M$AU1_CO, fixed = TRUE)
  }
  return(M)
}

### AU_UN field
AU_UN <- function(M, sep) {
  ## remove reprint information from C1
  C1 <- M$C1
  if (!("RP" %in% names(M))) {
    M$RP <- NA
  }
  # C1=unlist(lapply(C1,function(l){
  #   l=unlist(strsplit(l,";"))
  #   #l=l[regexpr("REPRINT AUTHOR",l)==-1]
  #   l=paste0(l,collapse=";")
  # }))
  ###
  AFF <- gsub("\\[.*?\\] ", "", C1)
  indna <- which(is.na(AFF))
  if (length(indna) > 0) {
    AFF[indna] <- M$RP[indna]
  }
  nc <- nchar(AFF)
  AFF[nc == 0] <- NA

  listAFF <- strsplit(AFF, sep, fixed = TRUE)

  uTags <- c(
    "UNIV", "COLL", "SCH", "INST", "ACAD", "ECOLE", "CTR", "SCI", "CENTRE", "CENTER", "CENTRO", "HOSP", "ASSOC", "COUNCIL",
    "FONDAZ", "FOUNDAT", "ISTIT", "LAB", "TECH", "RES", "CNR", "ARCH", "SCUOLA", "PATENT OFF", "CENT LIB", "HEALTH", "NATL",
    "LIBRAR", "CLIN", "FDN", "OECD", "FAC", "WORLD BANK", "POLITECN", "INT MONETARY FUND", "CLIMA", "METEOR", "OFFICE", "ENVIR",
    "CONSORTIUM", "OBSERVAT", "AGRI", "MIT ", "INFN", "SUNY "
  )

  AFFL <- lapply(listAFF, function(l) {
    # l=gsub(","," ,",l)
    l <- gsub("\\(REPRINT AUTHOR\\)", "", l)
    index <- NULL

    for (i in 1:length(l)) {
      # ind=list()
      affL <- unlist(strsplit(l[i], ",", fixed = TRUE))

      indd <- unlist(lapply(uTags, function(x) which(regexpr(x, affL, fixed = TRUE) != -1)))


      # if (length(indd)==0){index=append(index,"NR")
      #   } else if (grepl("[[:digit:]]", affL[indd[1]])){index=append(index,"ND")
      #   } else {index=append(index,affL[indd[1]])}

      if (length(indd) == 0) {
        index <- append(index, "NOTREPORTED")
      } else if (isTRUE(ND(affL, indd)$cond)) {
        index <- append(index, "NOTDECLARED")
      } else {
        index <- append(index, ND(affL, indd)$affL)
      }
    }
    # index=unique(c(ind1,ind2,ind3,ind4,ind5,ind6,ind7,ind8))
    x <- ""
    if (length(index) > 0) {
      # x=paste0(trim.leading((affL[index])),collapse=",")
      x <- paste0(trim.leading(index), collapse = ";")
      x <- gsub(" ,", ";", x)
    }
    return(x)
  })
  AFFL <- unlist(AFFL)
  M$AU_UN <- AFFL
  if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)) {
    M$AU_UN[!is.na(M$C3) & M$C3 != ""] <- M$C3[!is.na(M$C3) & M$C3 != ""]
    M$AU_UN <- unlist(lapply(strsplit(M$AU_UN, sep), function(l) {
      l <- paste(trim(l), collapse = sep)
    }))
  }
  M$AU_UN <- gsub("\\\\&", "AND", M$AU_UN)
  M$AU_UN <- gsub("\\&", "AND", M$AU_UN)


  ## identification of Corresponding author affiliation
  RP <- M$RP
  RP[is.na(RP)] <- M$C1[is.na(RP)]
  AFF <- gsub("\\[.*?\\] ", "", RP)
  indna <- which(is.na(AFF))
  if (length(indna) > 0) {
    AFF[indna] <- M$RP[indna]
  }
  nc <- nchar(AFF)
  AFF[nc == 0] <- NA

  listAFF <- strsplit(AFF, sep, fixed = TRUE)

  AFFL <- lapply(listAFF, function(l) {
    # l=gsub(","," ,",l)
    l <- gsub("\\(REPRINT AUTHOR\\)", "", l)
    index <- NULL

    for (i in 1:length(l)) {
      # ind=list()
      affL <- unlist(strsplit(l[i], ",", fixed = TRUE))

      indd <- unlist(lapply(uTags, function(x) which(regexpr(x, affL, fixed = TRUE) != -1)))

      if (length(indd) == 0) {
        index <- append(index, "NOTREPORTED")
      } else if (grepl("[[:digit:]]", affL[indd[1]])) {
        index <- append(index, "NOTDECLARED")
      } else {
        index <- append(index, affL[indd[1]])
      }
    }
    # index=unique(c(ind1,ind2,ind3,ind4,ind5,ind6,ind7,ind8))
    x <- ""
    if (length(index) > 0) {
      # x=paste0(trim.leading((affL[index])),collapse=",")
      x <- paste0(trim.leading(index), collapse = ";")
      x <- gsub(" ,", ";", x)
    }
    return(x)
  })
  AFFL <- unlist(AFFL)
  M$AU1_UN <- AFFL
  M$AU1_UN <- gsub("\\\\&", "AND", M$AU1_UN)
  M$AU1_UN <- gsub("\\&", "AND", M$AU1_UN)

  ## identification of NR affiliations
  M$AU_UN_NR <- NA
  listAFF2 <- strsplit(M$AU_UN, sep)
  cont <- lapply(listAFF2, function(l) {
    l <- unlist(l)
    ind <- which(l %in% "NR")
  })

  for (i in 1:length(cont)) {
    if (length(cont[[i]]) > 0) {
      M$AU_UN_NR[i] <- paste(trim(listAFF[[i]][cont[[i]]]), collapse = ";")
    }
  }
  M$AU_UN[is.na(AFF)] <- NA
  M$AU_UN[M$AU_UN == "NOTDECLARED"] <- NA
  M$AU_UN[M$AU_UN == "NOTREPORTED"] <- NA
  M$AU_UN <- gsub("NOTREPORTED;", "", M$AU_UN)
  M$AU_UN <- gsub(";NOTREPORTED", "", M$AU_UN)
  M$AU_UN <- gsub("NOTDECLARED;", "", M$AU_UN)
  M$AU_UN <- gsub("NOTDECLARED", "", M$AU_UN)

  return(M)
}


### tools
lastChar <- function(C, last = ".") {
  A <- substr(C, nchar(C), nchar(C))
  ind <- which((A != last) & (!is.na(A)))
  C[ind] <- paste0(C[ind], last)
  return(C)
}

removeLastChar <- function(C, last = ".") {
  A <- substr(C, nchar(C), nchar(C))
  ind <- which((A == last) & (!is.na(A)))
  C[ind] <- substr(C[ind], 1, (nchar(C[ind]) - 1))
  return(C)
}
### remove non interesting field
ND <- function(affL, indd) {
  aff <- affL[!grepl("[[:digit:]]", affL)]
  ind <- indd[!grepl("[[:digit:]]", affL[indd])]
  cond <- length(ind) < 1
  r <- list(affL = aff[ind[1]], cond = cond)
  return(r)
}
massimoaria/bibliometrix documentation built on June 15, 2025, 2:06 a.m.