#' Meta-Field Tag Extraction
#'
#' It extracts other field tags, different from the standard WoS/SCOPUS codify.
#' @param M is a data frame obtained by the converting function \code{\link{convert2df}}.
#' It is a data matrix with cases corresponding to articles and variables to Field Tag in the original WoS or SCOPUS file.
#' @param Field is a character object. New tag extracted from aggregated data is specified by this string.
#' Field can be equal to one of these tags:
#' \tabular{lll}{
#' \code{"CR_AU"}\tab \tab First Author of each cited reference\cr
#' \code{"CR_SO"}\tab \tab Source of each cited reference\cr
#' \code{"AU_CO"}\tab \tab Country of affiliation for co-authors \cr
#' \code{"AU1_CO"}\tab \tab Country of affiliation for the first author\cr
#' \code{"AU_UN"}\tab \tab University of affiliation for each co-author and the corresponding author (AU1_UN)\cr
#' \code{"SR"}\tab \tab Short tag of the document (as used in reference lists)}
#'
#' @param sep is the field separator character. This character separates strings in each column of the data frame. The default is \code{sep = ";"}.
#' @param aff.disamb is a logical. If TRUE and Field="AU_UN", then a disambiguation algorithm is used to identify and match scientific affiliations
#' (univ, research centers, etc.). The default is \code{aff.disamb=TRUE}.
#' @return the bibliometric data frame with a new column containing data about new field tag indicated in the argument \code{Field}.
#'
#'
#'
#' @examples
#' # Example 1: First Authors for each cited reference
#'
#' data(scientometrics, package = "bibliometrixData")
#' scientometrics <- metaTagExtraction(scientometrics, Field = "CR_AU", sep = ";")
#' unlist(strsplit(scientometrics$CR_AU[1], ";"))
#'
#'
#' # Example 2: Source for each cited reference
#'
#' data(scientometrics)
#' scientometrics <- metaTagExtraction(scientometrics, Field = "CR_SO", sep = ";")
#' unlist(strsplit(scientometrics$CR_SO[1], ";"))
#'
#' # Example 3: Affiliation country for co-authors
#'
#' data(scientometrics)
#' scientometrics <- metaTagExtraction(scientometrics, Field = "AU_CO", sep = ";")
#' scientometrics$AU_CO[1:10]
#'
#' @seealso \code{\link{convert2df}} for importing and converting bibliographic files into a data frame.
#' @seealso \code{\link{biblioAnalysis}} function for bibliometric analysis
#'
#' @export
metaTagExtraction <- function(M, Field = "CR_AU", sep = ";", aff.disamb = TRUE) {
### data cleaning
if ("CR" %in% names(M)) {
M$CR <- gsub("DOI;", "DOI ", as.character(M$CR))
}
### SR field creation
if (Field == "SR") {
M <- SR(M)
}
if (Field == "CR_AU") {
M <- CR_AU(M, sep)
}
### CR_SO field creation
if (Field == "CR_SO") {
M <- CR_SO(M, sep)
}
### AU_CO field creation
if (Field == "AU_CO") {
M <- AU_CO(M)
}
if (Field == "AU1_CO") {
M <- AU1_CO(M, sep)
}
# UNIVERSITY AFFILIATION OF ALL AUTHORS AND CORRESPONDING AUTHOR
if (Field == "AU_UN") {
### with disambiguation
if (isTRUE(aff.disamb)) {
M <- AU_UN(M, sep)
} else {
### without disambiguation
M$AU_UN <- gsub("\\[.*?\\] ", "", M$C1)
M$AU1_UN <- unlist(lapply(strsplit(M$RP, sep), function(l) {
l <- l[1]
return(l)
}))
ind <- regexpr("\\),", M$AU1_UN)
a <- which(ind > -1)
M$AU1_UN[a] <- trim(substr(M$AU1_UN[a], ind[a] + 2, nchar(M$AU1_UN[a])))
}
}
return(M)
}
### SR field
SR <- function(M) {
listAU <- strsplit(as.character(M$AU), ";")
listAU <- lapply(listAU, function(l) trim.leading(l))
if (M$DB[1] == "scopus") {
listAU <- lapply(listAU, function(l) {
l <- trim(l)
l <- sub(" ", ",", l, fixed = TRUE)
l <- sub(",,", ",", l, fixed = TRUE)
l <- gsub(" ", "", l, fixed = TRUE)
})
}
FirstAuthors <- gsub(",", " ", unlist(lapply(listAU, function(l) {
if (length(l > 0)) {
l <- l[[1]]
} else {
(l <- "NA")
}
return(l)
})))
if (!is.null(M$J9)) {
## replace full title in no iso names
no_art <- which(is.na(M$J9) & is.na(M$JI))
M$J9[no_art] <- M$SO[no_art]
## repleace NA in J9 with JIO
ind <- which(is.na(M$J9))
M$J9[ind] <- trim(gsub("\\.", " ", M$JI[ind]))
SR <- paste(FirstAuthors, M$PY, M$J9, sep = ", ")
} else {
no_art <- which(is.na(M$JI))
M$JI[no_art] <- M$SO[no_art]
J9 <- trim(gsub("\\.", " ", M$JI))
SR <- paste(FirstAuthors, M$PY, J9, sep = ", ")
}
M$SR_FULL <- gsub("\\s+", " ", SR)
## assign an unique name to each document
SR <- gsub("\\s+", " ", SR)
st <- i <- 0
while (st == 0) {
ind <- which(duplicated(SR))
if (length(ind) > 0) {
i <- i + 1
SR[ind] <- paste0(SR[ind], "-", letters[i], sep = "")
} else {
st <- 1
}
}
M$SR <- SR
# M$SR<- gsub("\\s+", " ", SR)
return(M)
}
### CR_AU field
CR_AU <- function(M, sep) {
FCAU <- list(NULL)
CCR <- NULL
size <- dim(M)
CR <- M$CR
listCAU <- strsplit(as.character(CR), sep)
listCAU <- lapply(listCAU, function(l) l <- l[nchar(l) > 10]) ## delete not congruent references
# vector of cited authors
for (i in 1:size[1]) {
FCAU[[i]] <- gsub("[[:punct:]]", "", trim.leading(sub(",.*", "", listCAU[[i]])))
CCR[i] <- paste(FCAU[[i]], collapse = ";")
}
M$CR_AU <- CCR
return(M)
}
### CR_SO field
CR_SO <- function(M, sep) {
FCAU <- list(NULL)
CCR <- NULL
size <- dim(M)
CR <- M$CR
listCAU <- strsplit(as.character(CR), sep)
# vector of cited Journals
if (M$DB[1] == "ISI") {
for (i in 1:size[1]) {
elem <- strsplit(as.character(listCAU[[i]]), ",")
ind <- lengths(elem)
if (max(ind) > 2) {
elem <- elem[ind > 2]
FCAU[[i]] <- trim.leading(unlist(lapply(elem, function(l) l[[3]])))
CCR[i] <- paste(FCAU[[i]], collapse = ";")
} else {
CCR[[i]] <- NA
}
}
} else if (M$DB[1] == "SCOPUS") {
for (i in 1:size[1]) {
listCAU[[i]] <- gsub(".*?\\) ", "", listCAU[[i]])
elem <- strsplit(as.character(listCAU[[i]]), ",")
ind <- lengths(elem)
CCR[[i]] <- NA
if (length(ind) > 0) {
if (max(ind) > 2) {
elem <- elem[ind > 2]
FCAU[[i]] <- trim.leading(unlist(lapply(elem, function(l) l[[1]])))
CCR[i] <- paste(FCAU[[i]], collapse = ";")
}
}
}
}
M$CR_SO <- unlist(CCR)
return(M)
}
### AU_CO field
AU_CO <- function(M) {
# Countries
size <- dim(M)[1]
data("countries", envir = environment())
countries <- as.character(countries[[1]])
if (M$DB[1] %in% c("ISI", "PUBMED")) {
countries <- as.character(sapply(countries, function(s) paste0(s, ".", collapse = "")))
} else if (M$DB[1] == "SCOPUS") {
countries <- as.character(sapply(countries, function(s) paste0(s, ";", collapse = "")))
} else if (M$DB[1] == "DIMENSIONS") {
countries <- as.character(sapply(countries, function(s) paste0(s, ")", collapse = "")))
}
M$AU_CO <- NA
C1 <- M$C1
## must replace all NA before "removing reprint info", or NA_character_ became string "NA"
if ("RP" %in% names(M)) {
C1[which(is.na(C1))] <- M$RP[which(is.na(C1))]
} else {
M$RP <- NA
}
## remove reprint information from C1
C1 <- unlist(lapply(C1, function(l) {
if (!is.na(l)) {
l <- unlist(strsplit(l, ";"))
# l=l[regexpr("REPRINT AUTHOR",l)==-1]
l <- paste0(l, collapse = ";")
} else {
"NA"
}
}))
## above changes all NA to 'NA'
C1 <- gsub("\\[.*?\\] ", "", C1)
## change 'NA' back to NA
C1[which(C1 == "NA")] <- NA
if (M$DB[1] == "ISI") {
C1 <- lastChar(C1, last = ".")
}
if (M$DB[1] == "SCOPUS") {
C1 <- lastChar(C1, last = ";")
}
if (M$DB[1] == "DIMENSIONS") {
C1 <- gsub("\\.)", ")", C1)
}
# C1=gsub("[[:punct:][:blank:]]+", " ", C1)
RP <- M$RP
# RP[which(is.na(RP))]=M$RRP)
RP <- paste(RP, ";", sep = "")
# RP = gsub("[[:punct:][:blank:]]+", " ", RP)
## this will make gregexpr(l, RP[i], fixed=TRUE) fail as countries has the format '_COUNTRY_.' or '_COUNTRY_;'
for (i in 1:size[1]) {
if (!is.na(C1[i])) {
ind <- unlist(sapply(countries, function(l) (gregexpr(l, C1[i], fixed = TRUE))))
if (sum(ind > -1) > 0) {
M$AU_CO[i] <- paste(unique(names(ind[ind > -1])), collapse = ";")
}
}
if (is.na(M$AU_CO[i])) {
ind <- unlist(sapply(countries, function(l) (gregexpr(l, RP[i], fixed = TRUE))))
if (sum(ind > -1) > 0) {
M$AU_CO[i] <- paste(unique(names(ind[ind > -1])), collapse = ";")
}
}
}
M$AU_CO <- gsub("[[:digit:]]", "", M$AU_CO)
M$AU_CO <- gsub("\\(|)", "", M$AU_CO)
M$AU_CO <- gsub(".", "", M$AU_CO, fixed = TRUE)
M$AU_CO <- gsub(";;", ";", M$AU_CO, fixed = TRUE)
M$AU_CO <- gsub("UNITED STATES", "USA", M$AU_CO)
M$AU_CO <- gsub("RUSSIAN FEDERATION", "RUSSIA", M$AU_CO)
M$AU_CO <- gsub("TAIWAN", "CHINA", M$AU_CO)
M$AU_CO <- gsub("ENGLAND", "UNITED KINGDOM", M$AU_CO)
M$AU_CO <- gsub("SCOTLAND", "UNITED KINGDOM", M$AU_CO)
M$AU_CO <- gsub("WALES", "UNITED KINGDOM", M$AU_CO)
M$AU_CO <- gsub("NORTH IRELAND", "UNITED KINGDOM", M$AU_CO)
if (M$DB[1] == "ISI") {
M$AU_CO <- removeLastChar(M$AU_CO, last = ".")
}
if (M$DB[1] == "SCOPUS") {
M$AU_CO <- removeLastChar(M$AU_CO, last = ";")
}
return(M)
}
### AU1_CO field
AU1_CO <- function(M, sep) {
if (M$DB[1] == "DIMENSIONS") {
M2 <- AU_CO(M)
M$AU1_CO <- unlist(lapply(strsplit(M2$AU_CO, ";"), function(l) l[1]))
} else {
size <- dim(M)[1]
# Countries
data("countries", envir = environment())
countries <- as.character(countries[[1]])
countries <- paste(" ", countries, " ", sep = "")
M$AU1_CO <- NA
C1 <- M$C1
C1[which(!is.na(M$RP))] <- M$RP[which(!is.na(M$RP))]
## do this before strsplit(), otherwise entries with multiple (reprint) author would be split between first group of authors
C1 <- gsub("\\[.*?\\] ", "", C1)
## remove string before the first "(REPRINT AUTHOR)", otherwise C1 may get split between first group of authors, thus removing address, forcing it to default to RP.
C1 <- gsub("^.*?\\(REPRINT\\sAUTHOR\\)", "", C1)
C1 <- unlist(lapply(strsplit(C1, sep), function(l) l[1]))
## remove all characters before the last comma, thus constantly leaving only country, or in the case of US, state + zip_code + country.
## this way the need to distinguish Georgia and Georgia, US is eliminated.
C1 <- gsub("^(.+)?,", "", C1)
C1 <- gsub("[[:punct:][:blank:]]+", " ", C1)
C1 <- paste(trim(C1), " ", sep = "")
if (M$DB[1] != "PUBMED") {
RP <- M$RP
# RP[which(is.na(RP))]=M$RRP)
RP <- paste(RP, ";", sep = "")
RP <- gsub("[[:punct:][:blank:]]+", " ", RP)
} else {
RP <- C1 <- paste(" ", gsub("[[:punct:]]", "", C1), sep = "")
}
for (i in 1:size[1]) {
if (!is.na(C1[i])) {
ind <- unlist(sapply(countries, function(l) (gregexpr(l, C1[i], fixed = TRUE))))
if (sum(ind > -1) > 0) {
M$AU1_CO[i] <- paste(unique(names(ind[ind > -1][1])), collapse = ";")
# print(i)
# print(M$AU1_CO[i])
}
}
if (is.na(M$AU1_CO[i])) {
ind <- unlist(sapply(countries, function(l) (gregexpr(l, RP[i], fixed = TRUE))))
if (sum(ind > -1) > 0) {
M$AU1_CO[i] <- paste(unique(names(ind[ind > -1][1])), collapse = ";")
}
}
}
M$AU1_CO <- trim(gsub("[[:digit:]]", "", M$AU1_CO))
M$AU1_CO <- gsub("UNITED STATES", "USA", M$AU1_CO)
M$AU1_CO <- gsub("RUSSIAN FEDERATION", "RUSSIA", M$AU1_CO)
M$AU1_CO <- gsub("TAIWAN", "CHINA", M$AU1_CO)
M$AU1_CO <- gsub("ENGLAND", "UNITED KINGDOM", M$AU1_CO)
M$AU1_CO <- gsub("SCOTLAND", "UNITED KINGDOM", M$AU1_CO)
M$AU1_CO <- gsub("WALES", "UNITED KINGDOM", M$AU1_CO)
M$AU1_CO <- gsub("NORTH IRELAND", "UNITED KINGDOM", M$AU1_CO)
# M$AU1_CO=gsub(".", "", M$AU1_CO, fixed = TRUE)
# M$AU1_CO=gsub(";;", ";", M$AU1_CO, fixed = TRUE)
}
return(M)
}
### AU_UN field
AU_UN <- function(M, sep) {
## remove reprint information from C1
C1 <- M$C1
if (!("RP" %in% names(M))) {
M$RP <- NA
}
# C1=unlist(lapply(C1,function(l){
# l=unlist(strsplit(l,";"))
# #l=l[regexpr("REPRINT AUTHOR",l)==-1]
# l=paste0(l,collapse=";")
# }))
###
AFF <- gsub("\\[.*?\\] ", "", C1)
indna <- which(is.na(AFF))
if (length(indna) > 0) {
AFF[indna] <- M$RP[indna]
}
nc <- nchar(AFF)
AFF[nc == 0] <- NA
listAFF <- strsplit(AFF, sep, fixed = TRUE)
uTags <- c(
"UNIV", "COLL", "SCH", "INST", "ACAD", "ECOLE", "CTR", "SCI", "CENTRE", "CENTER", "CENTRO", "HOSP", "ASSOC", "COUNCIL",
"FONDAZ", "FOUNDAT", "ISTIT", "LAB", "TECH", "RES", "CNR", "ARCH", "SCUOLA", "PATENT OFF", "CENT LIB", "HEALTH", "NATL",
"LIBRAR", "CLIN", "FDN", "OECD", "FAC", "WORLD BANK", "POLITECN", "INT MONETARY FUND", "CLIMA", "METEOR", "OFFICE", "ENVIR",
"CONSORTIUM", "OBSERVAT", "AGRI", "MIT ", "INFN", "SUNY "
)
AFFL <- lapply(listAFF, function(l) {
# l=gsub(","," ,",l)
l <- gsub("\\(REPRINT AUTHOR\\)", "", l)
index <- NULL
for (i in 1:length(l)) {
# ind=list()
affL <- unlist(strsplit(l[i], ",", fixed = TRUE))
indd <- unlist(lapply(uTags, function(x) which(regexpr(x, affL, fixed = TRUE) != -1)))
# if (length(indd)==0){index=append(index,"NR")
# } else if (grepl("[[:digit:]]", affL[indd[1]])){index=append(index,"ND")
# } else {index=append(index,affL[indd[1]])}
if (length(indd) == 0) {
index <- append(index, "NOTREPORTED")
} else if (isTRUE(ND(affL, indd)$cond)) {
index <- append(index, "NOTDECLARED")
} else {
index <- append(index, ND(affL, indd)$affL)
}
}
# index=unique(c(ind1,ind2,ind3,ind4,ind5,ind6,ind7,ind8))
x <- ""
if (length(index) > 0) {
# x=paste0(trim.leading((affL[index])),collapse=",")
x <- paste0(trim.leading(index), collapse = ";")
x <- gsub(" ,", ";", x)
}
return(x)
})
AFFL <- unlist(AFFL)
M$AU_UN <- AFFL
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)) {
M$AU_UN[!is.na(M$C3) & M$C3 != ""] <- M$C3[!is.na(M$C3) & M$C3 != ""]
M$AU_UN <- unlist(lapply(strsplit(M$AU_UN, sep), function(l) {
l <- paste(trim(l), collapse = sep)
}))
}
M$AU_UN <- gsub("\\\\&", "AND", M$AU_UN)
M$AU_UN <- gsub("\\&", "AND", M$AU_UN)
## identification of Corresponding author affiliation
RP <- M$RP
RP[is.na(RP)] <- M$C1[is.na(RP)]
AFF <- gsub("\\[.*?\\] ", "", RP)
indna <- which(is.na(AFF))
if (length(indna) > 0) {
AFF[indna] <- M$RP[indna]
}
nc <- nchar(AFF)
AFF[nc == 0] <- NA
listAFF <- strsplit(AFF, sep, fixed = TRUE)
AFFL <- lapply(listAFF, function(l) {
# l=gsub(","," ,",l)
l <- gsub("\\(REPRINT AUTHOR\\)", "", l)
index <- NULL
for (i in 1:length(l)) {
# ind=list()
affL <- unlist(strsplit(l[i], ",", fixed = TRUE))
indd <- unlist(lapply(uTags, function(x) which(regexpr(x, affL, fixed = TRUE) != -1)))
if (length(indd) == 0) {
index <- append(index, "NOTREPORTED")
} else if (grepl("[[:digit:]]", affL[indd[1]])) {
index <- append(index, "NOTDECLARED")
} else {
index <- append(index, affL[indd[1]])
}
}
# index=unique(c(ind1,ind2,ind3,ind4,ind5,ind6,ind7,ind8))
x <- ""
if (length(index) > 0) {
# x=paste0(trim.leading((affL[index])),collapse=",")
x <- paste0(trim.leading(index), collapse = ";")
x <- gsub(" ,", ";", x)
}
return(x)
})
AFFL <- unlist(AFFL)
M$AU1_UN <- AFFL
M$AU1_UN <- gsub("\\\\&", "AND", M$AU1_UN)
M$AU1_UN <- gsub("\\&", "AND", M$AU1_UN)
## identification of NR affiliations
M$AU_UN_NR <- NA
listAFF2 <- strsplit(M$AU_UN, sep)
cont <- lapply(listAFF2, function(l) {
l <- unlist(l)
ind <- which(l %in% "NR")
})
for (i in 1:length(cont)) {
if (length(cont[[i]]) > 0) {
M$AU_UN_NR[i] <- paste(trim(listAFF[[i]][cont[[i]]]), collapse = ";")
}
}
M$AU_UN[is.na(AFF)] <- NA
M$AU_UN[M$AU_UN == "NOTDECLARED"] <- NA
M$AU_UN[M$AU_UN == "NOTREPORTED"] <- NA
M$AU_UN <- gsub("NOTREPORTED;", "", M$AU_UN)
M$AU_UN <- gsub(";NOTREPORTED", "", M$AU_UN)
M$AU_UN <- gsub("NOTDECLARED;", "", M$AU_UN)
M$AU_UN <- gsub("NOTDECLARED", "", M$AU_UN)
return(M)
}
### tools
lastChar <- function(C, last = ".") {
A <- substr(C, nchar(C), nchar(C))
ind <- which((A != last) & (!is.na(A)))
C[ind] <- paste0(C[ind], last)
return(C)
}
removeLastChar <- function(C, last = ".") {
A <- substr(C, nchar(C), nchar(C))
ind <- which((A == last) & (!is.na(A)))
C[ind] <- substr(C[ind], 1, (nchar(C[ind]) - 1))
return(C)
}
### remove non interesting field
ND <- function(affL, indd) {
aff <- affL[!grepl("[[:digit:]]", affL)]
ind <- indd[!grepl("[[:digit:]]", affL[indd])]
cond <- length(ind) < 1
r <- list(affL = aff[ind[1]], cond = cond)
return(r)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.