bibliometrix: Comprehensive Science Mapping Analysis

Documented in convert2df

#' Import and Convert bibliographic export files and API objects.
#'
#' It converts SCOPUS, Clarivate Analytics WoS, Dimensions, Lens.org, PubMed and COCHRANE Database export files, or pubmedR and dimensionsR JSON/XML 
#' objects, into a data frame, with cases corresponding to articles and variables to Field Tags as used in WoS.
#'
#' @param file a character array containing a sequence of filenames coming from WoS, Scopus, Dimensions, Lens.org, and Pubmed. Alternatively, \code{file} can be 
#' an object resulting from an API query fetched from Dimensions, PubMed or OpenAlex databases: 
#' \tabular{lll}{
#' a)\tab 'wos' \tab Clarivate Analytics WoS (in plaintext '.txt', Endnote Desktop '.ciw', or bibtex formats '.bib');\cr
#' b)\tab 'scopus' \tab SCOPUS (in bibtex '.bib' or csv '.csv' formats);\cr
#' c)\tab 'dimensions' \tab Digital Science Dimensions (in csv '.csv' or excel '.xlsx' formats);\cr
#' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
#' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
#' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
#' g)\tab 'openalex' \tab OpenAlex .csv file;\cr
#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
#' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex','openalex_api','pubmed','scopus','wos','lens')}. Default is \code{dbsource = "wos"}.
#' @param format is a character indicating the SCOPUS, Clarivate Analytics WoS, and other databases export file format. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
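#' @param remove.duplicates is logical. If TRUE, the function will remove duplicated items, identified by a database-specific identifier field (e.g. \code{UT}, \code{PMID} or \code{id_oa}) or, when no such field applies, by the document title (\code{TI}).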
#' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
#' 
#' E.g., if we have three files downloaded from Web of Science in plaintext format, \code{file} will be:
#' 
#' file <- c("filename1.txt", "filename2.txt", "filename3.txt") 
#' 
#' data frame columns are named using the standard Clarivate Analytics WoS Field Tag codify. The main field tags are:
#'
#' \tabular{lll}{
#' \code{AU}\tab   \tab Authors\cr
#' \code{TI}\tab   \tab Document Title\cr
#' \code{SO}\tab   \tab Publication Name (or Source)\cr
#' \code{JI}\tab   \tab ISO Source Abbreviation\cr
#' \code{DT}\tab   \tab Document Type\cr
#' \code{DE}\tab   \tab Authors' Keywords\cr
#' \code{ID}\tab   \tab Keywords associated by SCOPUS or WoS database \cr
#' \code{AB}\tab   \tab Abstract\cr
#' \code{C1}\tab   \tab Author Address\cr
#' \code{RP}\tab   \tab Reprint Address\cr
#' \code{CR}\tab   \tab Cited References\cr
#' \code{TC}\tab   \tab Times Cited\cr
#' \code{PY}\tab   \tab Year\cr
#' \code{SC}\tab   \tab Subject Category\cr
#' \code{UT}\tab   \tab Unique Article Identifier\cr
#' \code{DB}\tab   \tab Database\cr}
#'
#' for a complete list of field tags see: \href{https://www.bibliometrix.org/documents/Field_Tags_bibliometrix.pdf}{Field Tags used in bibliometrix}
#' 
#' @examples
#' 
#' # Example:
#' # Import and convert a Web of Science collection from an export file in plaintext format:
#' 
#' \dontrun{
#' files <- 'https://www.bibliometrix.org/datasets/wos_plaintext.txt'
#' 
#' M <- convert2df(file = files, dbsource = 'wos', format = "plaintext")
#' }
#'
#' 
#' @export

convert2df <- function(file, dbsource = "wos", format = "plaintext", remove.duplicates = TRUE) {
  # Import a bibliographic collection and return it as a "bibliometrixDB"
  # data frame.
  #
  # Arguments:
  #   file              file name(s) of the export, or an API result object.
  #   dbsource          single string naming the source database.
  #   format            single string naming the export format.
  #   remove.duplicates if TRUE, rows sharing the same identifier are dropped.
  #
  # Steps: validate arguments -> dispatch to the database-specific importer ->
  # normalise PY / TC / CR / AU -> build AU_UN where applicable -> optionally
  # drop duplicates -> build SR and use it as row names.
  #
  # Raises an error when `dbsource` is not a single supported string, when
  # `format` is not a single string, or when no importer exists for the
  # requested dbsource/format combination.

  allowed_formats <- c("api", "bibtex", "csv", "endnote", "excel", "plaintext", "pubmed")
  allowed_db <- c(
    "cochrane", "dimensions", "generic", "isi", "openalex", "openalex_api",
    "pubmed", "scopus", "wos", "lens"
  )

  # Both arguments drive scalar `if` / `switch` dispatch below, so they must be
  # single strings. An unsupported `dbsource` can never produce a result:
  # fail immediately instead of crashing later on an undefined object.
  if (!is.character(dbsource) || length(dbsource) != 1L || !(dbsource %in% allowed_db)) {
    stop(
      "'dbsource' argument is not properly specified: it has to be a character string matching one among: ",
      paste(allowed_db, collapse = " "),
      call. = FALSE
    )
  }
  if (!is.character(format) || length(format) != 1L || is.na(format)) {
    stop(
      "'format' argument is not properly specified: it has to be a character string matching one among: ",
      paste(allowed_formats, collapse = " "),
      call. = FALSE
    )
  }

  cat("\nConverting your", dbsource, "collection into a bibliographic dataframe\n\n")

  # Some importers ignore `format`, so an unknown value is only reported here;
  # "lens" is tolerated because it is aliased to "csv" just below.
  if (!(format %in% c(allowed_formats, "lens"))) {
    cat("\n 'format' argument is not properly specified")
    cat("\n 'format' argument has to be a character string matching one among:", allowed_formats, "\n")
  }

  # Aliases
  if (dbsource == "wos") dbsource <- "isi"
  if (format == "endnote") format <- "plaintext"
  if (format == "lens") format <- "csv"

  # Dispatch to the importer. `switch()` yields NULL when no branch matches,
  # which is detected right after.
  M <- switch(dbsource,
    ## db WOS
    isi = switch(format,
      bibtex = bib2df(importFiles(file), dbsource = "isi"),
      plaintext = isi2df(importFiles(file))
    ),
    ## db SCOPUS
    scopus = switch(format,
      bibtex = bib2df(importFiles(file), dbsource = "scopus"),
      csv = csvScopus2df(file)
    ),
    ## db GENERIC BIBTEX
    generic = bib2df(importFiles(file), dbsource = "generic"),
    ## db LENS
    lens = csvLens2df(file),
    ## db PUBMED
    pubmed = switch(format,
      api = {
        M <- pmApi2df(file)
        M$DB <- "PUBMED"
        M
      },
      pubmed2df(importFiles(file))
    ),
    ## db COCHRANE
    cochrane = cochrane2df(importFiles(file)),
    ## db DIMENSIONS
    dimensions = {
      M <- switch(format,
        api = dsApi2df(file),
        dimensions2df(file, format = format)
      )
      M$DB <- "DIMENSIONS"
      M
    },
    ## db OPENALEX (csv export)
    openalex = csvOA2df(file),
    ## db OPENALEX (openalexR object); already converted objects pass through
    openalex_api = if (inherits(file, "bibliometrixDB")) {
      file
    } else {
      openalexR::oa2bibliometrix(file)
    }
  )

  if (is.null(M)) {
    stop(
      "No importer is available for dbsource = '", dbsource,
      "' with format = '", format, "'",
      call. = FALSE
    )
  }

  # Publication year: numeric, NA when the field is absent
  if ("PY" %in% names(M)) {
    M$PY <- as.numeric(M$PY)
  } else {
    M$PY <- NA
  }

  # Times cited: numeric, missing or absent values count as 0
  if ("TC" %in% names(M)) {
    M$TC <- as.numeric(M$TC)
    M$TC[is.na(M$TC)] <- 0
  } else {
    M$TC <- 0
  }

  # Cited references: placeholder when absent; otherwise strip square brackets
  # and isolated " . " / " .. " tokens, then trim.
  if (!("CR" %in% names(M))) {
    M$CR <- "none"
  } else {
    M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ", "", M$CR)))
  }

  # Replace the typographic apostrophe (U+2019) with the ASCII one in authors
  if (dbsource != "cochrane") {
    M$AU <- gsub(intToUtf8(8217), intToUtf8(39), M$AU)
  }

  cat("Done!\n\n")

  if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
    ## AU_UN field creation
    if ("C1" %in% names(M)) {
      cat("\nGenerating affiliation field tag AU_UN from C1:  ")

      if (!"AU_UN" %in% names(M)) M <- metaTagExtraction(M, Field = "AU_UN")
      cat("Done!\n\n")
    } else {
      M$C1 <- NA
      M$AU_UN <- NA
    }

    ## AU normalization (applied to each ";"-separated author)
    # NOTE(review): as written, the bracket expression appears to close before
    # the apostrophe, so the pattern seems to match only a character followed
    # by the literal "']" -- confirm the intended character set before
    # changing it. Kept byte-identical here.
    M$AU <- vapply(strsplit(M$AU, ";"), function(authors) {
      authors <- trimws(trimES(gsub("[^[:alnum:][-]']", " ", authors)))
      paste(authors, collapse = ";")
    }, character(1))
  }

  # PubMed plaintext exports were skipped above: build AU_UN for them here
  if (dbsource == "pubmed" && format == "pubmed") {
    if ("C1" %in% names(M)) {
      cat("\nGenerating affiliation field tag AU_UN from C1:  ")

      M <- metaTagExtraction(M, Field = "AU_UN")
      cat("Done!\n\n")
    } else {
      M$C1 <- NA
      M$AU_UN <- NA
    }
  }

  # remove double ; in every character column
  is_chr <- vapply(M, is.character, logical(1))
  if (any(is_chr)) {
    M[is_chr] <- lapply(M[is_chr], function(column) gsub(";;", ";", column))
  }

  ### Duplicate removal
  if (isTRUE(remove.duplicates)) {
    # Identifier field used to recognise the same document twice
    id_field <- switch(dbsource,
      isi = "UT",
      scopus = if (format == "csv") "UT" else "TI",
      openalex = "id_oa",
      openalex_api = "id_oa",
      dimensions = "UT",
      pubmed = "PMID",
      lens = "UT",
      "TI"
    )
    # Fall back to the title when the expected identifier column is missing
    if (!(id_field %in% names(M))) id_field <- "TI"

    if (id_field %in% names(M)) {
      ids <- M[[id_field]]
      # Records without an identifier are not duplicates of one another
      d <- duplicated(ids) & !is.na(ids)
      if (sum(d) > 0) cat("\nRemoved ", sum(d), "duplicated documents\n")
      M <- M[!d, , drop = FALSE]
    }
  }

  ### SR field creation
  suppressWarnings(M <- metaTagExtraction(M, Field = "SR"))
  row.names(M) <- M$SR

  ### bibliometrix>DB class
  class(M) <- c("bibliometrixDB", "data.frame")

  return(M)
}